diff options
| author | rafiramadhana <rf.ramadhana@gmail.com> | 2023-11-15 22:18:06 +0700 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-11-15 23:48:30 +0100 |
| commit | beadfd1db3d2398b9b1e66d60779a7b2649af044 (patch) | |
| tree | b746ff1ad17234a650c4ffb0ac25158ecd89e6d8 /modules | |
| parent | 190056ee8d6a4eca61d92a79cc25aad645e69d4a (diff) | |
Add multiple starting URLs
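Add support for multiple starting URLs by adding a Module.URLs field (JSON key "urls"). Module.URL is still visited when it is non-empty, and each entry in Module.URLs is visited in addition to it.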
Diffstat (limited to 'modules')
| -rw-r--r-- | modules/starturl/starturl.go | 14 | ||||
| -rw-r--r-- | modules/starturl/starturl_test.go | 83 |
2 files changed, 90 insertions, 7 deletions
diff --git a/modules/starturl/starturl.go b/modules/starturl/starturl.go index 9e3ec31..a016324 100644 --- a/modules/starturl/starturl.go +++ b/modules/starturl/starturl.go @@ -13,7 +13,8 @@ func init() { } type Module struct { - URL string `json:"url"` + URL string `json:"url"` + URLs []string `json:"urls"` } func (Module) ModuleInfo() flyscrape.ModuleInfo { @@ -24,14 +25,13 @@ func (Module) ModuleInfo() flyscrape.ModuleInfo { } func (m *Module) Provision(ctx flyscrape.Context) { - if m.disabled() { - return + if m.URL != "" { + ctx.Visit(m.URL) } - ctx.Visit(m.URL) -} -func (m *Module) disabled() bool { - return m.URL == "" + for _, url := range m.URLs { + ctx.Visit(url) + } } var _ flyscrape.Provisioner = (*Module)(nil) diff --git a/modules/starturl/starturl_test.go b/modules/starturl/starturl_test.go index 78efa6a..54f899a 100644 --- a/modules/starturl/starturl_test.go +++ b/modules/starturl/starturl_test.go @@ -38,3 +38,86 @@ func TestStartURL(t *testing.T) { require.Equal(t, "http://www.example.com/foo/bar", url) require.Equal(t, 0, depth) } + +func TestStartURL_MultipleStartingURLs(t *testing.T) { + testCases := []struct { + name string + startURLModFn func() *starturl.Module + urls []string + }{ + { + name: ".URL and .URLs", + startURLModFn: func() *starturl.Module { + return &starturl.Module{ + URL: "http://www.example.com/foo", + URLs: []string{ + "http://www.example.com/bar", + "http://www.example.com/baz", + }, + } + }, + urls: []string{ + "http://www.example.com/foo", + "http://www.example.com/bar", + "http://www.example.com/baz", + }, + }, + { + name: "only .URL", + startURLModFn: func() *starturl.Module { + return &starturl.Module{ + URL: "http://www.example.com/foo", + } + }, + urls: []string{ + "http://www.example.com/foo", + }, + }, + { + name: "only .URLs", + startURLModFn: func() *starturl.Module { + return &starturl.Module{ + URLs: []string{ + "http://www.example.com/bar", + "http://www.example.com/baz", + }, + } + }, + urls: []string{ + 
"http://www.example.com/bar", + "http://www.example.com/baz", + }, + }, + { + name: "empty", + startURLModFn: func() *starturl.Module { + return &starturl.Module{} + }, + urls: []string{}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + urls := []string{} + + mods := []flyscrape.Module{ + tc.startURLModFn(), + hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(http.StatusOK, "") + }, + BuildRequestFn: func(r *flyscrape.Request) { + urls = append(urls, r.URL) + }, + }, + } + + scraper := flyscrape.NewScraper() + scraper.Modules = mods + scraper.Run() + + require.ElementsMatch(t, tc.urls, urls) + }) + } +} |