Diffstat (limited to 'modules')
| Mode | File | Lines added |
| --- | --- | --- |
| -rw-r--r-- | modules/depth/depth.go | 30 |
| -rw-r--r-- | modules/depth/depth_test.go | 47 |
| -rw-r--r-- | modules/domainfilter/domainfilter.go | 62 |
| -rw-r--r-- | modules/domainfilter/domainfilter_test.go | 92 |
| -rw-r--r-- | modules/followlinks/followlinks.go | 30 |
| -rw-r--r-- | modules/followlinks/followlinks_test.go | 39 |
| -rw-r--r-- | modules/jsonprinter/jsonprinter.go | 47 |
| -rw-r--r-- | modules/jsonprinter/jsonprinter_test.go | 47 |
| -rw-r--r-- | modules/ratelimit/ratelimit.go | 54 |
| -rw-r--r-- | modules/ratelimit/ratelimit_test.go | 45 |
| -rw-r--r-- | modules/starturl/starturl.go | 30 |
| -rw-r--r-- | modules/starturl/starturl_test.go | 31 |
| -rw-r--r-- | modules/urlfilter/urlfilter.go | 85 |
| -rw-r--r-- | modules/urlfilter/urlfilter_test.go | 71 |
14 files changed, 710 insertions, 0 deletions
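Each of the new modules registers itself with flyscrape.RegisterModule and is wired into a scraper through LoadModule; the scraper then drives it via the hook interfaces (OnLoad, OnRequest, OnResponse, OnComplete, CanRequest) asserted at the bottom of each file. As a rough sketch of how the pieces compose — mirroring the test setup used throughout this changeset, with the mock transport standing in for real HTTP and the start URL chosen only for illustration — a crawl could be assembled like this:

```go
package main

import (
	"fmt"

	"github.com/philippta/flyscrape"
	"github.com/philippta/flyscrape/modules/depth"
	"github.com/philippta/flyscrape/modules/domainfilter"
	"github.com/philippta/flyscrape/modules/followlinks"
	"github.com/philippta/flyscrape/modules/ratelimit"
	"github.com/philippta/flyscrape/modules/starturl"
)

func main() {
	scraper := flyscrape.NewScraper()

	// Seed the crawl and follow discovered links.
	scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
	scraper.LoadModule(&followlinks.Module{})

	// Constrain the crawl: link depth, domains and request rate.
	scraper.LoadModule(&depth.Module{Depth: 2})
	scraper.LoadModule(&domainfilter.Module{
		URL:            "http://www.example.com/",
		AllowedDomains: []string{"*"},
		BlockedDomains: []string{"www.google.com"},
	})
	scraper.LoadModule(&ratelimit.Module{Rate: 10}) // roughly 10 requests per second

	// The mock transport from the tests below stands in for real HTTP here.
	scraper.SetTransport(flyscrape.MockTransport(200, `<a href="/about">About</a>`))

	scraper.OnRequest(func(req *flyscrape.Request) {
		fmt.Println("requesting:", req.URL)
	})

	scraper.Run()
}
```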
diff --git a/modules/depth/depth.go b/modules/depth/depth.go
new file mode 100644
index 0000000..5efedc8
--- /dev/null
+++ b/modules/depth/depth.go
@@ -0,0 +1,30 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package depth
+
+import (
+	"github.com/philippta/flyscrape"
+)
+
+func init() {
+	flyscrape.RegisterModule(new(Module))
+}
+
+type Module struct {
+	Depth int `json:"depth"`
+}
+
+func (m *Module) ID() string {
+	return "depth"
+}
+
+func (m *Module) CanRequest(url string, depth int) bool {
+	return depth <= m.Depth
+}
+
+var (
+	_ flyscrape.Module     = (*Module)(nil)
+	_ flyscrape.CanRequest = (*Module)(nil)
+)
diff --git a/modules/depth/depth_test.go b/modules/depth/depth_test.go
new file mode 100644
index 0000000..309e628
--- /dev/null
+++ b/modules/depth/depth_test.go
@@ -0,0 +1,47 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package depth_test
+
+import (
+	"net/http"
+	"testing"
+
+	"github.com/philippta/flyscrape"
+	"github.com/philippta/flyscrape/modules/depth"
+	"github.com/philippta/flyscrape/modules/followlinks"
+	"github.com/philippta/flyscrape/modules/starturl"
+	"github.com/stretchr/testify/require"
+)
+
+func TestDepth(t *testing.T) {
+	scraper := flyscrape.NewScraper()
+	scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
+	scraper.LoadModule(&followlinks.Module{})
+	scraper.LoadModule(&depth.Module{Depth: 2})
+
+	scraper.SetTransport(func(r *http.Request) (*http.Response, error) {
+		switch r.URL.String() {
+		case "http://www.example.com/":
+			return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`)
+		case "http://www.google.com/":
+			return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
+		case "http://www.duckduckgo.com/":
+			return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`)
+		}
+		return flyscrape.MockResponse(200, "")
+	})
+
+	var urls []string
+	scraper.OnRequest(func(req *flyscrape.Request) {
+		urls = append(urls, req.URL)
+	})
+
+	scraper.Run()
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.google.com/")
+	require.Contains(t, urls, "http://www.duckduckgo.com/")
+}
diff --git a/modules/domainfilter/domainfilter.go b/modules/domainfilter/domainfilter.go
new file mode 100644
index 0000000..b892882
--- /dev/null
+++ b/modules/domainfilter/domainfilter.go
@@ -0,0 +1,62 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package domainfilter
+
+import (
+	"github.com/nlnwa/whatwg-url/url"
+	"github.com/philippta/flyscrape"
+)
+
+func init() {
+	flyscrape.RegisterModule(new(Module))
+}
+
+type Module struct {
+	URL            string   `json:"url"`
+	AllowedDomains []string `json:"allowedDomains"`
+	BlockedDomains []string `json:"blockedDomains"`
+}
+
+func (m *Module) ID() string {
+	return "domainfilter"
+}
+
+func (m *Module) OnLoad(v flyscrape.Visitor) {
+	if u, err := url.Parse(m.URL); err == nil {
+		m.AllowedDomains = append(m.AllowedDomains, u.Host())
+	}
+}
+
+func (m *Module) CanRequest(rawurl string, depth int) bool {
+	u, err := url.Parse(rawurl)
+	if err != nil {
+		return false
+	}
+
+	host := u.Host()
+	ok := false
+
+	for _, domain := range m.AllowedDomains {
+		if domain == "*" || host == domain {
+			ok = true
+			break
+		}
+	}
+
+	for _, domain := range m.BlockedDomains {
+		if host == domain {
+			ok = false
+			break
+		}
+	}
+
+	return ok
+}
+
+var (
+	_ flyscrape.Module     = (*Module)(nil)
+	_ flyscrape.CanRequest = (*Module)(nil)
+	_ flyscrape.OnLoad     = (*Module)(nil)
+)
diff --git a/modules/domainfilter/domainfilter_test.go b/modules/domainfilter/domainfilter_test.go
new file mode 100644
index 0000000..97bdc9c
--- /dev/null
+++ b/modules/domainfilter/domainfilter_test.go
@@ -0,0 +1,92 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package domainfilter_test
+
+import (
+	"testing"
+
+	"github.com/philippta/flyscrape"
+	"github.com/philippta/flyscrape/modules/domainfilter"
+	"github.com/philippta/flyscrape/modules/followlinks"
+	"github.com/philippta/flyscrape/modules/starturl"
+	"github.com/stretchr/testify/require"
+)
+
+func TestDomainfilterAllowed(t *testing.T) {
+	scraper := flyscrape.NewScraper()
+	scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
+	scraper.LoadModule(&followlinks.Module{})
+	scraper.LoadModule(&domainfilter.Module{
+		URL:            "http://www.example.com",
+		AllowedDomains: []string{"www.google.com"},
+	})
+
+	scraper.SetTransport(flyscrape.MockTransport(200, `
+		<a href="http://www.google.com">Google</a>
+		<a href="http://www.duckduckgo.com">DuckDuckGo</a>`))
+
+	var urls []string
+	scraper.OnRequest(func(req *flyscrape.Request) {
+		urls = append(urls, req.URL)
+	})
+
+	scraper.Run()
+
+	require.Len(t, urls, 2)
+	require.Contains(t, urls, "http://www.example.com")
+	require.Contains(t, urls, "http://www.google.com/")
+}
+
+func TestDomainfilterAllowedAll(t *testing.T) {
+	scraper := flyscrape.NewScraper()
+	scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
+	scraper.LoadModule(&followlinks.Module{})
+	scraper.LoadModule(&domainfilter.Module{
+		URL:            "http://www.example.com",
+		AllowedDomains: []string{"*"},
+	})
+
+	scraper.SetTransport(flyscrape.MockTransport(200, `
+		<a href="http://www.google.com">Google</a>
+		<a href="http://www.duckduckgo.com">DuckDuckGo</a>`))
+
+	var urls []string
+	scraper.OnRequest(func(req *flyscrape.Request) {
+		urls = append(urls, req.URL)
+	})
+
+	scraper.Run()
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com")
+	require.Contains(t, urls, "http://www.duckduckgo.com/")
+	require.Contains(t, urls, "http://www.google.com/")
+}
+
+func TestDomainfilterBlocked(t *testing.T) {
+	scraper := flyscrape.NewScraper()
+	scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
+	scraper.LoadModule(&followlinks.Module{})
+	scraper.LoadModule(&domainfilter.Module{
+		URL:            "http://www.example.com",
+		AllowedDomains: []string{"*"},
+		BlockedDomains: []string{"www.google.com"},
+	})
+
+	scraper.SetTransport(flyscrape.MockTransport(200, `
+		<a href="http://www.google.com">Google</a>
+		<a href="http://www.duckduckgo.com">DuckDuckGo</a>`))
+
+	var urls []string
+	scraper.OnRequest(func(req *flyscrape.Request) {
+		urls = append(urls, req.URL)
+	})
+
+	scraper.Run()
+
+	require.Len(t, urls, 2)
+	require.Contains(t, urls, "http://www.example.com")
+	require.Contains(t, urls, "http://www.duckduckgo.com/")
+}
diff --git a/modules/followlinks/followlinks.go b/modules/followlinks/followlinks.go
new file mode 100644
index 0000000..dde0e90
--- /dev/null
+++ b/modules/followlinks/followlinks.go
@@ -0,0 +1,30 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package followlinks
+
+import (
+	"github.com/philippta/flyscrape"
+)
+
+func init() {
+	flyscrape.RegisterModule(new(Module))
+}
+
+type Module struct{}
+
+func (m *Module) ID() string {
+	return "followlinks"
+}
+
+func (m *Module) OnResponse(resp *flyscrape.Response) {
+	for _, link := range flyscrape.ParseLinks(resp.HTML, resp.URL) {
+		resp.Visit(link)
+	}
+}
+
+var (
+	_ flyscrape.Module     = (*Module)(nil)
+	_ flyscrape.OnResponse = (*Module)(nil)
+)
diff --git a/modules/followlinks/followlinks_test.go b/modules/followlinks/followlinks_test.go
new file mode 100644
index 0000000..03c3a6b
--- /dev/null
+++ b/modules/followlinks/followlinks_test.go
@@ -0,0 +1,39 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package followlinks_test
+
+import (
+	"testing"
+
+	"github.com/philippta/flyscrape"
+	"github.com/philippta/flyscrape/modules/followlinks"
+	"github.com/philippta/flyscrape/modules/starturl"
+	"github.com/stretchr/testify/require"
+)
+
+func TestFollowLinks(t *testing.T) {
+	scraper := flyscrape.NewScraper()
+	scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"})
+	scraper.LoadModule(&followlinks.Module{})
+
+	scraper.SetTransport(flyscrape.MockTransport(200, `
+		<a href="/baz">Baz</a>
+		<a href="baz">Baz</a>
+		<a href="http://www.google.com">Google</a>`))
+
+	var urls []string
+	scraper.OnRequest(func(req *flyscrape.Request) {
+		urls = append(urls, req.URL)
+	})
+
+	scraper.Run()
+
+	require.Len(t, urls, 5)
+	require.Contains(t, urls, "http://www.example.com/baz")
+	require.Contains(t, urls, "http://www.example.com/foo/bar")
+	require.Contains(t, urls, "http://www.example.com/foo/baz")
+	require.Contains(t, urls, "http://www.google.com/")
+	require.Contains(t, urls, "http://www.google.com/baz")
+}
diff --git a/modules/jsonprinter/jsonprinter.go b/modules/jsonprinter/jsonprinter.go
new file mode 100644
index 0000000..3936277
--- /dev/null
+++ b/modules/jsonprinter/jsonprinter.go
@@ -0,0 +1,47 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package jsonprinter
+
+import (
+	"fmt"
+
+	"github.com/philippta/flyscrape"
+)
+
+func init() {
+	flyscrape.RegisterModule(new(Module))
+}
+
+type Module struct {
+	first bool
+}
+
+func (m *Module) ID() string {
+	return "jsonprinter"
+}
+
+func (m *Module) OnResponse(resp *flyscrape.Response) {
+	if resp.Error == nil && resp.Data == nil {
+		return
+	}
+
+	if m.first {
+		fmt.Println("[")
+	} else {
+		fmt.Println(",")
+	}
+
+	fmt.Print(flyscrape.PrettyPrint(resp.ScrapeResult, " "))
+}
+
+func (m *Module) OnComplete() {
+	fmt.Println("\n]")
+}
+
+var (
+	_ flyscrape.Module     = (*Module)(nil)
+	_ flyscrape.OnResponse = (*Module)(nil)
+	_ flyscrape.OnComplete = (*Module)(nil)
+)
diff --git a/modules/jsonprinter/jsonprinter_test.go b/modules/jsonprinter/jsonprinter_test.go
new file mode 100644
index 0000000..29cc438
--- /dev/null
+++ b/modules/jsonprinter/jsonprinter_test.go
@@ -0,0 +1,47 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package jsonprinter_test
+
+import (
+	"net/http"
+	"testing"
+
+	"github.com/philippta/flyscrape"
+	"github.com/philippta/flyscrape/modules/depth"
+	"github.com/philippta/flyscrape/modules/followlinks"
+	"github.com/philippta/flyscrape/modules/starturl"
+	"github.com/stretchr/testify/require"
+)
+
+func TestDepth(t *testing.T) {
+	scraper := flyscrape.NewScraper()
+	scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
+	scraper.LoadModule(&followlinks.Module{})
+	scraper.LoadModule(&depth.Module{Depth: 2})
+
+	scraper.SetTransport(func(r *http.Request) (*http.Response, error) {
+		switch r.URL.String() {
+		case "http://www.example.com/":
+			return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`)
+		case "http://www.google.com/":
+			return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
+		case "http://www.duckduckgo.com/":
+			return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`)
+		}
+		return flyscrape.MockResponse(200, "")
+	})
+
+	var urls []string
+	scraper.OnRequest(func(req *flyscrape.Request) {
+		urls = append(urls, req.URL)
+	})
+
+	scraper.Run()
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.google.com/")
+	require.Contains(t, urls, "http://www.duckduckgo.com/")
+}
diff --git a/modules/ratelimit/ratelimit.go b/modules/ratelimit/ratelimit.go
new file mode 100644
index 0000000..b02f5d5
--- /dev/null
+++ b/modules/ratelimit/ratelimit.go
@@ -0,0 +1,54 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package ratelimit
+
+import (
+	"time"
+
+	"github.com/philippta/flyscrape"
+)
+
+func init() {
+	flyscrape.RegisterModule(new(Module))
+}
+
+type Module struct {
+	Rate float64 `json:"rate"`
+
+	ticker    *time.Ticker
+	semaphore chan struct{}
+}
+
+func (m *Module) ID() string {
+	return "ratelimit"
+}
+
+func (m *Module) OnLoad(v flyscrape.Visitor) {
+	rate := time.Duration(float64(time.Second) / m.Rate)
+
+	m.ticker = time.NewTicker(rate)
+	m.semaphore = make(chan struct{}, 1)
+
+	go func() {
+		for range m.ticker.C {
+			m.semaphore <- struct{}{}
+		}
+	}()
+}
+
+func (m *Module) OnRequest(_ *flyscrape.Request) {
+	<-m.semaphore
+}
+
+func (m *Module) OnComplete() {
+	m.ticker.Stop()
+}
+
+var (
+	_ flyscrape.Module     = (*Module)(nil)
+	_ flyscrape.OnRequest  = (*Module)(nil)
+	_ flyscrape.OnLoad     = (*Module)(nil)
+	_ flyscrape.OnComplete = (*Module)(nil)
+)
diff --git a/modules/ratelimit/ratelimit_test.go b/modules/ratelimit/ratelimit_test.go
new file mode 100644
index 0000000..c166371
--- /dev/null
+++ b/modules/ratelimit/ratelimit_test.go
@@ -0,0 +1,45 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package ratelimit_test
+
+import (
+	"testing"
+	"time"
+
+	"github.com/philippta/flyscrape"
+	"github.com/philippta/flyscrape/modules/followlinks"
+	"github.com/philippta/flyscrape/modules/ratelimit"
+	"github.com/philippta/flyscrape/modules/starturl"
+	"github.com/stretchr/testify/require"
+)
+
+func TestRatelimit(t *testing.T) {
+	scraper := flyscrape.NewScraper()
+	scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
+	scraper.LoadModule(&followlinks.Module{})
+	scraper.LoadModule(&ratelimit.Module{
+		Rate: 100,
+	})
+
+	scraper.SetTransport(flyscrape.MockTransport(200, `<a href="foo">foo</a>`))
+
+	var times []time.Time
+	scraper.OnRequest(func(req *flyscrape.Request) {
+		times = append(times, time.Now())
+	})
+
+	start := time.Now()
+
+	scraper.Run()
+
+	first := times[0].Add(-10 * time.Millisecond)
+	second := times[1].Add(-20 * time.Millisecond)
+
+	require.Less(t, first.Sub(start), 2*time.Millisecond)
+	require.Less(t, second.Sub(start), 2*time.Millisecond)
+
+	require.Less(t, start.Sub(first), 2*time.Millisecond)
+	require.Less(t, start.Sub(second), 2*time.Millisecond)
+}
diff --git a/modules/starturl/starturl.go b/modules/starturl/starturl.go
new file mode 100644
index 0000000..b2e6c47
--- /dev/null
+++ b/modules/starturl/starturl.go
@@ -0,0 +1,30 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package starturl
+
+import (
+	"github.com/philippta/flyscrape"
+)
+
+func init() {
+	flyscrape.RegisterModule(new(Module))
+}
+
+type Module struct {
+	URL string `json:"url"`
+}
+
+func (m *Module) ID() string {
+	return "starturl"
+}
+
+func (m *Module) OnLoad(v flyscrape.Visitor) {
+	v.Visit(m.URL)
+}
+
+var (
+	_ flyscrape.Module = (*Module)(nil)
+	_ flyscrape.OnLoad = (*Module)(nil)
+)
diff --git a/modules/starturl/starturl_test.go b/modules/starturl/starturl_test.go
new file mode 100644
index 0000000..647e197
--- /dev/null
+++ b/modules/starturl/starturl_test.go
@@ -0,0 +1,31 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package starturl_test
+
+import (
+	"testing"
+
+	"github.com/philippta/flyscrape"
+	"github.com/philippta/flyscrape/modules/starturl"
+	"github.com/stretchr/testify/require"
+)
+
+func TestFollowLinks(t *testing.T) {
+	scraper := flyscrape.NewScraper()
+	scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"})
+	scraper.SetTransport(flyscrape.MockTransport(200, ""))
+
+	var url string
+	var depth int
+	scraper.OnRequest(func(req *flyscrape.Request) {
+		url = req.URL
+		depth = req.Depth
+	})
+
+	scraper.Run()
+
+	require.Equal(t, "http://www.example.com/foo/bar", url)
+	require.Equal(t, 0, depth)
+}
diff --git a/modules/urlfilter/urlfilter.go b/modules/urlfilter/urlfilter.go
new file mode 100644
index 0000000..14576f0
--- /dev/null
+++ b/modules/urlfilter/urlfilter.go
@@ -0,0 +1,85 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package urlfilter
+
+import (
+	"regexp"
+
+	"github.com/philippta/flyscrape"
+)
+
+func init() {
+	flyscrape.RegisterModule(new(Module))
+}
+
+type Module struct {
+	URL         string   `json:"url"`
+	AllowedURLs []string `json:"allowedURLs"`
+	BlockedURLs []string `json:"blockedURLs"`
+
+	allowedURLsRE []*regexp.Regexp
+	blockedURLsRE []*regexp.Regexp
+}
+
+func (m *Module) ID() string {
+	return "urlfilter"
+}
+
+func (m *Module) OnLoad(v flyscrape.Visitor) {
+	for _, pat := range m.AllowedURLs {
+		re, err := regexp.Compile(pat)
+		if err != nil {
+			continue
+		}
+		m.allowedURLsRE = append(m.allowedURLsRE, re)
+	}
+
+	for _, pat := range m.BlockedURLs {
+		re, err := regexp.Compile(pat)
+		if err != nil {
+			continue
+		}
+		m.blockedURLsRE = append(m.blockedURLsRE, re)
+	}
+}
+
+func (m *Module) CanRequest(rawurl string, depth int) bool {
+	// allow root url
+	if rawurl == m.URL {
+		return true
+	}
+
+	// allow if no filter is set
+	if len(m.allowedURLsRE) == 0 && len(m.blockedURLsRE) == 0 {
+		return true
+	}
+
+	ok := false
+	if len(m.allowedURLsRE) == 0 {
+		ok = true
+	}
+
+	for _, re := range m.allowedURLsRE {
+		if re.MatchString(rawurl) {
+			ok = true
+			break
+		}
+	}
+
+	for _, re := range m.blockedURLsRE {
+		if re.MatchString(rawurl) {
+			ok = false
+			break
+		}
+	}
+
+	return ok
+}
+
+var (
+	_ flyscrape.Module     = (*Module)(nil)
+	_ flyscrape.CanRequest = (*Module)(nil)
+	_ flyscrape.OnLoad     = (*Module)(nil)
+)
diff --git a/modules/urlfilter/urlfilter_test.go b/modules/urlfilter/urlfilter_test.go
new file mode 100644
index 0000000..e383a32
--- /dev/null
+++ b/modules/urlfilter/urlfilter_test.go
@@ -0,0 +1,71 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package urlfilter_test
+
+import (
+	"testing"
+
+	"github.com/philippta/flyscrape"
+	"github.com/philippta/flyscrape/modules/followlinks"
+	"github.com/philippta/flyscrape/modules/starturl"
+	"github.com/philippta/flyscrape/modules/urlfilter"
+	"github.com/stretchr/testify/require"
+)
+
+func TestURLFilterAllowed(t *testing.T) {
+	scraper := flyscrape.NewScraper()
+	scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
+	scraper.LoadModule(&followlinks.Module{})
+	scraper.LoadModule(&urlfilter.Module{
+		URL:         "http://www.example.com/",
+		AllowedURLs: []string{`/foo\?id=\d+`, `/bar$`},
+	})
+
+	scraper.SetTransport(flyscrape.MockTransport(200, `
+		<a href="foo?id=123">123</a>
+		<a href="foo?id=ABC">ABC</a>
+		<a href="/bar">bar</a>
+		<a href="/barz">barz</a>`))
+
+	var urls []string
+	scraper.OnRequest(func(req *flyscrape.Request) {
+		urls = append(urls, req.URL)
+	})
+
+	scraper.Run()
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.example.com/foo?id=123")
+	require.Contains(t, urls, "http://www.example.com/bar")
+}
+
+func TestURLFilterBlocked(t *testing.T) {
+	scraper := flyscrape.NewScraper()
+	scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
+	scraper.LoadModule(&followlinks.Module{})
+	scraper.LoadModule(&urlfilter.Module{
+		URL:         "http://www.example.com/",
+		BlockedURLs: []string{`/foo\?id=\d+`, `/bar$`},
+	})
+
+	scraper.SetTransport(flyscrape.MockTransport(200, `
+		<a href="foo?id=123">123</a>
+		<a href="foo?id=ABC">ABC</a>
+		<a href="/bar">bar</a>
+		<a href="/barz">barz</a>`))
+
+	var urls []string
+	scraper.OnRequest(func(req *flyscrape.Request) {
+		urls = append(urls, req.URL)
+	})
+
+	scraper.Run()
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.example.com/foo?id=ABC")
+	require.Contains(t, urls, "http://www.example.com/barz")
+}
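All of the modules above follow the same shape: register the module in init, return a stable ID, implement only the hook interfaces the module needs, and assert those interfaces at compile time in a trailing var block. As a sketch of that pattern, a hypothetical extension filter (not part of this changeset; the package name and fields are made up for illustration) could be written against the same interfaces like this:

```go
package extensionfilter

import (
	"strings"

	"github.com/philippta/flyscrape"
)

func init() {
	flyscrape.RegisterModule(new(Module))
}

// Module drops requests whose URL ends in one of the configured
// extensions, e.g. ".pdf" or ".zip". Purely illustrative.
type Module struct {
	Extensions []string `json:"extensions"`
}

func (m *Module) ID() string {
	return "extensionfilter"
}

// CanRequest vetoes a URL before the scraper fetches it, the same hook
// used by the depth, domainfilter and urlfilter modules above.
func (m *Module) CanRequest(rawurl string, depth int) bool {
	for _, ext := range m.Extensions {
		if strings.HasSuffix(rawurl, ext) {
			return false
		}
	}
	return true
}

var (
	_ flyscrape.Module     = (*Module)(nil)
	_ flyscrape.CanRequest = (*Module)(nil)
)
```

The trailing interface assertions are the piece worth copying: if a module stops satisfying a hook interface it is meant to implement (for example, after a method rename), the build fails instead of the hook silently never being called.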