diff options
| field | value | date |
|---|---|---|
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-10-30 19:02:50 +0100 |
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-10-30 19:02:50 +0100 |
| commit | 2d3cd6584dedce45ea709d1757a28ce7537f3472 (patch) | |
| tree | c21ce25cd66731c56b3fd13c86734bd13ebd7d25 /modules | |
| parent | 2bfae5b426bf4a0b99d3979ed12d63cb50c39b17 (diff) | |
Refactor to prepare for builtin JS functions
Diffstat (limited to 'modules')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | modules/depth/depth_test.go | 49 |
| -rw-r--r-- | modules/domainfilter/domainfilter_test.go | 107 |
| -rw-r--r-- | modules/followlinks/followlinks_test.go | 124 |
| -rw-r--r-- | modules/proxy/proxy_test.go | 14 |
| -rw-r--r-- | modules/ratelimit/ratelimit_test.go | 27 |
| -rw-r--r-- | modules/starturl/starturl_test.go | 23 |
| -rw-r--r-- | modules/urlfilter/urlfilter_test.go | 70 |
7 files changed, 225 insertions, 189 deletions
diff --git a/modules/depth/depth_test.go b/modules/depth/depth_test.go index 10b67e9..a596eb4 100644 --- a/modules/depth/depth_test.go +++ b/modules/depth/depth_test.go @@ -21,31 +21,34 @@ func TestDepth(t *testing.T) { var urls []string var mu sync.Mutex - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"}) - scraper.LoadModule(&followlinks.Module{}) - scraper.LoadModule(&depth.Module{Depth: 2}) - scraper.LoadModule(hook.Module{ - AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { - return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { - switch r.URL.String() { - case "http://www.example.com": - return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`) - case "http://www.google.com": - return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`) - case "http://www.duckduckgo.com": - return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`) - } - return flyscrape.MockResponse(200, "") - }) - }, - ReceiveResponseFn: func(r *flyscrape.Response) { - mu.Lock() - urls = append(urls, r.Request.URL) - mu.Unlock() + mods := []flyscrape.Module{ + &starturl.Module{URL: "http://www.example.com"}, + &followlinks.Module{}, + &depth.Module{Depth: 2}, + hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { + switch r.URL.String() { + case "http://www.example.com": + return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`) + case "http://www.google.com": + return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`) + case "http://www.duckduckgo.com": + return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`) + } + return flyscrape.MockResponse(200, "") + }) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = 
append(urls, r.Request.URL) + mu.Unlock() + }, }, - }) + } + scraper := flyscrape.NewScraper() + scraper.Modules = mods scraper.Run() require.Len(t, urls, 3) diff --git a/modules/domainfilter/domainfilter_test.go b/modules/domainfilter/domainfilter_test.go index a1c8401..ace9430 100644 --- a/modules/domainfilter/domainfilter_test.go +++ b/modules/domainfilter/domainfilter_test.go @@ -21,26 +21,29 @@ func TestDomainfilterAllowed(t *testing.T) { var urls []string var mu sync.Mutex - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"}) - scraper.LoadModule(&followlinks.Module{}) - scraper.LoadModule(&domainfilter.Module{ - URL: "http://www.example.com", - AllowedDomains: []string{"www.google.com"}, - }) - scraper.LoadModule(hook.Module{ - AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { - return flyscrape.MockTransport(200, ` + mods := []flyscrape.Module{ + &starturl.Module{URL: "http://www.example.com"}, + &followlinks.Module{}, + &domainfilter.Module{ + URL: "http://www.example.com", + AllowedDomains: []string{"www.google.com"}, + }, + hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` <a href="http://www.google.com">Google</a> <a href="http://www.duckduckgo.com">DuckDuckGo</a>`) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, }, - ReceiveResponseFn: func(r *flyscrape.Response) { - mu.Lock() - urls = append(urls, r.Request.URL) - mu.Unlock() - }, - }) + } + scraper := flyscrape.NewScraper() + scraper.Modules = mods scraper.Run() require.Len(t, urls, 2) @@ -52,26 +55,29 @@ func TestDomainfilterAllowedAll(t *testing.T) { var urls []string var mu sync.Mutex - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"}) - scraper.LoadModule(&followlinks.Module{}) - scraper.LoadModule(&domainfilter.Module{ - URL: 
"http://www.example.com", - AllowedDomains: []string{"*"}, - }) - scraper.LoadModule(hook.Module{ - AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { - return flyscrape.MockTransport(200, ` + mods := []flyscrape.Module{ + &starturl.Module{URL: "http://www.example.com"}, + &followlinks.Module{}, + &domainfilter.Module{ + URL: "http://www.example.com", + AllowedDomains: []string{"*"}, + }, + hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` <a href="http://www.google.com">Google</a> <a href="http://www.duckduckgo.com">DuckDuckGo</a>`) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, }, - ReceiveResponseFn: func(r *flyscrape.Response) { - mu.Lock() - urls = append(urls, r.Request.URL) - mu.Unlock() - }, - }) + } + scraper := flyscrape.NewScraper() + scraper.Modules = mods scraper.Run() require.Len(t, urls, 3) @@ -84,27 +90,30 @@ func TestDomainfilterBlocked(t *testing.T) { var urls []string var mu sync.Mutex - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"}) - scraper.LoadModule(&followlinks.Module{}) - scraper.LoadModule(&domainfilter.Module{ - URL: "http://www.example.com", - AllowedDomains: []string{"*"}, - BlockedDomains: []string{"www.google.com"}, - }) - scraper.LoadModule(hook.Module{ - AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { - return flyscrape.MockTransport(200, ` + mods := []flyscrape.Module{ + &starturl.Module{URL: "http://www.example.com"}, + &followlinks.Module{}, + &domainfilter.Module{ + URL: "http://www.example.com", + AllowedDomains: []string{"*"}, + BlockedDomains: []string{"www.google.com"}, + }, + hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` <a href="http://www.google.com">Google</a> <a href="http://www.duckduckgo.com">DuckDuckGo</a>`) + }, 
+ ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, }, - ReceiveResponseFn: func(r *flyscrape.Response) { - mu.Lock() - urls = append(urls, r.Request.URL) - mu.Unlock() - }, - }) + } + scraper := flyscrape.NewScraper() + scraper.Modules = mods scraper.Run() require.Len(t, urls, 2) diff --git a/modules/followlinks/followlinks_test.go b/modules/followlinks/followlinks_test.go index f3eb4fe..af186f9 100644 --- a/modules/followlinks/followlinks_test.go +++ b/modules/followlinks/followlinks_test.go @@ -20,24 +20,26 @@ func TestFollowLinks(t *testing.T) { var urls []string var mu sync.Mutex - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"}) - scraper.LoadModule(&followlinks.Module{}) - - scraper.LoadModule(hook.Module{ - AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { - return flyscrape.MockTransport(200, ` + mods := []flyscrape.Module{ + &starturl.Module{URL: "http://www.example.com/foo/bar"}, + &followlinks.Module{}, + hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` <a href="/baz">Baz</a> <a href="baz">Baz</a> <a href="http://www.google.com">Google</a>`) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, }, - ReceiveResponseFn: func(r *flyscrape.Response) { - mu.Lock() - urls = append(urls, r.Request.URL) - mu.Unlock() - }, - }) + } + scraper := flyscrape.NewScraper() + scraper.Modules = mods scraper.Run() require.Len(t, urls, 5) @@ -52,28 +54,30 @@ func TestFollowSelector(t *testing.T) { var urls []string var mu sync.Mutex - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"}) - scraper.LoadModule(&followlinks.Module{ - Follow: []string{".next a[href]"}, - }) - - scraper.LoadModule(hook.Module{ - AdaptTransportFn: func(rt 
http.RoundTripper) http.RoundTripper { - return flyscrape.MockTransport(200, ` + mods := []flyscrape.Module{ + &starturl.Module{URL: "http://www.example.com/foo/bar"}, + &followlinks.Module{ + Follow: []string{".next a[href]"}, + }, + hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` <a href="/baz">Baz</a> <a href="baz">Baz</a> <div class="next"> <a href="http://www.google.com">Google</a> </div>`) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, }, - ReceiveResponseFn: func(r *flyscrape.Response) { - mu.Lock() - urls = append(urls, r.Request.URL) - mu.Unlock() - }, - }) + } + scraper := flyscrape.NewScraper() + scraper.Modules = mods scraper.Run() require.Len(t, urls, 2) @@ -85,26 +89,28 @@ func TestFollowDataAttr(t *testing.T) { var urls []string var mu sync.Mutex - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"}) - scraper.LoadModule(&followlinks.Module{ - Follow: []string{"[data-url]"}, - }) - - scraper.LoadModule(hook.Module{ - AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { - return flyscrape.MockTransport(200, ` + mods := []flyscrape.Module{ + &starturl.Module{URL: "http://www.example.com/foo/bar"}, + &followlinks.Module{ + Follow: []string{"[data-url]"}, + }, + hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` <a href="/baz">Baz</a> <a href="baz">Baz</a> <div data-url="http://www.google.com">Google</div>`) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, }, - ReceiveResponseFn: func(r *flyscrape.Response) { - mu.Lock() - urls = append(urls, r.Request.URL) - mu.Unlock() - }, - }) + } + scraper := flyscrape.NewScraper() + scraper.Modules = mods scraper.Run() require.Len(t, urls, 2) @@ -116,26 
+122,28 @@ func TestFollowMultiple(t *testing.T) { var urls []string var mu sync.Mutex - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"}) - scraper.LoadModule(&followlinks.Module{ - Follow: []string{"a.prev", "a.next"}, - }) - - scraper.LoadModule(hook.Module{ - AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { - return flyscrape.MockTransport(200, ` + mods := []flyscrape.Module{ + &starturl.Module{URL: "http://www.example.com/foo/bar"}, + &followlinks.Module{ + Follow: []string{"a.prev", "a.next"}, + }, + hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` <a href="/baz">Baz</a> <a class="prev" href="a">a</a> <a class="next" href="b">b</a>`) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, }, - ReceiveResponseFn: func(r *flyscrape.Response) { - mu.Lock() - urls = append(urls, r.Request.URL) - mu.Unlock() - }, - }) + } + scraper := flyscrape.NewScraper() + scraper.Modules = mods scraper.Run() require.Len(t, urls, 3) diff --git a/modules/proxy/proxy_test.go b/modules/proxy/proxy_test.go index e6058b8..62da23a 100644 --- a/modules/proxy/proxy_test.go +++ b/modules/proxy/proxy_test.go @@ -20,13 +20,17 @@ func TestProxy(t *testing.T) { p := newProxy(func() { called = true }) defer p.Close() - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"}) - scraper.LoadModule(&proxy.Module{ - Proxies: []string{p.URL}, - }) + mods := []flyscrape.Module{ + &starturl.Module{URL: "http://www.example.com"}, + &proxy.Module{ + Proxies: []string{p.URL}, + }, + } + scraper := flyscrape.NewScraper() + scraper.Modules = mods scraper.Run() + require.True(t, called) } diff --git a/modules/ratelimit/ratelimit_test.go b/modules/ratelimit/ratelimit_test.go index 1fe22b1..7be29a1 100644 --- a/modules/ratelimit/ratelimit_test.go 
+++ b/modules/ratelimit/ratelimit_test.go @@ -20,22 +20,25 @@ import ( func TestRatelimit(t *testing.T) { var times []time.Time - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"}) - scraper.LoadModule(&followlinks.Module{}) - scraper.LoadModule(hook.Module{ - AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { - return flyscrape.MockTransport(200, `<a href="foo">foo</a>`) + mods := []flyscrape.Module{ + &starturl.Module{URL: "http://www.example.com"}, + &followlinks.Module{}, + hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, `<a href="foo">foo</a>`) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + times = append(times, time.Now()) + }, }, - ReceiveResponseFn: func(r *flyscrape.Response) { - times = append(times, time.Now()) + &ratelimit.Module{ + Rate: 100, }, - }) - scraper.LoadModule(&ratelimit.Module{ - Rate: 100, - }) + } start := time.Now() + scraper := flyscrape.NewScraper() + scraper.Modules = mods scraper.Run() first := times[0].Add(-10 * time.Millisecond) diff --git a/modules/starturl/starturl_test.go b/modules/starturl/starturl_test.go index 86e4ad7..78efa6a 100644 --- a/modules/starturl/starturl_test.go +++ b/modules/starturl/starturl_test.go @@ -18,18 +18,21 @@ func TestStartURL(t *testing.T) { var url string var depth int - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"}) - scraper.LoadModule(hook.Module{ - AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { - return flyscrape.MockTransport(200, "") - }, - BuildRequestFn: func(r *flyscrape.Request) { - url = r.URL - depth = r.Depth + mods := []flyscrape.Module{ + &starturl.Module{URL: "http://www.example.com/foo/bar"}, + hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, "") + }, + BuildRequestFn: func(r 
*flyscrape.Request) { + url = r.URL + depth = r.Depth + }, }, - }) + } + scraper := flyscrape.NewScraper() + scraper.Modules = mods scraper.Run() require.Equal(t, "http://www.example.com/foo/bar", url) diff --git a/modules/urlfilter/urlfilter_test.go b/modules/urlfilter/urlfilter_test.go index 9ebb8a5..442780d 100644 --- a/modules/urlfilter/urlfilter_test.go +++ b/modules/urlfilter/urlfilter_test.go @@ -21,28 +21,31 @@ func TestURLFilterAllowed(t *testing.T) { var urls []string var mu sync.Mutex - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"}) - scraper.LoadModule(&followlinks.Module{}) - scraper.LoadModule(&urlfilter.Module{ - URL: "http://www.example.com/", - AllowedURLs: []string{`/foo\?id=\d+`, `/bar$`}, - }) - scraper.LoadModule(hook.Module{ - AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { - return flyscrape.MockTransport(200, ` + mods := []flyscrape.Module{ + &starturl.Module{URL: "http://www.example.com/"}, + &followlinks.Module{}, + &urlfilter.Module{ + URL: "http://www.example.com/", + AllowedURLs: []string{`/foo\?id=\d+`, `/bar$`}, + }, + hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` <a href="foo?id=123">123</a> <a href="foo?id=ABC">ABC</a> <a href="/bar">bar</a> <a href="/barz">barz</a>`) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, }, - ReceiveResponseFn: func(r *flyscrape.Response) { - mu.Lock() - urls = append(urls, r.Request.URL) - mu.Unlock() - }, - }) + } + scraper := flyscrape.NewScraper() + scraper.Modules = mods scraper.Run() require.Len(t, urls, 3) @@ -55,28 +58,31 @@ func TestURLFilterBlocked(t *testing.T) { var urls []string var mu sync.Mutex - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"}) - scraper.LoadModule(&followlinks.Module{}) - 
scraper.LoadModule(&urlfilter.Module{ - URL: "http://www.example.com/", - BlockedURLs: []string{`/foo\?id=\d+`, `/bar$`}, - }) - scraper.LoadModule(hook.Module{ - AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { - return flyscrape.MockTransport(200, ` + mods := []flyscrape.Module{ + &starturl.Module{URL: "http://www.example.com/"}, + &followlinks.Module{}, + &urlfilter.Module{ + URL: "http://www.example.com/", + BlockedURLs: []string{`/foo\?id=\d+`, `/bar$`}, + }, + hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` <a href="foo?id=123">123</a> <a href="foo?id=ABC">ABC</a> <a href="/bar">bar</a> <a href="/barz">barz</a>`) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, }, - ReceiveResponseFn: func(r *flyscrape.Response) { - mu.Lock() - urls = append(urls, r.Request.URL) - mu.Unlock() - }, - }) + } + scraper := flyscrape.NewScraper() + scraper.Modules = mods scraper.Run() require.Len(t, urls, 3) |