summary | refs | log | tree | commit | diff
path: root/modules
diff options
context:
space:
mode:
Diffstat (limited to 'modules')
-rw-r--r-- modules/depth/depth_test.go 49
-rw-r--r-- modules/domainfilter/domainfilter_test.go 107
-rw-r--r-- modules/followlinks/followlinks_test.go 124
-rw-r--r-- modules/proxy/proxy_test.go 14
-rw-r--r-- modules/ratelimit/ratelimit_test.go 27
-rw-r--r-- modules/starturl/starturl_test.go 23
-rw-r--r-- modules/urlfilter/urlfilter_test.go 70
7 files changed, 225 insertions, 189 deletions
diff --git a/modules/depth/depth_test.go b/modules/depth/depth_test.go
index 10b67e9..a596eb4 100644
--- a/modules/depth/depth_test.go
+++ b/modules/depth/depth_test.go
@@ -21,31 +21,34 @@ func TestDepth(t *testing.T) {
var urls []string
var mu sync.Mutex
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
- scraper.LoadModule(&followlinks.Module{})
- scraper.LoadModule(&depth.Module{Depth: 2})
- scraper.LoadModule(hook.Module{
- AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
- return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
- switch r.URL.String() {
- case "http://www.example.com":
- return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`)
- case "http://www.google.com":
- return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
- case "http://www.duckduckgo.com":
- return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`)
- }
- return flyscrape.MockResponse(200, "")
- })
- },
- ReceiveResponseFn: func(r *flyscrape.Response) {
- mu.Lock()
- urls = append(urls, r.Request.URL)
- mu.Unlock()
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: "http://www.example.com"},
+ &followlinks.Module{},
+ &depth.Module{Depth: 2},
+ hook.Module{
+ AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+ return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
+ switch r.URL.String() {
+ case "http://www.example.com":
+ return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`)
+ case "http://www.google.com":
+ return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
+ case "http://www.duckduckgo.com":
+ return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`)
+ }
+ return flyscrape.MockResponse(200, "")
+ })
+ },
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ mu.Lock()
+ urls = append(urls, r.Request.URL)
+ mu.Unlock()
+ },
},
- })
+ }
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
scraper.Run()
require.Len(t, urls, 3)
diff --git a/modules/domainfilter/domainfilter_test.go b/modules/domainfilter/domainfilter_test.go
index a1c8401..ace9430 100644
--- a/modules/domainfilter/domainfilter_test.go
+++ b/modules/domainfilter/domainfilter_test.go
@@ -21,26 +21,29 @@ func TestDomainfilterAllowed(t *testing.T) {
var urls []string
var mu sync.Mutex
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
- scraper.LoadModule(&followlinks.Module{})
- scraper.LoadModule(&domainfilter.Module{
- URL: "http://www.example.com",
- AllowedDomains: []string{"www.google.com"},
- })
- scraper.LoadModule(hook.Module{
- AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
- return flyscrape.MockTransport(200, `
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: "http://www.example.com"},
+ &followlinks.Module{},
+ &domainfilter.Module{
+ URL: "http://www.example.com",
+ AllowedDomains: []string{"www.google.com"},
+ },
+ hook.Module{
+ AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+ return flyscrape.MockTransport(200, `
<a href="http://www.google.com">Google</a>
<a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
+ },
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ mu.Lock()
+ urls = append(urls, r.Request.URL)
+ mu.Unlock()
+ },
},
- ReceiveResponseFn: func(r *flyscrape.Response) {
- mu.Lock()
- urls = append(urls, r.Request.URL)
- mu.Unlock()
- },
- })
+ }
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
scraper.Run()
require.Len(t, urls, 2)
@@ -52,26 +55,29 @@ func TestDomainfilterAllowedAll(t *testing.T) {
var urls []string
var mu sync.Mutex
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
- scraper.LoadModule(&followlinks.Module{})
- scraper.LoadModule(&domainfilter.Module{
- URL: "http://www.example.com",
- AllowedDomains: []string{"*"},
- })
- scraper.LoadModule(hook.Module{
- AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
- return flyscrape.MockTransport(200, `
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: "http://www.example.com"},
+ &followlinks.Module{},
+ &domainfilter.Module{
+ URL: "http://www.example.com",
+ AllowedDomains: []string{"*"},
+ },
+ hook.Module{
+ AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+ return flyscrape.MockTransport(200, `
<a href="http://www.google.com">Google</a>
<a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
+ },
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ mu.Lock()
+ urls = append(urls, r.Request.URL)
+ mu.Unlock()
+ },
},
- ReceiveResponseFn: func(r *flyscrape.Response) {
- mu.Lock()
- urls = append(urls, r.Request.URL)
- mu.Unlock()
- },
- })
+ }
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
scraper.Run()
require.Len(t, urls, 3)
@@ -84,27 +90,30 @@ func TestDomainfilterBlocked(t *testing.T) {
var urls []string
var mu sync.Mutex
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
- scraper.LoadModule(&followlinks.Module{})
- scraper.LoadModule(&domainfilter.Module{
- URL: "http://www.example.com",
- AllowedDomains: []string{"*"},
- BlockedDomains: []string{"www.google.com"},
- })
- scraper.LoadModule(hook.Module{
- AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
- return flyscrape.MockTransport(200, `
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: "http://www.example.com"},
+ &followlinks.Module{},
+ &domainfilter.Module{
+ URL: "http://www.example.com",
+ AllowedDomains: []string{"*"},
+ BlockedDomains: []string{"www.google.com"},
+ },
+ hook.Module{
+ AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+ return flyscrape.MockTransport(200, `
<a href="http://www.google.com">Google</a>
<a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
+ },
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ mu.Lock()
+ urls = append(urls, r.Request.URL)
+ mu.Unlock()
+ },
},
- ReceiveResponseFn: func(r *flyscrape.Response) {
- mu.Lock()
- urls = append(urls, r.Request.URL)
- mu.Unlock()
- },
- })
+ }
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
scraper.Run()
require.Len(t, urls, 2)
diff --git a/modules/followlinks/followlinks_test.go b/modules/followlinks/followlinks_test.go
index f3eb4fe..af186f9 100644
--- a/modules/followlinks/followlinks_test.go
+++ b/modules/followlinks/followlinks_test.go
@@ -20,24 +20,26 @@ func TestFollowLinks(t *testing.T) {
var urls []string
var mu sync.Mutex
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"})
- scraper.LoadModule(&followlinks.Module{})
-
- scraper.LoadModule(hook.Module{
- AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
- return flyscrape.MockTransport(200, `
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: "http://www.example.com/foo/bar"},
+ &followlinks.Module{},
+ hook.Module{
+ AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+ return flyscrape.MockTransport(200, `
<a href="/baz">Baz</a>
<a href="baz">Baz</a>
<a href="http://www.google.com">Google</a>`)
+ },
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ mu.Lock()
+ urls = append(urls, r.Request.URL)
+ mu.Unlock()
+ },
},
- ReceiveResponseFn: func(r *flyscrape.Response) {
- mu.Lock()
- urls = append(urls, r.Request.URL)
- mu.Unlock()
- },
- })
+ }
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
scraper.Run()
require.Len(t, urls, 5)
@@ -52,28 +54,30 @@ func TestFollowSelector(t *testing.T) {
var urls []string
var mu sync.Mutex
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"})
- scraper.LoadModule(&followlinks.Module{
- Follow: []string{".next a[href]"},
- })
-
- scraper.LoadModule(hook.Module{
- AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
- return flyscrape.MockTransport(200, `
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: "http://www.example.com/foo/bar"},
+ &followlinks.Module{
+ Follow: []string{".next a[href]"},
+ },
+ hook.Module{
+ AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+ return flyscrape.MockTransport(200, `
<a href="/baz">Baz</a>
<a href="baz">Baz</a>
<div class="next">
<a href="http://www.google.com">Google</a>
</div>`)
+ },
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ mu.Lock()
+ urls = append(urls, r.Request.URL)
+ mu.Unlock()
+ },
},
- ReceiveResponseFn: func(r *flyscrape.Response) {
- mu.Lock()
- urls = append(urls, r.Request.URL)
- mu.Unlock()
- },
- })
+ }
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
scraper.Run()
require.Len(t, urls, 2)
@@ -85,26 +89,28 @@ func TestFollowDataAttr(t *testing.T) {
var urls []string
var mu sync.Mutex
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"})
- scraper.LoadModule(&followlinks.Module{
- Follow: []string{"[data-url]"},
- })
-
- scraper.LoadModule(hook.Module{
- AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
- return flyscrape.MockTransport(200, `
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: "http://www.example.com/foo/bar"},
+ &followlinks.Module{
+ Follow: []string{"[data-url]"},
+ },
+ hook.Module{
+ AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+ return flyscrape.MockTransport(200, `
<a href="/baz">Baz</a>
<a href="baz">Baz</a>
<div data-url="http://www.google.com">Google</div>`)
+ },
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ mu.Lock()
+ urls = append(urls, r.Request.URL)
+ mu.Unlock()
+ },
},
- ReceiveResponseFn: func(r *flyscrape.Response) {
- mu.Lock()
- urls = append(urls, r.Request.URL)
- mu.Unlock()
- },
- })
+ }
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
scraper.Run()
require.Len(t, urls, 2)
@@ -116,26 +122,28 @@ func TestFollowMultiple(t *testing.T) {
var urls []string
var mu sync.Mutex
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"})
- scraper.LoadModule(&followlinks.Module{
- Follow: []string{"a.prev", "a.next"},
- })
-
- scraper.LoadModule(hook.Module{
- AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
- return flyscrape.MockTransport(200, `
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: "http://www.example.com/foo/bar"},
+ &followlinks.Module{
+ Follow: []string{"a.prev", "a.next"},
+ },
+ hook.Module{
+ AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+ return flyscrape.MockTransport(200, `
<a href="/baz">Baz</a>
<a class="prev" href="a">a</a>
<a class="next" href="b">b</a>`)
+ },
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ mu.Lock()
+ urls = append(urls, r.Request.URL)
+ mu.Unlock()
+ },
},
- ReceiveResponseFn: func(r *flyscrape.Response) {
- mu.Lock()
- urls = append(urls, r.Request.URL)
- mu.Unlock()
- },
- })
+ }
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
scraper.Run()
require.Len(t, urls, 3)
diff --git a/modules/proxy/proxy_test.go b/modules/proxy/proxy_test.go
index e6058b8..62da23a 100644
--- a/modules/proxy/proxy_test.go
+++ b/modules/proxy/proxy_test.go
@@ -20,13 +20,17 @@ func TestProxy(t *testing.T) {
p := newProxy(func() { called = true })
defer p.Close()
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
- scraper.LoadModule(&proxy.Module{
- Proxies: []string{p.URL},
- })
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: "http://www.example.com"},
+ &proxy.Module{
+ Proxies: []string{p.URL},
+ },
+ }
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
scraper.Run()
+
require.True(t, called)
}
diff --git a/modules/ratelimit/ratelimit_test.go b/modules/ratelimit/ratelimit_test.go
index 1fe22b1..7be29a1 100644
--- a/modules/ratelimit/ratelimit_test.go
+++ b/modules/ratelimit/ratelimit_test.go
@@ -20,22 +20,25 @@ import (
func TestRatelimit(t *testing.T) {
var times []time.Time
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
- scraper.LoadModule(&followlinks.Module{})
- scraper.LoadModule(hook.Module{
- AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
- return flyscrape.MockTransport(200, `<a href="foo">foo</a>`)
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: "http://www.example.com"},
+ &followlinks.Module{},
+ hook.Module{
+ AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+ return flyscrape.MockTransport(200, `<a href="foo">foo</a>`)
+ },
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ times = append(times, time.Now())
+ },
},
- ReceiveResponseFn: func(r *flyscrape.Response) {
- times = append(times, time.Now())
+ &ratelimit.Module{
+ Rate: 100,
},
- })
- scraper.LoadModule(&ratelimit.Module{
- Rate: 100,
- })
+ }
start := time.Now()
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
scraper.Run()
first := times[0].Add(-10 * time.Millisecond)
diff --git a/modules/starturl/starturl_test.go b/modules/starturl/starturl_test.go
index 86e4ad7..78efa6a 100644
--- a/modules/starturl/starturl_test.go
+++ b/modules/starturl/starturl_test.go
@@ -18,18 +18,21 @@ func TestStartURL(t *testing.T) {
var url string
var depth int
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"})
- scraper.LoadModule(hook.Module{
- AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
- return flyscrape.MockTransport(200, "")
- },
- BuildRequestFn: func(r *flyscrape.Request) {
- url = r.URL
- depth = r.Depth
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: "http://www.example.com/foo/bar"},
+ hook.Module{
+ AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+ return flyscrape.MockTransport(200, "")
+ },
+ BuildRequestFn: func(r *flyscrape.Request) {
+ url = r.URL
+ depth = r.Depth
+ },
},
- })
+ }
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
scraper.Run()
require.Equal(t, "http://www.example.com/foo/bar", url)
diff --git a/modules/urlfilter/urlfilter_test.go b/modules/urlfilter/urlfilter_test.go
index 9ebb8a5..442780d 100644
--- a/modules/urlfilter/urlfilter_test.go
+++ b/modules/urlfilter/urlfilter_test.go
@@ -21,28 +21,31 @@ func TestURLFilterAllowed(t *testing.T) {
var urls []string
var mu sync.Mutex
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
- scraper.LoadModule(&followlinks.Module{})
- scraper.LoadModule(&urlfilter.Module{
- URL: "http://www.example.com/",
- AllowedURLs: []string{`/foo\?id=\d+`, `/bar$`},
- })
- scraper.LoadModule(hook.Module{
- AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
- return flyscrape.MockTransport(200, `
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: "http://www.example.com/"},
+ &followlinks.Module{},
+ &urlfilter.Module{
+ URL: "http://www.example.com/",
+ AllowedURLs: []string{`/foo\?id=\d+`, `/bar$`},
+ },
+ hook.Module{
+ AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+ return flyscrape.MockTransport(200, `
<a href="foo?id=123">123</a>
<a href="foo?id=ABC">ABC</a>
<a href="/bar">bar</a>
<a href="/barz">barz</a>`)
+ },
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ mu.Lock()
+ urls = append(urls, r.Request.URL)
+ mu.Unlock()
+ },
},
- ReceiveResponseFn: func(r *flyscrape.Response) {
- mu.Lock()
- urls = append(urls, r.Request.URL)
- mu.Unlock()
- },
- })
+ }
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
scraper.Run()
require.Len(t, urls, 3)
@@ -55,28 +58,31 @@ func TestURLFilterBlocked(t *testing.T) {
var urls []string
var mu sync.Mutex
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
- scraper.LoadModule(&followlinks.Module{})
- scraper.LoadModule(&urlfilter.Module{
- URL: "http://www.example.com/",
- BlockedURLs: []string{`/foo\?id=\d+`, `/bar$`},
- })
- scraper.LoadModule(hook.Module{
- AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
- return flyscrape.MockTransport(200, `
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: "http://www.example.com/"},
+ &followlinks.Module{},
+ &urlfilter.Module{
+ URL: "http://www.example.com/",
+ BlockedURLs: []string{`/foo\?id=\d+`, `/bar$`},
+ },
+ hook.Module{
+ AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+ return flyscrape.MockTransport(200, `
<a href="foo?id=123">123</a>
<a href="foo?id=ABC">ABC</a>
<a href="/bar">bar</a>
<a href="/barz">barz</a>`)
+ },
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ mu.Lock()
+ urls = append(urls, r.Request.URL)
+ mu.Unlock()
+ },
},
- ReceiveResponseFn: func(r *flyscrape.Response) {
- mu.Lock()
- urls = append(urls, r.Request.URL)
- mu.Unlock()
- },
- })
+ }
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
scraper.Run()
require.Len(t, urls, 3)