| | | |
|---|---|---|
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-27 19:10:49 +0200 |
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-27 19:10:49 +0200 |
| commit | 5c16435e2218344a6e232ebb48cf022a32ba85d5 | (patch) |
| tree | 3cfa1dbc1f489ba4509fc408a8c0afccca7f9c7c | /scrape_test.go |
| parent | 52107c13b4c2c4efa9269b187916f3195be5a10d | (diff) |
add tests and allow urls
Diffstat (limited to 'scrape_test.go')
| -rw-r--r-- | scrape_test.go | 234 |
1 file changed, 211 insertions, 23 deletions
diff --git a/scrape_test.go b/scrape_test.go
index 602be9f..acfbbbf 100644
--- a/scrape_test.go
+++ b/scrape_test.go
@@ -5,46 +5,234 @@ package flyscrape_test
 
 import (
-	"sort"
+	"net/http"
+	"net/http/httptest"
 	"testing"
+	"time"
 
 	"flyscrape"
 
 	"github.com/stretchr/testify/require"
 )
 
-func TestScrape(t *testing.T) {
-	svc := flyscrape.Scraper{
+func TestScrapeFollowLinks(t *testing.T) {
+	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:            "http://example.com/foo/bar",
-			Depth:          1,
-			AllowedDomains: []string{"example.com", "www.google.com"},
+			URL:          "http://www.example.com/foo/bar",
+			Depth:        1,
+			AllowDomains: []string{"www.google.com"},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
-			return map[string]any{
-				"url": params.URL,
-			}, nil
+			return "foobar", nil
 		},
 		FetchFunc: func(url string) (string, error) {
-			return `<html>
-				<body>
-					<a href="/baz">Baz</a>
+			return `<a href="/baz">Baz</a>
 				<a href="baz">Baz</a>
-					<a href="http://www.google.com">Google</a>
-				</body>
-			</html>`, nil
+				<a href="http://www.google.com">Google</a>`, nil
 		},
 	}
 
-	var urls []string
-	for res := range svc.Scrape() {
-		urls = append(urls, res.URL)
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
 	}
 
-	sort.Strings(urls)
 	require.Len(t, urls, 4)
-	require.Equal(t, "http://example.com/baz", urls[0])
-	require.Equal(t, "http://example.com/foo/bar", urls[1])
-	require.Equal(t, "http://example.com/foo/baz", urls[2])
-	require.Equal(t, "http://www.google.com/", urls[3])
+	require.Contains(t, urls, "http://www.example.com/baz")
+	require.Contains(t, urls, "http://www.example.com/foo/bar")
+	require.Contains(t, urls, "http://www.example.com/foo/baz")
+	require.Contains(t, urls, "http://www.google.com/")
+}
+
+func TestScrapeDepth(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:          "http://www.example.com/",
+			Depth:        2,
+			AllowDomains: []string{"*"},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			switch url {
+			case "http://www.example.com/":
+				return `<a href="http://www.google.com">Google</a>`, nil
+			case "http://www.google.com/":
+				return `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+			case "http://www.duckduckgo.com/":
+				return `<a href="http://www.example.com">Example</a>`, nil
+			}
+			return "", nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.google.com/")
+	require.Contains(t, urls, "http://www.duckduckgo.com/")
+}
+
+func TestScrapeAllowDomains(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:          "http://www.example.com/",
+			Depth:        1,
+			AllowDomains: []string{"www.google.com"},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="http://www.google.com">Google</a>
+				<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 2)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.google.com/")
+}
+
+func TestScrapeAllowDomainsAll(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:          "http://www.example.com/",
+			Depth:        1,
+			AllowDomains: []string{"*"},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="http://www.google.com">Google</a>
+				<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.duckduckgo.com/")
+	require.Contains(t, urls, "http://www.google.com/")
+}
+
+func TestScrapeDenyDomains(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:          "http://www.example.com/",
+			Depth:        1,
+			AllowDomains: []string{"*"},
+			DenyDomains:  []string{"www.google.com"},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="http://www.google.com">Google</a>
+				<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 2)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.duckduckgo.com/")
+}
+
+func TestScrapeAllowURLs(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:       "http://www.example.com/",
+			Depth:     1,
+			AllowURLs: []string{`/foo\?id=\d+`, `/bar$`},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="foo?id=123">123</a>
+				<a href="foo?id=ABC">ABC</a>
+				<a href="/bar">bar</a>
+				<a href="/barz">barz</a>`, nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.example.com/foo?id=123")
+	require.Contains(t, urls, "http://www.example.com/bar")
+}
+
+func TestScrapeRate(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:   "http://www.example.com/",
+			Depth: 1,
+			Rate:  100, // every 10ms
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="foo">foo</a>`, nil
+		},
+	}
+
+	res := scr.Scrape()
+
+	start := time.Now()
+	<-res
+	first := time.Now().Add(-10 * time.Millisecond)
+	<-res
+	second := time.Now().Add(-20 * time.Millisecond)
+
+	require.Less(t, first.Sub(start), 2*time.Millisecond)
+	require.Less(t, second.Sub(start), 2*time.Millisecond)
+}
+
+func TestScrapeProxy(t *testing.T) {
+	proxyCalled := false
+	proxy := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		proxyCalled = true
+		w.Write([]byte(`<a href="http://www.google.com">Google</a>`))
+	}))
+
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:   "http://www.example.com/",
+			Proxy: proxy.URL,
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+	}
+
+	res := <-scr.Scrape()
+
+	require.True(t, proxyCalled)
+	require.Equal(t, "http://www.example.com/", res.URL)
 }
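
Taken together, the new tests outline the option surface of `flyscrape.ScrapeOptions`: `Depth`, `AllowDomains`, `DenyDomains`, `AllowURLs`, `Rate`, and `Proxy`. The sketch below shows how these options might be combined outside a test. The field names, the `ScrapeFunc` signature, and the channel returned by `Scrape` are taken from the test code above; the `main` wrapper, the returned map, and the assumption that omitting `FetchFunc` makes the scraper issue real HTTP requests (as `TestScrapeProxy` suggests) are illustrative guesses, not part of this commit.

```go
// Speculative usage sketch based only on the fields exercised by the tests in this commit.
package main

import (
	"fmt"

	"flyscrape"
)

func main() {
	scr := flyscrape.Scraper{
		ScrapeOptions: flyscrape.ScrapeOptions{
			URL:          "http://www.example.com/",
			Depth:        2,                          // follow links two levels deep (see TestScrapeDepth)
			AllowDomains: []string{"*"},              // "*" allows every domain (see TestScrapeAllowDomainsAll)
			DenyDomains:  []string{"www.google.com"}, // denied even when allowed by "*" (see TestScrapeDenyDomains)
			AllowURLs:    []string{`/foo\?id=\d+`},   // regexp filter on followed links (see TestScrapeAllowURLs)
			Rate:         100,                        // per the test's comment: one request every 10ms
		},
		// ScrapeFunc produces the data attached to each visited page.
		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
			return map[string]any{"url": params.URL}, nil
		},
		// FetchFunc is omitted here; presumably the scraper then fetches pages over HTTP,
		// optionally through ScrapeOptions.Proxy as in TestScrapeProxy.
	}

	// Scrape returns a channel of results; each result carries at least the visited URL.
	for res := range scr.Scrape() {
		fmt.Println(res.URL)
	}
}
```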