From 11d73f57a80bb65b7507ec80433b8f035ed226c2 Mon Sep 17 00:00:00 2001 From: Philipp Tanlak Date: Tue, 17 Oct 2023 18:16:58 +0200 Subject: Allow configuration of links to follow --- modules/followlinks/followlinks_test.go | 96 +++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) (limited to 'modules/followlinks/followlinks_test.go') diff --git a/modules/followlinks/followlinks_test.go b/modules/followlinks/followlinks_test.go index 0a628c3..f3eb4fe 100644 --- a/modules/followlinks/followlinks_test.go +++ b/modules/followlinks/followlinks_test.go @@ -47,3 +47,99 @@ func TestFollowLinks(t *testing.T) { require.Contains(t, urls, "http://www.google.com") require.Contains(t, urls, "http://www.google.com/baz") } + +func TestFollowSelector(t *testing.T) { + var urls []string + var mu sync.Mutex + + scraper := flyscrape.NewScraper() + scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"}) + scraper.LoadModule(&followlinks.Module{ + Follow: []string{".next a[href]"}, + }) + + scraper.LoadModule(hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` + Baz + Baz + `) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, + }) + + scraper.Run() + + require.Len(t, urls, 2) + require.Contains(t, urls, "http://www.example.com/foo/bar") + require.Contains(t, urls, "http://www.google.com") +} + +func TestFollowDataAttr(t *testing.T) { + var urls []string + var mu sync.Mutex + + scraper := flyscrape.NewScraper() + scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"}) + scraper.LoadModule(&followlinks.Module{ + Follow: []string{"[data-url]"}, + }) + + scraper.LoadModule(hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` + Baz + Baz +
Google
`) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, + }) + + scraper.Run() + + require.Len(t, urls, 2) + require.Contains(t, urls, "http://www.example.com/foo/bar") + require.Contains(t, urls, "http://www.google.com") +} + +func TestFollowMultiple(t *testing.T) { + var urls []string + var mu sync.Mutex + + scraper := flyscrape.NewScraper() + scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"}) + scraper.LoadModule(&followlinks.Module{ + Follow: []string{"a.prev", "a.next"}, + }) + + scraper.LoadModule(hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` + Baz + + `) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, + }) + + scraper.Run() + + require.Len(t, urls, 3) + require.Contains(t, urls, "http://www.example.com/foo/bar") + require.Contains(t, urls, "http://www.example.com/foo/a") + require.Contains(t, urls, "http://www.example.com/foo/b") +} -- cgit v1.2.3