| field | value | date |
|---|---|---|
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2025-01-10 13:09:50 +0100 |
| committer | GitHub <noreply@github.com> | 2025-01-10 13:09:50 +0100 |
| commit | 40e02d5d28e59dbeb6134afdce12536c602e6aa5 | |
| tree | 24c84c4084cf72e552804397eecc9f5bb3c2d4be | |
| parent | bf99c233a18c3165e0d4d251b41224e5bc6eb93d | |
Implement manual following (#82)
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | README.md | 10 |
| -rw-r--r-- | examples/hackernews_manual_follow.js | 13 |
| -rw-r--r-- | js.go | 4 |
| -rw-r--r-- | js_test.go | 21 |
| -rw-r--r-- | modules/followlinks/followlinks.go | 16 |
| -rw-r--r-- | modules/followlinks/followlinks_test.go | 40 |
| -rw-r--r-- | scrape.go | 3 |
7 files changed, 98 insertions, 9 deletions
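
In short: scrape scripts gain a `follow(url)` callback for manual link following, wired through a new `Follow` field on `ScrapeParams`, and setting `follow: []` in the config now disables automatic following. Below is a minimal sketch of driving the new callback from Go, modeled on the `TestJSScrapeParamFollow` test in the diff that follows; the module path and the placeholder HTML/URL values are assumptions for illustration.

```go
package main

import (
	"fmt"

	"github.com/philippta/flyscrape" // assumed module path
)

func main() {
	// A script that follows a relative link manually instead of
	// relying on the automatic link-following module.
	js := `
	export default function ({ follow }) {
		follow("/foo");
	}
	`

	exports, err := flyscrape.Compile(js, nil)
	if err != nil {
		panic(err)
	}

	// The script's follow("/foo") call is resolved against URL before
	// the Follow callback receives it (see the js.go hunk below).
	_, err = exports.Scrape(flyscrape.ScrapeParams{
		HTML: "<html></html>",
		URL:  "http://localhost/",
		Follow: func(url string) {
			fmt.Println("queued:", url) // queued: http://localhost/foo
		},
	})
	if err != nil {
		panic(err)
	}
}
```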
````diff
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -204,7 +204,9 @@ export const config = {
   // Specify how deep links should be followed. (default = 0, no follow)
   depth: 5,
 
-  // Speficy the css selectors to follow. (default = ["a[href]"])
+  // Specify the css selectors to follow. (default = ["a[href]"])
+  // Setting follow to [] disables automatic following.
+  // Can later be used with manual following.
   follow: [".next > a", ".related a"],
 
   // Specify the allowed domains. ['*'] for all. (default = domain from url)
@@ -260,7 +262,7 @@ export const config = {
   },
 };
 
-export default function ({ doc, url, absoluteURL, scrape }) {
+export default function ({ doc, url, absoluteURL, scrape, follow }) {
 
   // doc
   // Contains the parsed HTML document.
@@ -274,6 +276,10 @@ export default function ({ doc, url, absoluteURL, scrape }) {
   //   return { ... };
   // })
   // Scrapes a linked page and returns the scrape result.
+
+  // follow("/foo")
+  // Follows a link manually.
+  // Disable automatic following with `follow: []` for best results.
 }
 ```
 
diff --git a/examples/hackernews_manual_follow.js b/examples/hackernews_manual_follow.js
new file mode 100644
index 0000000..16e7bee
--- /dev/null
+++ b/examples/hackernews_manual_follow.js
@@ -0,0 +1,13 @@
+export const config = {
+  url: "https://news.ycombinator.com/",
+  depth: 2,
+  follow: [],
+};
+
+export default function({ url, doc, follow }) {
+  const next = doc.find(".morelink").attr("href");
+
+  follow(next);
+
+  return { url, next };
+}
diff --git a/js.go b/js.go
--- a/js.go
+++ b/js.go
@@ -30,6 +30,7 @@ type ScrapeParams struct {
 	HTML    string
 	URL     string
 	Process func(url string) ([]byte, error)
+	Follow  func(url string)
 }
 
 type ScrapeFunc func(ScrapeParams) (any, error)
@@ -213,6 +214,9 @@ func scrape(vm *goja.Runtime) (ScrapeFunc, error) {
 			return f(goja.FunctionCall{Arguments: []goja.Value{arg}})
 		})
 
+		o.Set("follow", func(url string) {
+			p.Follow(absoluteURL(url))
+		})
 		return o, nil
 	}
 
diff --git a/js_test.go b/js_test.go
--- a/js_test.go
+++ b/js_test.go
@@ -260,6 +260,27 @@ func TestJSScrapeParamScrapeDeep(t *testing.T) {
 	}, result)
 }
 
+func TestJSScrapeParamFollow(t *testing.T) {
+	js := `
+	export default function({ follow }) {
+		follow("/foo")
+	}
+	`
+	exports, err := flyscrape.Compile(js, nil)
+	require.NoError(t, err)
+
+	var followedURL string
+	_, err = exports.Scrape(flyscrape.ScrapeParams{
+		HTML: html,
+		URL:  "http://localhost/",
+		Follow: func(url string) {
+			followedURL = url
+		},
+	})
+	require.NoError(t, err)
+	require.Equal(t, "http://localhost/foo", followedURL)
+}
+
 func TestJSCompileError(t *testing.T) {
 	exports, err := flyscrape.Compile("import foo;", nil)
 	require.Error(t, err)
diff --git a/modules/followlinks/followlinks.go b/modules/followlinks/followlinks.go
index c1448be..3ce2797 100644
--- a/modules/followlinks/followlinks.go
+++ b/modules/followlinks/followlinks.go
@@ -18,7 +18,7 @@ func init() {
 }
 
 type Module struct {
-	Follow []string `json:"follow"`
+	Follow *[]string `json:"follow"`
 }
 
 func (Module) ModuleInfo() flyscrape.ModuleInfo {
@@ -29,18 +29,26 @@
 }
 
 func (m *Module) Provision(ctx flyscrape.Context) {
-	if len(m.Follow) == 0 {
-		m.Follow = []string{"a[href]"}
+	if m.Follow == nil {
+		m.Follow = &[]string{"a[href]"}
 	}
 }
 
 func (m *Module) ReceiveResponse(resp *flyscrape.Response) {
+	if m.Follow == nil {
+		return
+	}
+
 	for _, link := range m.parseLinks(string(resp.Body), resp.Request.URL) {
 		resp.Visit(link)
 	}
 }
 
 func (m *Module) parseLinks(html string, origin string) []string {
+	if m.Follow == nil {
+		return nil
+	}
+
 	var links []string
 	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
 	if err != nil {
@@ -54,7 +62,7 @@ func (m *Module) parseLinks(html string, origin string) []string {
 	}
 
 	uniqueLinks := make(map[string]bool)
-	for _, selector := range m.Follow {
+	for _, selector := range *m.Follow {
 		attr := parseSelectorAttr(selector)
 		doc.Find(selector).Each(func(i int, s *goquery.Selection) {
 			link, _ := s.Attr(attr)
diff --git a/modules/followlinks/followlinks_test.go b/modules/followlinks/followlinks_test.go
index af186f9..b09b0cd 100644
--- a/modules/followlinks/followlinks_test.go
+++ b/modules/followlinks/followlinks_test.go
@@ -57,7 +57,7 @@ func TestFollowSelector(t *testing.T) {
 	mods := []flyscrape.Module{
 		&starturl.Module{URL: "http://www.example.com/foo/bar"},
 		&followlinks.Module{
-			Follow: []string{".next a[href]"},
+			Follow: &[]string{".next a[href]"},
 		},
 		hook.Module{
 			AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
@@ -92,7 +92,7 @@ func TestFollowDataAttr(t *testing.T) {
 	mods := []flyscrape.Module{
 		&starturl.Module{URL: "http://www.example.com/foo/bar"},
 		&followlinks.Module{
-			Follow: []string{"[data-url]"},
+			Follow: &[]string{"[data-url]"},
 		},
 		hook.Module{
 			AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
@@ -125,7 +125,7 @@ func TestFollowMultiple(t *testing.T) {
 	mods := []flyscrape.Module{
 		&starturl.Module{URL: "http://www.example.com/foo/bar"},
 		&followlinks.Module{
-			Follow: []string{"a.prev", "a.next"},
+			Follow: &[]string{"a.prev", "a.next"},
 		},
 		hook.Module{
 			AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
@@ -151,3 +151,37 @@ func TestFollowMultiple(t *testing.T) {
 	require.Contains(t, urls, "http://www.example.com/foo/a")
 	require.Contains(t, urls, "http://www.example.com/foo/b")
 }
+
+func TestFollowNoFollow(t *testing.T) {
+	var urls []string
+	var mu sync.Mutex
+
+	mods := []flyscrape.Module{
+		&starturl.Module{URL: "http://www.example.com/foo/bar"},
+		&followlinks.Module{
+			Follow: &[]string{},
+		},
+		hook.Module{
+			AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+				return flyscrape.MockTransport(200, `
+					<a href="/baz">Baz</a>
+					<a href="baz">Baz</a>
+					<div class="next">
+						<a href="http://www.google.com">Google</a>
+					</div>`)
+			},
+			ReceiveResponseFn: func(r *flyscrape.Response) {
+				mu.Lock()
+				urls = append(urls, r.Request.URL)
+				mu.Unlock()
+			},
+		},
+	}
+
+	scraper := flyscrape.NewScraper()
+	scraper.Modules = mods
+	scraper.Run()
+
+	require.Len(t, urls, 1)
+	require.Contains(t, urls, "http://www.example.com/foo/bar")
+}
diff --git a/scrape.go b/scrape.go
--- a/scrape.go
+++ b/scrape.go
@@ -207,6 +207,9 @@ func (s *Scraper) process(url string, depth int) {
 		HTML:    string(response.Body),
 		URL:     request.URL,
 		Process: s.processImmediate,
+		Follow: func(url string) {
+			s.enqueueJob(url, depth+1)
+		},
 	}
 
 	response.Data, err = s.ScrapeFunc(p)
````
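
A note on the design: changing `Module.Follow` from `[]string` to `*[]string` is what makes `follow: []` meaningful. A nil pointer means the option was omitted, so `Provision` installs the default `a[href]` selector; a non-nil pointer to an empty slice means the user explicitly disabled automatic following. Here is a standalone sketch of that decode-level distinction, using plain `encoding/json` with a struct mirroring the module's field (a hypothetical example, not code from the repository):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Mirrors the module's config field: the pointer distinguishes
// "key omitted" (nil) from "explicitly empty" (&[]string{}).
type Module struct {
	Follow *[]string `json:"follow"`
}

func main() {
	var omitted, empty Module
	_ = json.Unmarshal([]byte(`{}`), &omitted)
	_ = json.Unmarshal([]byte(`{"follow": []}`), &empty)

	fmt.Println(omitted.Follow == nil) // true  -> Provision defaults to ["a[href]"]
	fmt.Println(empty.Follow == nil)   // false -> len(*empty.Follow) == 0, auto-follow stays off
}
```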