diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-10-05 14:53:37 +0200 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-10-05 14:53:37 +0200 |
| commit | 1fc497fbdc79a43c62ac2e8eaf4827752dbeef8e (patch) | |
| tree | 67738e213ef97f249bdfa0f1bddda0839192cb77 /modules/followlinks | |
| parent | bd9e7f7acfd855d4685aa4544169c0e29cdbf205 (diff) | |
Refactor codebase into modules
Diffstat (limited to 'modules/followlinks')
| -rw-r--r-- | modules/followlinks/followlinks.go | 60 | ||||
| -rw-r--r-- | modules/followlinks/followlinks_test.go | 26 |
2 files changed, 74 insertions, 12 deletions
diff --git a/modules/followlinks/followlinks.go b/modules/followlinks/followlinks.go index 99d6cee..c53f167 100644 --- a/modules/followlinks/followlinks.go +++ b/modules/followlinks/followlinks.go @@ -5,19 +5,71 @@ package followlinks import ( + "net/url" + "strings" + + "github.com/PuerkitoBio/goquery" "github.com/philippta/flyscrape" ) func init() { - flyscrape.RegisterModule(new(Module)) + flyscrape.RegisterModule(Module{}) } type Module struct{} -func (m *Module) OnResponse(resp *flyscrape.Response) { - for _, link := range flyscrape.ParseLinks(string(resp.Body), resp.Request.URL) { +func (Module) ModuleInfo() flyscrape.ModuleInfo { + return flyscrape.ModuleInfo{ + ID: "followlinks", + New: func() flyscrape.Module { return new(Module) }, + } +} + +func (m *Module) ReceiveResponse(resp *flyscrape.Response) { + for _, link := range parseLinks(string(resp.Body), resp.Request.URL) { resp.Visit(link) } } -var _ flyscrape.OnResponse = (*Module)(nil) +func parseLinks(html string, origin string) []string { + var links []string + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + return nil + } + + originurl, err := url.Parse(origin) + if err != nil { + return nil + } + + uniqueLinks := make(map[string]bool) + doc.Find("a").Each(func(i int, s *goquery.Selection) { + link, _ := s.Attr("href") + + parsedLink, err := originurl.Parse(link) + + if err != nil || !isValidLink(parsedLink) { + return + } + + absLink := parsedLink.String() + + if !uniqueLinks[absLink] { + links = append(links, absLink) + uniqueLinks[absLink] = true + } + }) + + return links +} + +func isValidLink(link *url.URL) bool { + if link.Scheme != "" && link.Scheme != "http" && link.Scheme != "https" { + return false + } + + return true +} + +var _ flyscrape.ResponseReceiver = (*Module)(nil) diff --git a/modules/followlinks/followlinks_test.go b/modules/followlinks/followlinks_test.go index 18c8ceb..0a628c3 100644 --- a/modules/followlinks/followlinks_test.go +++ b/modules/followlinks/followlinks_test.go @@ -5,27 +5,37 @@ package followlinks_test import ( + "net/http" + "sync" "testing" "github.com/philippta/flyscrape" "github.com/philippta/flyscrape/modules/followlinks" + "github.com/philippta/flyscrape/modules/hook" "github.com/philippta/flyscrape/modules/starturl" "github.com/stretchr/testify/require" ) func TestFollowLinks(t *testing.T) { + var urls []string + var mu sync.Mutex + scraper := flyscrape.NewScraper() scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"}) scraper.LoadModule(&followlinks.Module{}) - scraper.SetTransport(flyscrape.MockTransport(200, ` - <a href="/baz">Baz</a> - <a href="baz">Baz</a> - <a href="http://www.google.com">Google</a>`)) - - var urls []string - scraper.OnRequest(func(req *flyscrape.Request) { - urls = append(urls, req.URL) + scraper.LoadModule(hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` + <a href="/baz">Baz</a> + <a href="baz">Baz</a> + <a href="http://www.google.com">Google</a>`) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, }) scraper.Run() |