summaryrefslogtreecommitdiff
path: root/modules/followlinks/followlinks.go
diff options
context:
space:
mode:
authorPhilipp Tanlak <philipp.tanlak@gmail.com>2023-10-05 14:53:37 +0200
committerPhilipp Tanlak <philipp.tanlak@gmail.com>2023-10-05 14:53:37 +0200
commit1fc497fbdc79a43c62ac2e8eaf4827752dbeef8e (patch)
tree67738e213ef97f249bdfa0f1bddda0839192cb77 /modules/followlinks/followlinks.go
parentbd9e7f7acfd855d4685aa4544169c0e29cdbf205 (diff)
Refactor codebase into modules
Diffstat (limited to 'modules/followlinks/followlinks.go')
-rw-r--r--modules/followlinks/followlinks.go60
1 files changed, 56 insertions, 4 deletions
diff --git a/modules/followlinks/followlinks.go b/modules/followlinks/followlinks.go
index 99d6cee..c53f167 100644
--- a/modules/followlinks/followlinks.go
+++ b/modules/followlinks/followlinks.go
@@ -5,19 +5,71 @@
package followlinks
import (
+ "net/url"
+ "strings"
+
+ "github.com/PuerkitoBio/goquery"
"github.com/philippta/flyscrape"
)
func init() {
- flyscrape.RegisterModule(new(Module))
+ flyscrape.RegisterModule(Module{})
}
type Module struct{}
-func (m *Module) OnResponse(resp *flyscrape.Response) {
- for _, link := range flyscrape.ParseLinks(string(resp.Body), resp.Request.URL) {
+func (Module) ModuleInfo() flyscrape.ModuleInfo {
+ return flyscrape.ModuleInfo{
+ ID: "followlinks",
+ New: func() flyscrape.Module { return new(Module) },
+ }
+}
+
+func (m *Module) ReceiveResponse(resp *flyscrape.Response) {
+ for _, link := range parseLinks(string(resp.Body), resp.Request.URL) {
resp.Visit(link)
}
}
-var _ flyscrape.OnResponse = (*Module)(nil)
+func parseLinks(html string, origin string) []string {
+ var links []string
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+ if err != nil {
+ return nil
+ }
+
+ originurl, err := url.Parse(origin)
+ if err != nil {
+ return nil
+ }
+
+ uniqueLinks := make(map[string]bool)
+ doc.Find("a").Each(func(i int, s *goquery.Selection) {
+ link, _ := s.Attr("href")
+
+ parsedLink, err := originurl.Parse(link)
+
+ if err != nil || !isValidLink(parsedLink) {
+ return
+ }
+
+ absLink := parsedLink.String()
+
+ if !uniqueLinks[absLink] {
+ links = append(links, absLink)
+ uniqueLinks[absLink] = true
+ }
+ })
+
+ return links
+}
+
+func isValidLink(link *url.URL) bool {
+ if link.Scheme != "" && link.Scheme != "http" && link.Scheme != "https" {
+ return false
+ }
+
+ return true
+}
+
+var _ flyscrape.ResponseReceiver = (*Module)(nil)