| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-11 18:31:20 +0200 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-11 18:31:20 +0200 |
| commit | 062b36fe5725d1267c66db2e506b4131d78ce772 (patch) | |
| tree | 998e5260feb1babac8dae512b56d67d8f20f7266 /scrape.go | |
| parent | 7e4cf39a0ba6ccbd5cc036700a8b1ff9358ecc3d (diff) | |
simplify project structure
Diffstat (limited to 'scrape.go')
| Mode | File | Lines |
|---|---|---|
| -rw-r--r-- | scrape.go | 177 |
1 file changed, 177 insertions, 0 deletions
diff --git a/scrape.go b/scrape.go
new file mode 100644
index 0000000..3706510
--- /dev/null
+++ b/scrape.go
@@ -0,0 +1,177 @@
+package flyscrape
+
+import (
+	"log"
+	"strings"
+	"sync"
+
+	"github.com/PuerkitoBio/goquery"
+	"github.com/cornelk/hashmap"
+	"github.com/nlnwa/whatwg-url/url"
+)
+
+type ScrapeParams struct {
+	HTML string
+	URL  string
+}
+
+type ScrapeOptions struct {
+	URL            string   `json:"url"`
+	AllowedDomains []string `json:"allowed_domains"`
+	Depth          int      `json:"depth"`
+}
+
+type ScrapeResult struct {
+	URL   string   `json:"url"`
+	Data  any      `json:"data,omitempty"`
+	Links []string `json:"-"`
+	Error error    `json:"error,omitempty"`
+}
+
+type (
+	ScrapeFunc func(ScrapeParams) (any, error)
+	FetchFunc  func(url string) (string, error)
+)
+
+type Scraper struct {
+	ScrapeOptions ScrapeOptions
+	ScrapeFunc    ScrapeFunc
+	FetchFunc     FetchFunc
+	Concurrency   int
+
+	visited *hashmap.Map[string, struct{}]
+	wg      *sync.WaitGroup
+}
+
+type target struct {
+	url   string
+	depth int
+}
+
+type result struct {
+	url   string
+	data  any
+	links []string
+	err   error
+}
+
+func (s *Scraper) Scrape() <-chan ScrapeResult {
+	if s.Concurrency == 0 {
+		s.Concurrency = 1
+	}
+
+	jobs := make(chan target, 1024)
+	results := make(chan result)
+	scraperesults := make(chan ScrapeResult)
+	s.visited = hashmap.New[string, struct{}]()
+	s.wg = &sync.WaitGroup{}
+
+	for i := 0; i < s.Concurrency; i++ {
+		go s.worker(i, jobs, results)
+	}
+
+	s.wg.Add(1)
+	jobs <- target{url: s.ScrapeOptions.URL, depth: s.ScrapeOptions.Depth}
+
+	go func() {
+		s.wg.Wait()
+		close(jobs)
+		close(results)
+	}()
+
+	go func() {
+		for res := range results {
+			scraperesults <- ScrapeResult{
+				URL:   res.url,
+				Data:  res.data,
+				Links: res.links,
+				Error: res.err,
+			}
+		}
+		close(scraperesults)
+	}()
+
+	return scraperesults
+}
+
+func (s *Scraper) worker(id int, jobs chan target, results chan<- result) {
+	for j := range jobs {
+		res := s.process(j)
+
+		if j.depth > 0 {
+			for _, l := range res.links {
+				if _, ok := s.visited.Get(l); ok {
+					continue
+				}
+
+				s.wg.Add(1)
+				select {
+				case jobs <- target{url: l, depth: j.depth - 1}:
+					s.visited.Set(l, struct{}{})
+				default:
+					log.Println("queue is full, can't add url:", l)
+					s.wg.Done()
+				}
+			}
+		}
+
+		results <- res
+		s.wg.Done()
+	}
+}
+
+func (s *Scraper) process(job target) result {
+	html, err := s.FetchFunc(job.url)
+	if err != nil {
+		return result{url: job.url, err: err}
+	}
+
+	links := Links(html, job.url)
+	data, err := s.ScrapeFunc(ScrapeParams{HTML: html, URL: job.url})
+	if err != nil {
+		return result{url: job.url, links: links, err: err}
+	}
+
+	return result{url: job.url, data: data, links: links}
+}
+
+func Links(html string, origin string) []string {
+	var links []string
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+	if err != nil {
+		return nil
+	}
+
+	urlParser := url.NewParser(url.WithPercentEncodeSinglePercentSign())
+
+	uniqueLinks := make(map[string]bool)
+	doc.Find("a").Each(func(i int, s *goquery.Selection) {
+		link, _ := s.Attr("href")
+
+		parsedLink, err := urlParser.ParseRef(origin, link)
+		if err != nil || !isValidLink(parsedLink) {
+			return
+		}
+
+		absLink := parsedLink.Href(true)
+
+		if !uniqueLinks[absLink] {
+			links = append(links, absLink)
+			uniqueLinks[absLink] = true
+		}
+	})
+
+	return links
+}
+
+func isValidLink(link *url.Url) bool {
+	if link.Scheme() != "" && link.Scheme() != "http" && link.Scheme() != "https" {
+		return false
+	}
+
+	if strings.HasPrefix(link.String(), "javascript:") {
+		return false
+	}
+
+	return true
+}
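The added file exposes a small public surface: callers supply a `FetchFunc` to download a page and a `ScrapeFunc` to extract data from it, then consume the channel returned by `Scrape()`. The sketch below is a hypothetical usage example, not part of this commit: the module import path, the `net/http`-based fetcher, and the title-extracting scrape function are all assumptions made for illustration.

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"

	"github.com/PuerkitoBio/goquery"

	// Import path assumed for illustration; adjust to the actual module path.
	flyscrape "github.com/philippta/flyscrape"
)

func main() {
	s := &flyscrape.Scraper{
		ScrapeOptions: flyscrape.ScrapeOptions{
			URL:   "https://example.com", // hypothetical start URL
			Depth: 1,                     // follow links one level deep
		},
		Concurrency: 4,
		// FetchFunc downloads a page and returns its HTML as a string.
		FetchFunc: func(url string) (string, error) {
			resp, err := http.Get(url)
			if err != nil {
				return "", err
			}
			defer resp.Body.Close()
			body, err := io.ReadAll(resp.Body)
			return string(body), err
		},
		// ScrapeFunc extracts data from a fetched page; here, just the <title>.
		ScrapeFunc: func(p flyscrape.ScrapeParams) (any, error) {
			doc, err := goquery.NewDocumentFromReader(strings.NewReader(p.HTML))
			if err != nil {
				return nil, err
			}
			return map[string]string{"title": doc.Find("title").Text()}, nil
		},
	}

	// Scrape returns a channel that is closed once all queued pages are processed.
	for res := range s.Scrape() {
		if res.Error != nil {
			fmt.Println("error:", res.URL, res.Error)
			continue
		}
		fmt.Println(res.URL, res.Data)
	}
}
```

Because results are streamed over a channel, the caller can print or persist each `ScrapeResult` as it arrives instead of waiting for the whole crawl to finish.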