Diffstat (limited to 'scrape.go')
 scrape.go | 177 ++++++++++
 1 file changed, 177 insertions(+), 0 deletions(-)
diff --git a/scrape.go b/scrape.go
new file mode 100644
index 0000000..3706510
--- /dev/null
+++ b/scrape.go
@@ -0,0 +1,177 @@
+package flyscrape
+
+import (
+	"log"
+	"strings"
+	"sync"
+
+	"github.com/PuerkitoBio/goquery"
+	"github.com/cornelk/hashmap"
+	"github.com/nlnwa/whatwg-url/url"
+)
+
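+// ScrapeParams holds the raw HTML of a fetched page and the URL it was fetched from.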
+type ScrapeParams struct {
+	HTML string
+	URL  string
+}
+
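+// ScrapeOptions configures a scrape run: the start URL, the allowed domains and the link-following depth.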
+type ScrapeOptions struct {
+	URL            string   `json:"url"`
+	AllowedDomains []string `json:"allowed_domains"`
+	Depth          int      `json:"depth"`
+}
+
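+// ScrapeResult is what a single scraped page yields: its URL, the extracted data, the outgoing links and any error.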
+type ScrapeResult struct {
+	URL   string   `json:"url"`
+	Data  any      `json:"data,omitempty"`
+	Links []string `json:"-"`
+	Error error    `json:"error,omitempty"`
+}
+
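+// ScrapeFunc extracts data from a fetched page; FetchFunc retrieves the raw HTML for a URL.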
+type (
+	ScrapeFunc func(ScrapeParams) (any, error)
+	FetchFunc  func(url string) (string, error)
+)
+
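+// Scraper crawls pages starting from ScrapeOptions.URL, fetching them with FetchFunc and
+// extracting data with ScrapeFunc, using Concurrency worker goroutines.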
+type Scraper struct {
+	ScrapeOptions ScrapeOptions
+	ScrapeFunc    ScrapeFunc
+	FetchFunc     FetchFunc
+	Concurrency   int
+
+	visited *hashmap.Map[string, struct{}]
+	wg      *sync.WaitGroup
+}
+
+type target struct {
+	url   string
+	depth int
+}
+
+type result struct {
+	url   string
+	data  any
+	links []string
+	err   error
+}
+
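+// Scrape starts the worker goroutines, seeds the job queue with the start URL and returns a
+// channel of results that is closed once the crawl has finished.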
+func (s *Scraper) Scrape() <-chan ScrapeResult {
+	if s.Concurrency == 0 {
+		s.Concurrency = 1
+	}
+
+	jobs := make(chan target, 1024)
+	results := make(chan result)
+	scraperesults := make(chan ScrapeResult)
+	s.visited = hashmap.New[string, struct{}]()
+	s.wg = &sync.WaitGroup{}
+
+	for i := 0; i < s.Concurrency; i++ {
+		go s.worker(i, jobs, results)
+	}
+
+	s.wg.Add(1)
+	jobs <- target{url: s.ScrapeOptions.URL, depth: s.ScrapeOptions.Depth}
+
+	go func() {
+		s.wg.Wait()
+		close(jobs)
+		close(results)
+	}()
+
+	go func() {
+		for res := range results {
+			scraperesults <- ScrapeResult{
+				URL:   res.url,
+				Data:  res.data,
+				Links: res.links,
+				Error: res.err,
+			}
+		}
+		close(scraperesults)
+	}()
+
+	return scraperesults
+}
+
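+// worker consumes jobs, processes each page and, while depth remains, queues newly
+// discovered links that have not been visited yet.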
+func (s *Scraper) worker(id int, jobs chan target, results chan<- result) {
+	for j := range jobs {
+		res := s.process(j)
+
+		if j.depth > 0 {
+			for _, l := range res.links {
+				if _, ok := s.visited.Get(l); ok {
+					continue
+				}
+
+				s.wg.Add(1)
+				select {
+				case jobs <- target{url: l, depth: j.depth - 1}:
+					s.visited.Set(l, struct{}{})
+				default:
+					log.Println("queue is full, can't add url:", l)
+					s.wg.Done()
+				}
+			}
+		}
+
+		results <- res
+		s.wg.Done()
+	}
+}
+
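+// process fetches a page, extracts its links and runs the user-supplied ScrapeFunc on it.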
+func (s *Scraper) process(job target) result {
+	html, err := s.FetchFunc(job.url)
+	if err != nil {
+		return result{url: job.url, err: err}
+	}
+
+	links := Links(html, job.url)
+	data, err := s.ScrapeFunc(ScrapeParams{HTML: html, URL: job.url})
+	if err != nil {
+		return result{url: job.url, links: links, err: err}
+	}
+
+	return result{url: job.url, data: data, links: links}
+}
+
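+// Links parses the given HTML and returns the unique absolute URLs of all anchor tags,
+// resolved against origin.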
+func Links(html string, origin string) []string {
+	var links []string
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+	if err != nil {
+		return nil
+	}
+
+	urlParser := url.NewParser(url.WithPercentEncodeSinglePercentSign())
+
+	uniqueLinks := make(map[string]bool)
+	doc.Find("a").Each(func(i int, s *goquery.Selection) {
+		link, _ := s.Attr("href")
+
+		parsedLink, err := urlParser.ParseRef(origin, link)
+		if err != nil || !isValidLink(parsedLink) {
+			return
+		}
+
+		absLink := parsedLink.Href(true)
+
+		if !uniqueLinks[absLink] {
+			links = append(links, absLink)
+			uniqueLinks[absLink] = true
+		}
+	})
+
+	return links
+}
+
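+// isValidLink reports whether a parsed URL uses http(s) (or no scheme) and is not a javascript: link.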
+func isValidLink(link *url.Url) bool {
+	if link.Scheme() != "" && link.Scheme() != "http" && link.Scheme() != "https" {
+		return false
+	}
+
+	if strings.HasPrefix(link.String(), "javascript:") {
+		return false
+	}
+
+	return true
+}
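
For context, a minimal sketch of how the Scraper added above might be driven. The module import path, the net/http-based FetchFunc and the goquery-based ScrapeFunc are illustrative assumptions and are not part of this diff:

package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/philippta/flyscrape" // import path assumed; not shown in this diff
)

func main() {
	s := &flyscrape.Scraper{
		ScrapeOptions: flyscrape.ScrapeOptions{URL: "https://example.com", Depth: 1},
		Concurrency:   4,
		// FetchFunc: plain HTTP GET returning the response body as a string.
		FetchFunc: func(url string) (string, error) {
			resp, err := http.Get(url)
			if err != nil {
				return "", err
			}
			defer resp.Body.Close()
			body, err := io.ReadAll(resp.Body)
			return string(body), err
		},
		// ScrapeFunc: extract the page title from the fetched HTML.
		ScrapeFunc: func(p flyscrape.ScrapeParams) (any, error) {
			doc, err := goquery.NewDocumentFromReader(strings.NewReader(p.HTML))
			if err != nil {
				return nil, err
			}
			return map[string]any{"title": doc.Find("title").Text()}, nil
		},
	}

	// Scrape returns a channel that is closed once the crawl has finished.
	for res := range s.Scrape() {
		fmt.Println(res.URL, res.Data, res.Error)
	}
}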