From c36bb2ca2a82338a822c6962f3373809b4bed814 Mon Sep 17 00:00:00 2001
From: Philipp Tanlak
Date: Wed, 16 Aug 2023 19:58:10 +0200
Subject: add allowed domains feature

---
 scrape.go | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/scrape.go b/scrape.go
index 0c31518..3ecfe7b 100644
--- a/scrape.go
+++ b/scrape.go
@@ -17,7 +17,7 @@ type ScrapeParams struct {
 
 type ScrapeOptions struct {
 	URL            string   `json:"url"`
-	AllowedDomains []string `json:"allowed_domains"`
+	AllowedDomains []string `json:"allowedDomains"`
 	Depth          int      `json:"depth"`
 }
 
@@ -62,6 +62,12 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
 	if s.FetchFunc == nil {
 		s.FetchFunc = Fetch()
 	}
+	if len(s.ScrapeOptions.AllowedDomains) == 0 {
+		u, err := url.Parse(s.ScrapeOptions.URL)
+		if err == nil {
+			s.ScrapeOptions.AllowedDomains = []string{u.Host}
+		}
+	}
 
 	jobs := make(chan target, 1024)
 	results := make(chan result)
@@ -107,6 +113,10 @@ func (s *Scraper) worker(id int, jobs chan target, results chan<- result) {
 				continue
 			}
 
+			if !s.isURLAllowed(l) {
+				continue
+			}
+
 			s.wg.Add(1)
 			select {
 			case jobs <- target{url: l, depth: j.depth - 1}:
@@ -138,6 +148,26 @@ func (s *Scraper) process(job target) result {
 	return result{url: job.url, data: data, links: links}
 }
 
+func (s *Scraper) isURLAllowed(rawurl string) bool {
+	u, err := url.Parse(rawurl)
+	if err != nil {
+		return false
+	}
+
+	host := u.Host
+
+	for _, domain := range s.ScrapeOptions.AllowedDomains {
+		if domain == "*" {
+			return true
+		}
+		if host == domain {
+			return true
+		}
+	}
+
+	return false
+}
+
 func Links(html string, origin string) []string {
 	var links []string
 	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
-- 
cgit v1.2.3
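
For reference, a minimal usage sketch of the new option. Only ScrapeOptions, its fields, and Scrape() are visible in this patch; the import path, package name, and the Scraper literal shape below are assumptions. Per the added hunks, leaving AllowedDomains empty defaults the filter to the seed URL's host, and a "*" entry allows every host.

    package main

    import (
    	"fmt"

    	scrape "example.com/scrape" // assumed import path, not part of the patch
    )

    func main() {
    	s := &scrape.Scraper{
    		ScrapeOptions: scrape.ScrapeOptions{
    			URL:            "https://example.com",
    			AllowedDomains: []string{"example.com"}, // omit to default to the seed host; "*" allows all hosts
    			Depth:          2,
    		},
    	}

    	// Scrape streams results; links whose host is not in
    	// AllowedDomains are skipped by the workers.
    	for res := range s.Scrape() {
    		fmt.Println(res)
    	}
    }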