From c36bb2ca2a82338a822c6962f3373809b4bed814 Mon Sep 17 00:00:00 2001
From: Philipp Tanlak
Date: Wed, 16 Aug 2023 19:58:10 +0200
Subject: add allowed domains feature

---
 scrape.go | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/scrape.go b/scrape.go
index 0c31518..3ecfe7b 100644
--- a/scrape.go
+++ b/scrape.go
@@ -17,7 +17,7 @@ type ScrapeParams struct {
 
 type ScrapeOptions struct {
 	URL            string   `json:"url"`
-	AllowedDomains []string `json:"allowed_domains"`
+	AllowedDomains []string `json:"allowedDomains"`
 	Depth          int      `json:"depth"`
 }
 
@@ -62,6 +62,12 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
 	if s.FetchFunc == nil {
 		s.FetchFunc = Fetch()
 	}
+	if len(s.ScrapeOptions.AllowedDomains) == 0 {
+		u, err := url.Parse(s.ScrapeOptions.URL)
+		if err == nil {
+			s.ScrapeOptions.AllowedDomains = []string{u.Host}
+		}
+	}
 
 	jobs := make(chan target, 1024)
 	results := make(chan result)
@@ -107,6 +113,10 @@ func (s *Scraper) worker(id int, jobs chan target, results chan<- result) {
 				continue
 			}
 
+			if !s.isURLAllowed(l) {
+				continue
+			}
+
 			s.wg.Add(1)
 			select {
 			case jobs <- target{url: l, depth: j.depth - 1}:
@@ -138,6 +148,26 @@ func (s *Scraper) process(job target) result {
 	return result{url: job.url, data: data, links: links}
 }
 
+func (s *Scraper) isURLAllowed(rawurl string) bool {
+	u, err := url.Parse(rawurl)
+	if err != nil {
+		return false
+	}
+
+	host := u.Host
+
+	for _, domain := range s.ScrapeOptions.AllowedDomains {
+		if domain == "*" {
+			return true
+		}
+		if host == domain {
+			return true
+		}
+	}
+
+	return false
+}
+
 func Links(html string, origin string) []string {
 	var links []string
 	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
-- 
cgit v1.2.3
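
For reference, a minimal usage sketch of the new option. Only ScrapeOptions, its fields, and Scrape() are visible in this patch; the import path, package name, and the Scraper literal shape below are assumptions. Per the added hunks, leaving AllowedDomains empty defaults the filter to the seed URL's host, and a "*" entry allows every host.

    package main

    import (
    	"fmt"

    	scrape "example.com/scrape" // assumed import path, not part of the patch
    )

    func main() {
    	s := &scrape.Scraper{
    		ScrapeOptions: scrape.ScrapeOptions{
    			URL:            "https://example.com",
    			AllowedDomains: []string{"example.com"}, // omit to default to the seed host; "*" allows all hosts
    			Depth:          2,
    		},
    	}

    	// Scrape streams results; links whose host is not in
    	// AllowedDomains are skipped by the workers.
    	for res := range s.Scrape() {
    		fmt.Println(res)
    	}
    }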