From 5c16435e2218344a6e232ebb48cf022a32ba85d5 Mon Sep 17 00:00:00 2001
From: Philipp Tanlak
Date: Sun, 27 Aug 2023 19:10:49 +0200
Subject: add tests and allow urls

---
 scrape.go | 63 ++++++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 46 insertions(+), 17 deletions(-)

(limited to 'scrape.go')

diff --git a/scrape.go b/scrape.go
index f245137..1f9ad97 100644
--- a/scrape.go
+++ b/scrape.go
@@ -6,6 +6,7 @@ package flyscrape
 
 import (
 	"log"
+	"regexp"
 	"strings"
 	"sync"
 	"time"
@@ -21,11 +22,13 @@
 }
 
 type ScrapeOptions struct {
-	URL            string   `json:"url"`
-	AllowedDomains []string `json:"allowedDomains"`
-	BlockedDomains []string `json:"blockedDomains"`
-	Depth          int      `json:"depth"`
-	Rate           float64  `json:"rate"`
+	URL          string   `json:"url"`
+	AllowDomains []string `json:"allowDomains"`
+	DenyDomains  []string `json:"denyDomains"`
+	AllowURLs    []string `json:"allowURLs"`
+	Proxy        string   `json:"proxy"`
+	Depth        int      `json:"depth"`
+	Rate         float64  `json:"rate"`
 }
 
 type ScrapeResult struct {
@@ -54,10 +57,11 @@ type Scraper struct {
 	ScrapeFunc ScrapeFunc
 	FetchFunc  FetchFunc
 
-	visited *hashmap.Map[string, struct{}]
-	wg      *sync.WaitGroup
-	jobs    chan target
-	results chan ScrapeResult
+	visited     *hashmap.Map[string, struct{}]
+	wg          *sync.WaitGroup
+	jobs        chan target
+	results     chan ScrapeResult
+	allowURLsRE []*regexp.Regexp
 }
 
 func (s *Scraper) init() {
@@ -69,16 +73,24 @@
 	if s.FetchFunc == nil {
 		s.FetchFunc = Fetch()
 	}
+	if s.ScrapeOptions.Proxy != "" {
+		s.FetchFunc = ProxiedFetch(s.ScrapeOptions.Proxy)
+	}
 	if s.ScrapeOptions.Rate == 0 {
 		s.ScrapeOptions.Rate = 100
 	}
-	if len(s.ScrapeOptions.AllowedDomains) == 0 {
-		u, err := url.Parse(s.ScrapeOptions.URL)
-		if err == nil {
-			s.ScrapeOptions.AllowedDomains = []string{u.Host()}
+	if u, err := url.Parse(s.ScrapeOptions.URL); err == nil {
+		s.ScrapeOptions.AllowDomains = append(s.ScrapeOptions.AllowDomains, u.Host())
+	}
+
+	for _, pat := range s.ScrapeOptions.AllowURLs {
+		re, err := regexp.Compile(pat)
+		if err != nil {
+			continue
 		}
+		s.allowURLsRE = append(s.allowURLsRE, re)
 	}
 }
@@ -116,7 +128,8 @@ func (s *Scraper) worker() {
 			continue
 		}
 
-		if !s.isURLAllowed(l) {
+		allowed := s.isDomainAllowed(l) && s.isURLAllowed(l)
+		if !allowed {
 			continue
 		}
 
@@ -157,7 +170,7 @@ func (s *Scraper) enqueueJob(url string, depth int) {
 	}
 }
 
-func (s *Scraper) isURLAllowed(rawurl string) bool {
+func (s *Scraper) isDomainAllowed(rawurl string) bool {
 	u, err := url.Parse(rawurl)
 	if err != nil {
 		return false
@@ -166,14 +179,14 @@
 	host := u.Host()
 
 	ok := false
-	for _, domain := range s.ScrapeOptions.AllowedDomains {
+	for _, domain := range s.ScrapeOptions.AllowDomains {
 		if domain == "*" || host == domain {
 			ok = true
 			break
 		}
 	}
 
-	for _, domain := range s.ScrapeOptions.BlockedDomains {
+	for _, domain := range s.ScrapeOptions.DenyDomains {
 		if host == domain {
 			ok = false
 			break
 		}
 	}
 
 	return ok
 }
@@ -183,6 +196,22 @@
 
+func (s *Scraper) isURLAllowed(rawurl string) bool {
+	if len(s.allowURLsRE) == 0 {
+		return true
+	}
+
+	ok := false
+
+	for _, re := range s.allowURLsRE {
+		if re.MatchString(rawurl) {
+			ok = true
+		}
+	}
+
+	return ok
+}
+
 func (s *Scraper) waitClose() {
 	s.wg.Wait()
 	close(s.jobs)
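
The new ScrapeOptions fields carry JSON tags, so a configuration document can populate them directly. Below is a hypothetical config exercising every new field; the struct is copied from this patch, but how flyscrape actually loads its configuration is not part of this diff, so the unmarshaling harness is only a sketch.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// ScrapeOptions is copied from the patch so the JSON tags can be exercised
// standalone; in flyscrape itself this type lives in scrape.go.
type ScrapeOptions struct {
	URL          string   `json:"url"`
	AllowDomains []string `json:"allowDomains"`
	DenyDomains  []string `json:"denyDomains"`
	AllowURLs    []string `json:"allowURLs"`
	Proxy        string   `json:"proxy"`
	Depth        int      `json:"depth"`
	Rate         float64  `json:"rate"`
}

func main() {
	// Hypothetical config values; only the field names and tags come from
	// the patch.
	cfg := []byte(`{
		"url": "https://example.com",
		"allowDomains": ["example.com"],
		"denyDomains": ["ads.example.com"],
		"allowURLs": ["^https://example\\.com/blog/"],
		"proxy": "http://localhost:8080",
		"depth": 2,
		"rate": 100
	}`)

	var opts ScrapeOptions
	if err := json.Unmarshal(cfg, &opts); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", opts)
}
```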
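
ProxiedFetch is referenced here but its body is not part of this diff; only the call site fixes its shape (it takes the proxy address and returns something assignable to FetchFunc). A minimal sketch of such a helper in stdlib Go, assuming a FetchFunc signature of func(url string) (string, error):

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
)

// proxiedFetch is a sketch, not the patch's implementation. It builds an
// http.Client whose transport routes requests through the given proxy and
// returns a fetch function over it.
func proxiedFetch(proxy string) func(string) (string, error) {
	client := &http.Client{}
	if proxyURL, err := url.Parse(proxy); err == nil {
		client.Transport = &http.Transport{Proxy: http.ProxyURL(proxyURL)}
	} // on a bad proxy address this sketch falls back to no proxy;
	// the real code's error handling is not shown in the diff.

	return func(rawurl string) (string, error) {
		resp, err := client.Get(rawurl)
		if err != nil {
			return "", err
		}
		defer resp.Body.Close()
		b, err := io.ReadAll(resp.Body)
		return string(b), err
	}
}

func main() {
	fetch := proxiedFetch("http://localhost:8080")
	html, err := fetch("https://example.com")
	fmt.Println(len(html), err)
}
```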
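
Taken together, worker now admits a link only when both checks pass: the host must survive the allow/deny domain lists (deny wins over allow), and, if any allowURLs patterns compiled, the full URL must match at least one of them. An empty pattern list imposes no restriction, and patterns that fail to compile are silently skipped at init. A standalone sketch of those semantics using only the standard library (note that stdlib url.URL exposes Host as a field, whereas this repository's own url package has a Host() method):

```go
package main

import (
	"fmt"
	"net/url"
	"regexp"
)

// filter mirrors the two-stage check from the patch; the type and field
// names here are local to the example.
type filter struct {
	allowDomains []string
	denyDomains  []string
	allowURLsRE  []*regexp.Regexp
}

func (f filter) allowed(rawurl string) bool {
	// Unparseable URLs are rejected, as in isDomainAllowed.
	u, err := url.Parse(rawurl)
	if err != nil {
		return false
	}

	// Domain stage: "*" allows any host; deny entries override.
	host := u.Host
	ok := false
	for _, d := range f.allowDomains {
		if d == "*" || host == d {
			ok = true
			break
		}
	}
	for _, d := range f.denyDomains {
		if host == d {
			ok = false
			break
		}
	}
	if !ok {
		return false
	}

	// URL stage, as in isURLAllowed: no patterns means no restriction.
	if len(f.allowURLsRE) == 0 {
		return true
	}
	for _, re := range f.allowURLsRE {
		if re.MatchString(rawurl) {
			return true
		}
	}
	return false
}

func main() {
	f := filter{
		allowDomains: []string{"example.com"},
		denyDomains:  []string{"ads.example.com"},
		allowURLsRE:  []*regexp.Regexp{regexp.MustCompile(`^https://example\.com/blog/`)},
	}
	fmt.Println(f.allowed("https://example.com/blog/post-1")) // true
	fmt.Println(f.allowed("https://example.com/shop"))        // false: no pattern match
	fmt.Println(f.allowed("https://other.com/blog/"))         // false: domain not allowed
}
```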