author    Philipp Tanlak <philipp.tanlak@gmail.com>  2023-08-27 19:25:04 +0200
committer Philipp Tanlak <philipp.tanlak@gmail.com>  2023-08-27 19:25:04 +0200
commit    b2881a3bc04063ba520fa9795fd459d32a1b8874 (patch)
tree      c49a55a869de978f0702c07ff7df9ae3bc080026 /scrape.go
parent    5c16435e2218344a6e232ebb48cf022a32ba85d5 (diff)
rename allow and block fields
Diffstat (limited to 'scrape.go')
-rw-r--r--  scrape.go | 65
1 file changed, 46 insertions(+), 19 deletions(-)
diff --git a/scrape.go b/scrape.go
index 1f9ad97..8b6ce11 100644
--- a/scrape.go
+++ b/scrape.go
@@ -22,13 +22,14 @@ type ScrapeParams struct {
 }
 
 type ScrapeOptions struct {
-	URL          string   `json:"url"`
-	AllowDomains []string `json:"allowDomains"`
-	DenyDomains  []string `json:"denyDomains"`
-	AllowURLs    []string `json:"allowURLs"`
-	Proxy        string   `json:"proxy"`
-	Depth        int      `json:"depth"`
-	Rate         float64  `json:"rate"`
+	URL            string   `json:"url"`
+	AllowedDomains []string `json:"allowedDomains"`
+	BlockedDomains []string `json:"blockedDomains"`
+	AllowedURLs    []string `json:"allowedURLs"`
+	BlockedURLs    []string `json:"blockedURLs"`
+	Proxy          string   `json:"proxy"`
+	Depth          int      `json:"depth"`
+	Rate           float64  `json:"rate"`
 }
 
 type ScrapeResult struct {
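
The rename keeps the Go identifiers and the JSON keys in lockstep: allowDomains/denyDomains/allowURLs become allowedDomains/blockedDomains/allowedURLs, and a new blockedURLs option is added. A minimal sketch of the renamed options in use; it assumes the ScrapeOptions type defined in this diff, and all values are illustrative, not taken from the commit:

	// Hypothetical configuration; only URL is mandatory in the init()
	// hunk below, which also appends its host to AllowedDomains.
	opts := ScrapeOptions{
		URL:            "https://example.com",
		AllowedDomains: []string{"example.com", "docs.example.com"},
		BlockedDomains: []string{"ads.example.com"},
		AllowedURLs:    []string{`^https://example\.com/blog/`},
		BlockedURLs:    []string{`\.(png|jpe?g|css|js)$`},
		Depth:          2,
		Rate:           1.0,
	}
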
@@ -57,11 +58,12 @@ type Scraper struct {
 	ScrapeFunc ScrapeFunc
 	FetchFunc  FetchFunc
 
-	visited     *hashmap.Map[string, struct{}]
-	wg          *sync.WaitGroup
-	jobs        chan target
-	results     chan ScrapeResult
-	allowURLsRE []*regexp.Regexp
+	visited       *hashmap.Map[string, struct{}]
+	wg            *sync.WaitGroup
+	jobs          chan target
+	results       chan ScrapeResult
+	allowedURLsRE []*regexp.Regexp
+	blockedURLsRE []*regexp.Regexp
 }
 
 func (s *Scraper) init() {
@@ -82,15 +84,23 @@ func (s *Scraper) init() {
 	}
 
 	if u, err := url.Parse(s.ScrapeOptions.URL); err == nil {
-		s.ScrapeOptions.AllowDomains = append(s.ScrapeOptions.AllowDomains, u.Hostname())
+		s.ScrapeOptions.AllowedDomains = append(s.ScrapeOptions.AllowedDomains, u.Hostname())
 	}
 
-	for _, pat := range s.ScrapeOptions.AllowURLs {
+	for _, pat := range s.ScrapeOptions.AllowedURLs {
 		re, err := regexp.Compile(pat)
 		if err != nil {
 			continue
 		}
-		s.allowURLsRE = append(s.allowURLsRE, re)
+		s.allowedURLsRE = append(s.allowedURLsRE, re)
+	}
+
+	for _, pat := range s.ScrapeOptions.BlockedURLs {
+		re, err := regexp.Compile(pat)
+		if err != nil {
+			continue
+		}
+		s.blockedURLsRE = append(s.blockedURLsRE, re)
 	}
 }
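
Both compile loops share one error policy: a pattern that fails to compile is skipped silently instead of failing init(). A standalone sketch of that compile-and-skip behavior, with a made-up pattern list:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Same loop shape as init() above: the invalid pattern `([`
	// fails regexp.Compile and is dropped; the valid ones are kept.
	patterns := []string{`^https://example\.com/`, `([`, `\.pdf$`}

	var compiled []*regexp.Regexp
	for _, pat := range patterns {
		re, err := regexp.Compile(pat)
		if err != nil {
			continue
		}
		compiled = append(compiled, re)
	}

	fmt.Println(len(compiled)) // prints 2
}
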
@@ -179,14 +189,14 @@ func (s *Scraper) isDomainAllowed(rawurl string) bool {
 	host := u.Hostname()
 
 	ok := false
-	for _, domain := range s.ScrapeOptions.AllowDomains {
+	for _, domain := range s.ScrapeOptions.AllowedDomains {
 		if domain == "*" || host == domain {
 			ok = true
 			break
 		}
 	}
 
-	for _, domain := range s.ScrapeOptions.DenyDomains {
+	for _, domain := range s.ScrapeOptions.BlockedDomains {
 		if host == domain {
 			ok = false
 			break
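
Because the blocked loop runs after the allowed loop, a blocked domain overrides both the "*" wildcard and an explicit allow. A standalone sketch of the same check as a free function; the parse-error handling and the trailing return are inferred, since the hunk cuts off before them:

package main

import (
	"fmt"
	"net/url"
)

// domainAllowed mirrors isDomainAllowed above: allow first
// (with a "*" wildcard), then let any block entry win.
func domainAllowed(rawurl string, allowed, blocked []string) bool {
	u, err := url.Parse(rawurl)
	if err != nil {
		return false
	}
	host := u.Hostname()

	ok := false
	for _, domain := range allowed {
		if domain == "*" || host == domain {
			ok = true
			break
		}
	}
	for _, domain := range blocked {
		if host == domain {
			ok = false // block overrides allow
			break
		}
	}
	return ok
}

func main() {
	allowed := []string{"*"}
	blocked := []string{"ads.example.com"}

	fmt.Println(domainAllowed("https://example.com/a", allowed, blocked))     // true
	fmt.Println(domainAllowed("https://ads.example.com/b", allowed, blocked)) // false
}
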
@@ -197,15 +207,32 @@ func (s *Scraper) isDomainAllowed(rawurl string) bool {
 }
 
 func (s *Scraper) isURLAllowed(rawurl string) bool {
-	if len(s.allowURLsRE) == 0 {
+	// allow root url
+	if rawurl == s.ScrapeOptions.URL {
+		return true
+	}
+
+	// allow if no filter is set
+	if len(s.allowedURLsRE) == 0 && len(s.blockedURLsRE) == 0 {
 		return true
 	}
 
 	ok := false
-	for _, re := range s.allowURLsRE {
+	if len(s.allowedURLsRE) == 0 {
+		ok = true
+	}
+
+	for _, re := range s.allowedURLsRE {
 		if re.MatchString(rawurl) {
 			ok = true
+			break
+		}
+	}
+
+	for _, re := range s.blockedURLsRE {
+		if re.MatchString(rawurl) {
+			ok = false
+			break
 		}
 	}
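
Taken together, the new isURLAllowed behaves as follows: the root URL always passes; with no filters at all everything passes; with only a block list, URLs pass unless a blocked pattern matches; with an allow list, a URL must match an allowed pattern; and a blocked match always wins. A standalone sketch of that decision, with the trailing return inferred since the hunk is cut off above:

package main

import (
	"fmt"
	"regexp"
)

// urlAllowed mirrors isURLAllowed above; rootURL and the regexp
// slices stand in for the Scraper's fields.
func urlAllowed(rawurl, rootURL string, allowed, blocked []*regexp.Regexp) bool {
	if rawurl == rootURL {
		return true // the root url is always allowed
	}
	if len(allowed) == 0 && len(blocked) == 0 {
		return true // no filters configured
	}

	ok := len(allowed) == 0 // only a block list: default to allowed
	for _, re := range allowed {
		if re.MatchString(rawurl) {
			ok = true
			break
		}
	}
	for _, re := range blocked {
		if re.MatchString(rawurl) {
			ok = false // a block match overrides any allow match
			break
		}
	}
	return ok
}

func main() {
	allowed := []*regexp.Regexp{regexp.MustCompile(`/blog/`)}
	blocked := []*regexp.Regexp{regexp.MustCompile(`\.pdf$`)}

	fmt.Println(urlAllowed("https://example.com/blog/post", "https://example.com", allowed, blocked))  // true
	fmt.Println(urlAllowed("https://example.com/blog/a.pdf", "https://example.com", allowed, blocked)) // false
	fmt.Println(urlAllowed("https://example.com", "https://example.com", allowed, blocked))            // true (root)
}
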