author    Philipp Tanlak <philipp.tanlak@gmail.com>  2023-08-27 19:10:49 +0200
committer Philipp Tanlak <philipp.tanlak@gmail.com>  2023-08-27 19:10:49 +0200
commit    5c16435e2218344a6e232ebb48cf022a32ba85d5
tree      3cfa1dbc1f489ba4509fc408a8c0afccca7f9c7c /scrape.go
parent    52107c13b4c2c4efa9269b187916f3195be5a10d
add tests and allow urls
Diffstat (limited to 'scrape.go')
-rw-r--r--  scrape.go | 63
1 file changed, 46 insertions(+), 17 deletions(-)
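
As orientation for the hunks below, here is a minimal sketch of how the reworked options surface might be populated after this commit. The struct is a local mirror of ScrapeOptions (field names and JSON tags taken from the diff); every value, including the URLs and patterns, is hypothetical.

package main

import "fmt"

// Local mirror of flyscrape's ScrapeOptions as it looks after this commit.
// For illustration only; the values below are not part of the commit.
type ScrapeOptions struct {
	URL          string   `json:"url"`
	AllowDomains []string `json:"allowDomains"`
	DenyDomains  []string `json:"denyDomains"`
	AllowURLs    []string `json:"allowURLs"`
	Proxy        string   `json:"proxy"`
	Depth        int      `json:"depth"`
	Rate         float64  `json:"rate"`
}

func main() {
	opts := ScrapeOptions{
		URL:          "https://example.com", // its host is appended to AllowDomains by init()
		AllowDomains: []string{"docs.example.com"},
		DenyDomains:  []string{"ads.example.com"},
		AllowURLs:    []string{`/blog/`, `\.html$`}, // compiled with regexp.Compile; invalid patterns are skipped
		Proxy:        "http://localhost:8080", // a non-empty Proxy switches fetching to ProxiedFetch
		Depth:        2,
		Rate:         50,
	}
	fmt.Printf("%+v\n", opts)
}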
diff --git a/scrape.go b/scrape.go
index f245137..1f9ad97 100644
--- a/scrape.go
+++ b/scrape.go
@@ -6,6 +6,7 @@ package flyscrape
import (
"log"
+ "regexp"
"strings"
"sync"
"time"
@@ -21,11 +22,13 @@ type ScrapeParams struct {
}
type ScrapeOptions struct {
- URL            string   `json:"url"`
- AllowedDomains []string `json:"allowedDomains"`
- BlockedDomains []string `json:"blockedDomains"`
- Depth          int      `json:"depth"`
- Rate           float64  `json:"rate"`
+ URL          string   `json:"url"`
+ AllowDomains []string `json:"allowDomains"`
+ DenyDomains  []string `json:"denyDomains"`
+ AllowURLs    []string `json:"allowURLs"`
+ Proxy        string   `json:"proxy"`
+ Depth        int      `json:"depth"`
+ Rate         float64  `json:"rate"`
}
type ScrapeResult struct {
@@ -54,10 +57,11 @@ type Scraper struct {
ScrapeFunc ScrapeFunc
FetchFunc FetchFunc
- visited *hashmap.Map[string, struct{}]
- wg      *sync.WaitGroup
- jobs    chan target
- results chan ScrapeResult
+ visited     *hashmap.Map[string, struct{}]
+ wg          *sync.WaitGroup
+ jobs        chan target
+ results     chan ScrapeResult
+ allowURLsRE []*regexp.Regexp
}
func (s *Scraper) init() {
@@ -69,16 +73,24 @@ func (s *Scraper) init() {
 if s.FetchFunc == nil {
   s.FetchFunc = Fetch()
 }
+ if s.ScrapeOptions.Proxy != "" {
+   s.FetchFunc = ProxiedFetch(s.ScrapeOptions.Proxy)
+ }
 if s.ScrapeOptions.Rate == 0 {
   s.ScrapeOptions.Rate = 100
 }
- if len(s.ScrapeOptions.AllowedDomains) == 0 {
-   u, err := url.Parse(s.ScrapeOptions.URL)
-   if err == nil {
-     s.ScrapeOptions.AllowedDomains = []string{u.Host}
+ if u, err := url.Parse(s.ScrapeOptions.URL); err == nil {
+   s.ScrapeOptions.AllowDomains = append(s.ScrapeOptions.AllowDomains, u.Host)
+ }
+
+ for _, pat := range s.ScrapeOptions.AllowURLs {
+   re, err := regexp.Compile(pat)
+   if err != nil {
+     continue
   }
+   s.allowURLsRE = append(s.allowURLsRE, re)
 }
}
@@ -116,7 +128,8 @@ func (s *Scraper) worker() {
   continue
 }
- if !s.isURLAllowed(l) {
+ allowed := s.isDomainAllowed(l) && s.isURLAllowed(l)
+ if !allowed {
   continue
 }
@@ -157,7 +170,7 @@ func (s *Scraper) enqueueJob(url string, depth int) {
}
}
-func (s *Scraper) isURLAllowed(rawurl string) bool {
+func (s *Scraper) isDomainAllowed(rawurl string) bool {
u, err := url.Parse(rawurl)
if err != nil {
   return false
@@ -166,14 +179,14 @@ func (s *Scraper) isURLAllowed(rawurl string) bool {
 host := u.Host
ok := false
- for _, domain := range s.ScrapeOptions.AllowedDomains {
+ for _, domain := range s.ScrapeOptions.AllowDomains {
   if domain == "*" || host == domain {
     ok = true
     break
   }
 }
- for _, domain := range s.ScrapeOptions.BlockedDomains {
+ for _, domain := range s.ScrapeOptions.DenyDomains {
   if host == domain {
     ok = false
     break
   }
 }
@@ -183,6 +196,22 @@ func (s *Scraper) isURLAllowed(rawurl string) bool {
return ok
}
+func (s *Scraper) isURLAllowed(rawurl string) bool {
+ if len(s.allowURLsRE) == 0 {
+   return true
+ }
+
+ ok := false
+
+ for _, re := range s.allowURLsRE {
+   if re.MatchString(rawurl) {
+     ok = true
+   }
+ }
+
+ return ok
+}
+
func (s *Scraper) waitClose() {
s.wg.Wait()
close(s.jobs)
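
To make the new allow-list semantics concrete, here is a small self-contained sketch of the check isURLAllowed performs: an empty pattern list permits every URL, otherwise at least one compiled pattern must match. The helper name and sample URLs are illustrative, not part of the commit.

package main

import (
	"fmt"
	"regexp"
)

// allowed mirrors the logic of Scraper.isURLAllowed: with no patterns
// everything passes; otherwise a single match is enough.
func allowed(patterns []*regexp.Regexp, rawurl string) bool {
	if len(patterns) == 0 {
		return true
	}
	for _, re := range patterns {
		if re.MatchString(rawurl) {
			return true
		}
	}
	return false
}

func main() {
	// Compile patterns the way init() does, skipping any that fail to compile.
	var res []*regexp.Regexp
	for _, pat := range []string{`/blog/`, `\.html$`} {
		if re, err := regexp.Compile(pat); err == nil {
			res = append(res, re)
		}
	}

	fmt.Println(allowed(res, "https://example.com/blog/post")) // true
	fmt.Println(allowed(res, "https://example.com/pricing"))   // false
	fmt.Println(allowed(nil, "https://example.com/pricing"))   // true: no patterns configured
}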