summaryrefslogtreecommitdiff
path: root/scrape.go
diff options
context:
space:
mode:
Diffstat (limited to 'scrape.go')
-rw-r--r--scrape.go32
1 files changed, 31 insertions, 1 deletions
diff --git a/scrape.go b/scrape.go
index 0c31518..3ecfe7b 100644
--- a/scrape.go
+++ b/scrape.go
@@ -17,7 +17,7 @@ type ScrapeParams struct {
type ScrapeOptions struct {
URL string `json:"url"`
- AllowedDomains []string `json:"allowed_domains"`
+ AllowedDomains []string `json:"allowedDomains"`
Depth int `json:"depth"`
}
@@ -62,6 +62,12 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
if s.FetchFunc == nil {
s.FetchFunc = Fetch()
}
+ if len(s.ScrapeOptions.AllowedDomains) == 0 {
+ u, err := url.Parse(s.ScrapeOptions.URL)
+ if err == nil {
+ s.ScrapeOptions.AllowedDomains = []string{u.Hostname()}
+ }
+ }
jobs := make(chan target, 1024)
results := make(chan result)
@@ -107,6 +113,10 @@ func (s *Scraper) worker(id int, jobs chan target, results chan<- result) {
continue
}
+ if !s.isURLAllowed(l) {
+ continue
+ }
+
s.wg.Add(1)
select {
case jobs <- target{url: l, depth: j.depth - 1}:
@@ -138,6 +148,26 @@ func (s *Scraper) process(job target) result {
return result{url: job.url, data: data, links: links}
}
+func (s *Scraper) isURLAllowed(rawurl string) bool {
+ u, err := url.Parse(rawurl)
+ if err != nil {
+ return false
+ }
+
+ host := u.Host()
+
+ for _, domain := range s.ScrapeOptions.AllowedDomains {
+ if domain == "*" {
+ return true
+ }
+ if host == domain {
+ return true
+ }
+ }
+
+ return false
+}
+
func Links(html string, origin string) []string {
var links []string
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))