| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-27 19:25:04 +0200 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-27 19:25:04 +0200 |
| commit | b2881a3bc04063ba520fa9795fd459d32a1b8874 (patch) | |
| tree | c49a55a869de978f0702c07ff7df9ae3bc080026 | |
| parent | 5c16435e2218344a6e232ebb48cf022a32ba85d5 (diff) | |
rename allow and block fields
| -rw-r--r-- | js/template.js | 6 |
| -rw-r--r-- | scrape.go | 65 |
| -rw-r--r-- | scrape_test.go | 75 |
3 files changed, 102 insertions, 44 deletions
diff --git a/js/template.js b/js/template.js
index ac78b47..75a60c7 100644
--- a/js/template.js
+++ b/js/template.js
@@ -3,8 +3,10 @@ import { parse } from 'flyscrape';
 export const options = {
   url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from.
   depth: 1,                             // Specify how deep links should be followed. (default = 0, no follow)
-  allowDomains: [],                     // Specify the allowed domains. * for all. (default = domain from url)
-  denyDomains: [],                      // Specify the denied domains. (default = none)
+  allowedDomains: [],                   // Specify the allowed domains. ['*'] for all. (default = domain from url)
+  blockedDomains: [],                   // Specify the blocked domains. (default = none)
+  allowedURLs: [],                      // Specify the allowed URLs as regex. (default = all allowed)
+  blockedURLs: [],                      // Specify the blocked URLs as regex. (default = non blocked)
   rate: 100,                            // Specify the rate in requests per second. (default = 100)
 }
diff --git a/scrape.go b/scrape.go
--- a/scrape.go
+++ b/scrape.go
@@ -22,13 +22,14 @@ type ScrapeParams struct {
 }
 
 type ScrapeOptions struct {
-	URL          string   `json:"url"`
-	AllowDomains []string `json:"allowDomains"`
-	DenyDomains  []string `json:"denyDomains"`
-	AllowURLs    []string `json:"allowURLs"`
-	Proxy        string   `json:"proxy"`
-	Depth        int      `json:"depth"`
-	Rate         float64  `json:"rate"`
+	URL            string   `json:"url"`
+	AllowedDomains []string `json:"allowedDomains"`
+	BlockedDomains []string `json:"blockedDomains"`
+	AllowedURLs    []string `json:"allowedURLs"`
+	BlockedURLs    []string `json:"blockedURLs"`
+	Proxy          string   `json:"proxy"`
+	Depth          int      `json:"depth"`
+	Rate           float64  `json:"rate"`
 }
 
 type ScrapeResult struct {
@@ -57,11 +58,12 @@ type Scraper struct {
 	ScrapeFunc ScrapeFunc
 	FetchFunc  FetchFunc
 
-	visited     *hashmap.Map[string, struct{}]
-	wg          *sync.WaitGroup
-	jobs        chan target
-	results     chan ScrapeResult
-	allowURLsRE []*regexp.Regexp
+	visited       *hashmap.Map[string, struct{}]
+	wg            *sync.WaitGroup
+	jobs          chan target
+	results       chan ScrapeResult
+	allowedURLsRE []*regexp.Regexp
+	blockedURLsRE []*regexp.Regexp
 }
 
 func (s *Scraper) init() {
@@ -82,15 +84,23 @@ func (s *Scraper) init() {
 	}
 
 	if u, err := url.Parse(s.ScrapeOptions.URL); err == nil {
-		s.ScrapeOptions.AllowDomains = append(s.ScrapeOptions.AllowDomains, u.Host())
+		s.ScrapeOptions.AllowedDomains = append(s.ScrapeOptions.AllowedDomains, u.Host())
 	}
 
-	for _, pat := range s.ScrapeOptions.AllowURLs {
+	for _, pat := range s.ScrapeOptions.AllowedURLs {
 		re, err := regexp.Compile(pat)
 		if err != nil {
 			continue
 		}
-		s.allowURLsRE = append(s.allowURLsRE, re)
+		s.allowedURLsRE = append(s.allowedURLsRE, re)
+	}
+
+	for _, pat := range s.ScrapeOptions.BlockedURLs {
+		re, err := regexp.Compile(pat)
+		if err != nil {
+			continue
+		}
+		s.blockedURLsRE = append(s.blockedURLsRE, re)
 	}
 }
@@ -179,14 +189,14 @@ func (s *Scraper) isDomainAllowed(rawurl string) bool {
 	host := u.Host()
 
 	ok := false
-	for _, domain := range s.ScrapeOptions.AllowDomains {
+	for _, domain := range s.ScrapeOptions.AllowedDomains {
 		if domain == "*" || host == domain {
 			ok = true
 			break
 		}
 	}
 
-	for _, domain := range s.ScrapeOptions.DenyDomains {
+	for _, domain := range s.ScrapeOptions.BlockedDomains {
 		if host == domain {
 			ok = false
 			break
@@ -197,15 +207,32 @@ func (s *Scraper) isURLAllowed(rawurl string) bool {
-	if len(s.allowURLsRE) == 0 {
+	// allow root url
+	if rawurl == s.ScrapeOptions.URL {
+		return true
+	}
+
+	// allow if no filter is set
+	if len(s.allowedURLsRE) == 0 && len(s.blockedURLsRE) == 0 {
 		return true
 	}
 
 	ok := false
+	if len(s.allowedURLsRE) == 0 {
+		ok = true
+	}
 
-	for _, re := range s.allowURLsRE {
+	for _, re := range s.allowedURLsRE {
 		if re.MatchString(rawurl) {
 			ok = true
+			break
+		}
+	}
+
+	for _, re := range s.blockedURLsRE {
+		if re.MatchString(rawurl) {
+			ok = false
+			break
 		}
 	}
diff --git a/scrape_test.go b/scrape_test.go
index acfbbbf..c1a32b6 100644
--- a/scrape_test.go
+++ b/scrape_test.go
@@ -18,9 +18,9 @@ import (
 func TestScrapeFollowLinks(t *testing.T) {
 	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:          "http://www.example.com/foo/bar",
-			Depth:        1,
-			AllowDomains: []string{"www.google.com"},
+			URL:            "http://www.example.com/foo/bar",
+			Depth:          1,
+			AllowedDomains: []string{"www.google.com"},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
 			return "foobar", nil
@@ -47,9 +47,9 @@ func TestScrapeFollowLinks(t *testing.T) {
 func TestScrapeDepth(t *testing.T) {
 	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:          "http://www.example.com/",
-			Depth:        2,
-			AllowDomains: []string{"*"},
+			URL:            "http://www.example.com/",
+			Depth:          2,
+			AllowedDomains: []string{"*"},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
 			return "foobar", nil
@@ -78,12 +78,12 @@ func TestScrapeDepth(t *testing.T) {
 	require.Contains(t, urls, "http://www.duckduckgo.com/")
 }
 
-func TestScrapeAllowDomains(t *testing.T) {
+func TestScrapeAllowedDomains(t *testing.T) {
 	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:          "http://www.example.com/",
-			Depth:        1,
-			AllowDomains: []string{"www.google.com"},
+			URL:            "http://www.example.com/",
+			Depth:          1,
+			AllowedDomains: []string{"www.google.com"},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
 			return "foobar", nil
@@ -104,12 +104,12 @@ func TestScrapeAllowDomains(t *testing.T) {
 	require.Contains(t, urls, "http://www.google.com/")
 }
 
-func TestScrapeAllowDomainsAll(t *testing.T) {
+func TestScrapeAllowedDomainsAll(t *testing.T) {
 	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:          "http://www.example.com/",
-			Depth:        1,
-			AllowDomains: []string{"*"},
+			URL:            "http://www.example.com/",
+			Depth:          1,
+			AllowedDomains: []string{"*"},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
 			return "foobar", nil
@@ -131,13 +131,13 @@ func TestScrapeAllowDomainsAll(t *testing.T) {
 	require.Contains(t, urls, "http://www.google.com/")
 }
 
-func TestScrapeDenyDomains(t *testing.T) {
+func TestScrapeBlockedDomains(t *testing.T) {
 	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:          "http://www.example.com/",
-			Depth:        1,
-			AllowDomains: []string{"*"},
-			DenyDomains:  []string{"www.google.com"},
+			URL:            "http://www.example.com/",
+			Depth:          1,
+			AllowedDomains: []string{"*"},
+			BlockedDomains: []string{"www.google.com"},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
 			return "foobar", nil
@@ -158,12 +158,12 @@ func TestScrapeDenyDomains(t *testing.T) {
 	require.Contains(t, urls, "http://www.duckduckgo.com/")
 }
 
-func TestScrapeAllowURLs(t *testing.T) {
+func TestScrapeAllowedURLs(t *testing.T) {
 	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:       "http://www.example.com/",
-			Depth:     1,
-			AllowURLs: []string{`/foo\?id=\d+`, `/bar$`},
+			URL:         "http://www.example.com/",
+			Depth:       1,
+			AllowedURLs: []string{`/foo\?id=\d+`, `/bar$`},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
 			return "foobar", nil
@@ -187,6 +187,35 @@ func TestScrapeAllowURLs(t *testing.T) {
 	require.Contains(t, urls, "http://www.example.com/bar")
 }
 
+func TestScrapeBlockedURLs(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:         "http://www.example.com/",
+			Depth:       1,
+			BlockedURLs: []string{`/foo\?id=\d+`, `/bar$`},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="foo?id=123">123</a>
+                    <a href="foo?id=ABC">ABC</a>
+                    <a href="/bar">bar</a>
+                    <a href="/barz">barz</a>`, nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.example.com/foo?id=ABC")
+	require.Contains(t, urls, "http://www.example.com/barz")
+}
+
 func TestScrapeRate(t *testing.T) {
 	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
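The new tests exercise allowedURLs and blockedURLs separately. As an illustration of how the two filters compose in the reworked isURLAllowed (the root URL always passes, a non-empty allow list must match first, and the block list can then veto the match), here is a small sketch in the style of the tests above. It is not part of the commit: the module and testify import paths are assumed, and the URLs and regex patterns are invented for the example.

```go
package flyscrape_test

import (
	"testing"

	"github.com/philippta/flyscrape"
	"github.com/stretchr/testify/require"
)

// TestScrapeAllowedAndBlockedURLs is a hypothetical test, not part of the
// commit, showing AllowedURLs and BlockedURLs used together.
func TestScrapeAllowedAndBlockedURLs(t *testing.T) {
	scr := flyscrape.Scraper{
		ScrapeOptions: flyscrape.ScrapeOptions{
			URL:         "http://www.example.com/",
			Depth:       1,
			AllowedURLs: []string{`/foo`},      // only links matching /foo pass the allow list
			BlockedURLs: []string{`\?id=\d+$`}, // ...unless they end in a numeric id
		},
		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
			return "foobar", nil
		},
		FetchFunc: func(url string) (string, error) {
			return `<a href="foo?id=123">123</a>
			        <a href="foo?id=ABC">ABC</a>
			        <a href="/bar">bar</a>`, nil
		},
	}

	urls := make(map[string]struct{})
	for res := range scr.Scrape() {
		urls[res.URL] = struct{}{}
	}

	// The root URL is always allowed; /foo?id=ABC matches the allow list and
	// is not blocked; /foo?id=123 is vetoed by the block list; /bar never
	// matches the allow list.
	require.Len(t, urls, 2)
	require.Contains(t, urls, "http://www.example.com/")
	require.Contains(t, urls, "http://www.example.com/foo?id=ABC")
}
```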