author     Philipp Tanlak <philipp.tanlak@gmail.com>   2023-08-27 19:25:04 +0200
committer  Philipp Tanlak <philipp.tanlak@gmail.com>   2023-08-27 19:25:04 +0200
commit     b2881a3bc04063ba520fa9795fd459d32a1b8874 (patch)
tree       c49a55a869de978f0702c07ff7df9ae3bc080026
parent     5c16435e2218344a6e232ebb48cf022a32ba85d5 (diff)
rename allow and block fields
-rw-r--r--  js/template.js   |  6
-rw-r--r--  scrape.go        | 65
-rw-r--r--  scrape_test.go   | 75
3 files changed, 102 insertions, 44 deletions
diff --git a/js/template.js b/js/template.js
index ac78b47..75a60c7 100644
--- a/js/template.js
+++ b/js/template.js
@@ -3,8 +3,10 @@ import { parse } from 'flyscrape';
export const options = {
url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from.
depth: 1, // Specify how deep links should be followed. (default = 0, no follow)
- allowDomains: [], // Specify the allowed domains. * for all. (default = domain from url)
- denyDomains: [], // Specify the denied domains. (default = none)
+ allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
+ blockedDomains: [], // Specify the blocked domains. (default = none)
+ allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
+ blockedURLs: [], // Specify the blocked URLs as regex. (default = none blocked)
rate: 100, // Specify the rate in requests per second. (default = 100)
}
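
For orientation, here is a minimal sketch of how a config using the renamed fields decodes into the Go ScrapeOptions struct from the scrape.go hunk below. The struct and JSON tags are copied from this commit; the sample values and the main function are illustrative only.

package main

import (
	"encoding/json"
	"fmt"
)

// ScrapeOptions mirrors the renamed fields introduced by this commit.
type ScrapeOptions struct {
	URL            string   `json:"url"`
	AllowedDomains []string `json:"allowedDomains"`
	BlockedDomains []string `json:"blockedDomains"`
	AllowedURLs    []string `json:"allowedURLs"`
	BlockedURLs    []string `json:"blockedURLs"`
	Proxy          string   `json:"proxy"`
	Depth          int      `json:"depth"`
	Rate           float64  `json:"rate"`
}

func main() {
	raw := `{
		"url": "https://news.ycombinator.com/",
		"depth": 1,
		"allowedDomains": ["*"],
		"blockedDomains": ["example.com"],
		"allowedURLs": ["/item\\?id=\\d+"],
		"blockedURLs": ["/login$"],
		"rate": 100
	}`

	var opts ScrapeOptions
	if err := json.Unmarshal([]byte(raw), &opts); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", opts) // all renamed fields populated
}
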
diff --git a/scrape.go b/scrape.go
index 1f9ad97..8b6ce11 100644
--- a/scrape.go
+++ b/scrape.go
@@ -22,13 +22,14 @@ type ScrapeParams struct {
}
type ScrapeOptions struct {
- URL string `json:"url"`
- AllowDomains []string `json:"allowDomains"`
- DenyDomains []string `json:"denyDomains"`
- AllowURLs []string `json:"allowURLs"`
- Proxy string `json:"proxy"`
- Depth int `json:"depth"`
- Rate float64 `json:"rate"`
+ URL string `json:"url"`
+ AllowedDomains []string `json:"allowedDomains"`
+ BlockedDomains []string `json:"blockedDomains"`
+ AllowedURLs []string `json:"allowedURLs"`
+ BlockedURLs []string `json:"blockedURLs"`
+ Proxy string `json:"proxy"`
+ Depth int `json:"depth"`
+ Rate float64 `json:"rate"`
}
type ScrapeResult struct {
@@ -57,11 +58,12 @@ type Scraper struct {
ScrapeFunc ScrapeFunc
FetchFunc FetchFunc
- visited *hashmap.Map[string, struct{}]
- wg *sync.WaitGroup
- jobs chan target
- results chan ScrapeResult
- allowURLsRE []*regexp.Regexp
+ visited *hashmap.Map[string, struct{}]
+ wg *sync.WaitGroup
+ jobs chan target
+ results chan ScrapeResult
+ allowedURLsRE []*regexp.Regexp
+ blockedURLsRE []*regexp.Regexp
}
func (s *Scraper) init() {
@@ -82,15 +84,23 @@ func (s *Scraper) init() {
}
if u, err := url.Parse(s.ScrapeOptions.URL); err == nil {
- s.ScrapeOptions.AllowDomains = append(s.ScrapeOptions.AllowDomains, u.Hostname())
+ s.ScrapeOptions.AllowedDomains = append(s.ScrapeOptions.AllowedDomains, u.Hostname())
}
- for _, pat := range s.ScrapeOptions.AllowURLs {
+ for _, pat := range s.ScrapeOptions.AllowedURLs {
re, err := regexp.Compile(pat)
if err != nil {
continue
}
- s.allowURLsRE = append(s.allowURLsRE, re)
+ s.allowedURLsRE = append(s.allowedURLsRE, re)
+ }
+
+ for _, pat := range s.ScrapeOptions.BlockedURLs {
+ re, err := regexp.Compile(pat)
+ if err != nil {
+ continue
+ }
+ s.blockedURLsRE = append(s.blockedURLsRE, re)
}
}
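
Both new pattern lists go through the same compile loop in init(): a pattern that fails to compile is skipped rather than aborting the scrape. A standalone sketch of that behavior, assuming a hypothetical compilePatterns helper that is not part of this commit:

package main

import (
	"fmt"
	"regexp"
)

// compilePatterns mirrors the loops added to init(): invalid patterns are
// silently dropped, valid ones are kept. (Hypothetical helper.)
func compilePatterns(patterns []string) []*regexp.Regexp {
	var res []*regexp.Regexp
	for _, pat := range patterns {
		re, err := regexp.Compile(pat)
		if err != nil {
			continue // bad pattern: ignore it, keep the rest
		}
		res = append(res, re)
	}
	return res
}

func main() {
	blocked := compilePatterns([]string{`/bar$`, `(`}) // second pattern is invalid
	fmt.Println(len(blocked))                          // 1
}
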
@@ -179,14 +189,14 @@ func (s *Scraper) isDomainAllowed(rawurl string) bool {
host := u.Hostname()
ok := false
- for _, domain := range s.ScrapeOptions.AllowDomains {
+ for _, domain := range s.ScrapeOptions.AllowedDomains {
if domain == "*" || host == domain {
ok = true
break
}
}
- for _, domain := range s.ScrapeOptions.DenyDomains {
+ for _, domain := range s.ScrapeOptions.BlockedDomains {
if host == domain {
ok = false
break
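
The domain check itself only changes names here; the precedence stays the same: a host must match an entry in allowedDomains (or "*"), and any match in blockedDomains overrides that. A condensed, hypothetical sketch of the rule:

package main

import "fmt"

// hostAllowed condenses the logic of isDomainAllowed with the renamed fields:
// the allow list grants access ("*" matches any host), the block list revokes it.
// Hypothetical helper, not part of this commit.
func hostAllowed(host string, allowed, blocked []string) bool {
	ok := false
	for _, d := range allowed {
		if d == "*" || host == d {
			ok = true
			break
		}
	}
	for _, d := range blocked {
		if host == d {
			ok = false
			break
		}
	}
	return ok
}

func main() {
	fmt.Println(hostAllowed("www.google.com", []string{"*"}, []string{"www.google.com"}))     // false: block wins
	fmt.Println(hostAllowed("www.duckduckgo.com", []string{"*"}, []string{"www.google.com"})) // true
}
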
@@ -197,15 +207,32 @@ func (s *Scraper) isDomainAllowed(rawurl string) bool {
}
func (s *Scraper) isURLAllowed(rawurl string) bool {
- if len(s.allowURLsRE) == 0 {
+ // allow root url
+ if rawurl == s.ScrapeOptions.URL {
+ return true
+ }
+
+ // allow if no filter is set
+ if len(s.allowedURLsRE) == 0 && len(s.blockedURLsRE) == 0 {
return true
}
ok := false
+ if len(s.allowedURLsRE) == 0 {
+ ok = true
+ }
- for _, re := range s.allowURLsRE {
+ for _, re := range s.allowedURLsRE {
if re.MatchString(rawurl) {
ok = true
+ break
+ }
+ }
+
+ for _, re := range s.blockedURLsRE {
+ if re.MatchString(rawurl) {
+ ok = false
+ break
}
}
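
The new isURLAllowed combines four rules: the start URL is always allowed, no configured filters means everything is allowed, an empty allowedURLs list defaults to allow, and a blockedURLs match always wins. A condensed sketch under those assumptions (urlAllowed is a hypothetical stand-in for the method above, with rootURL in place of ScrapeOptions.URL):

package main

import (
	"fmt"
	"regexp"
)

// urlAllowed condenses the decision added to isURLAllowed in this commit.
// Hypothetical helper; not the committed code.
func urlAllowed(rawurl, rootURL string, allowed, blocked []*regexp.Regexp) bool {
	if rawurl == rootURL {
		return true // the start URL is never filtered out
	}
	if len(allowed) == 0 && len(blocked) == 0 {
		return true // no filters configured
	}
	ok := len(allowed) == 0 // with only a block list, default to allow
	for _, re := range allowed {
		if re.MatchString(rawurl) {
			ok = true
			break
		}
	}
	for _, re := range blocked {
		if re.MatchString(rawurl) {
			ok = false // a blocked match overrides any allow
			break
		}
	}
	return ok
}

func main() {
	blocked := []*regexp.Regexp{
		regexp.MustCompile(`/foo\?id=\d+`),
		regexp.MustCompile(`/bar$`),
	}
	root := "http://www.example.com/"
	fmt.Println(urlAllowed(root+"foo?id=123", root, nil, blocked)) // false: blocked
	fmt.Println(urlAllowed(root+"barz", root, nil, blocked))       // true
}

This matches the expectations in the new TestScrapeBlockedURLs below: foo?id=123 and /bar are filtered out, while foo?id=ABC and /barz are still scraped.
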
diff --git a/scrape_test.go b/scrape_test.go
index acfbbbf..c1a32b6 100644
--- a/scrape_test.go
+++ b/scrape_test.go
@@ -18,9 +18,9 @@ import (
func TestScrapeFollowLinks(t *testing.T) {
scr := flyscrape.Scraper{
ScrapeOptions: flyscrape.ScrapeOptions{
- URL: "http://www.example.com/foo/bar",
- Depth: 1,
- AllowDomains: []string{"www.google.com"},
+ URL: "http://www.example.com/foo/bar",
+ Depth: 1,
+ AllowedDomains: []string{"www.google.com"},
},
ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
return "foobar", nil
@@ -47,9 +47,9 @@ func TestScrapeFollowLinks(t *testing.T) {
func TestScrapeDepth(t *testing.T) {
scr := flyscrape.Scraper{
ScrapeOptions: flyscrape.ScrapeOptions{
- URL: "http://www.example.com/",
- Depth: 2,
- AllowDomains: []string{"*"},
+ URL: "http://www.example.com/",
+ Depth: 2,
+ AllowedDomains: []string{"*"},
},
ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
return "foobar", nil
@@ -78,12 +78,12 @@ func TestScrapeDepth(t *testing.T) {
require.Contains(t, urls, "http://www.duckduckgo.com/")
}
-func TestScrapeAllowDomains(t *testing.T) {
+func TestScrapeAllowedDomains(t *testing.T) {
scr := flyscrape.Scraper{
ScrapeOptions: flyscrape.ScrapeOptions{
- URL: "http://www.example.com/",
- Depth: 1,
- AllowDomains: []string{"www.google.com"},
+ URL: "http://www.example.com/",
+ Depth: 1,
+ AllowedDomains: []string{"www.google.com"},
},
ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
return "foobar", nil
@@ -104,12 +104,12 @@ func TestScrapeAllowDomains(t *testing.T) {
require.Contains(t, urls, "http://www.google.com/")
}
-func TestScrapeAllowDomainsAll(t *testing.T) {
+func TestScrapeAllowedDomainsAll(t *testing.T) {
scr := flyscrape.Scraper{
ScrapeOptions: flyscrape.ScrapeOptions{
- URL: "http://www.example.com/",
- Depth: 1,
- AllowDomains: []string{"*"},
+ URL: "http://www.example.com/",
+ Depth: 1,
+ AllowedDomains: []string{"*"},
},
ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
return "foobar", nil
@@ -131,13 +131,13 @@ func TestScrapeAllowDomainsAll(t *testing.T) {
require.Contains(t, urls, "http://www.google.com/")
}
-func TestScrapeDenyDomains(t *testing.T) {
+func TestScrapeBlockedDomains(t *testing.T) {
scr := flyscrape.Scraper{
ScrapeOptions: flyscrape.ScrapeOptions{
- URL: "http://www.example.com/",
- Depth: 1,
- AllowDomains: []string{"*"},
- DenyDomains: []string{"www.google.com"},
+ URL: "http://www.example.com/",
+ Depth: 1,
+ AllowedDomains: []string{"*"},
+ BlockedDomains: []string{"www.google.com"},
},
ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
return "foobar", nil
@@ -158,12 +158,12 @@ func TestScrapeDenyDomains(t *testing.T) {
require.Contains(t, urls, "http://www.duckduckgo.com/")
}
-func TestScrapeAllowURLs(t *testing.T) {
+func TestScrapeAllowedURLs(t *testing.T) {
scr := flyscrape.Scraper{
ScrapeOptions: flyscrape.ScrapeOptions{
- URL: "http://www.example.com/",
- Depth: 1,
- AllowURLs: []string{`/foo\?id=\d+`, `/bar$`},
+ URL: "http://www.example.com/",
+ Depth: 1,
+ AllowedURLs: []string{`/foo\?id=\d+`, `/bar$`},
},
ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
return "foobar", nil
@@ -187,6 +187,35 @@ func TestScrapeAllowURLs(t *testing.T) {
require.Contains(t, urls, "http://www.example.com/bar")
}
+func TestScrapeBlockedURLs(t *testing.T) {
+ scr := flyscrape.Scraper{
+ ScrapeOptions: flyscrape.ScrapeOptions{
+ URL: "http://www.example.com/",
+ Depth: 1,
+ BlockedURLs: []string{`/foo\?id=\d+`, `/bar$`},
+ },
+ ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+ return "foobar", nil
+ },
+ FetchFunc: func(url string) (string, error) {
+ return `<a href="foo?id=123">123</a>
+ <a href="foo?id=ABC">ABC</a>
+ <a href="/bar">bar</a>
+ <a href="/barz">barz</a>`, nil
+ },
+ }
+
+ urls := make(map[string]struct{})
+ for res := range scr.Scrape() {
+ urls[res.URL] = struct{}{}
+ }
+
+ require.Len(t, urls, 3)
+ require.Contains(t, urls, "http://www.example.com/")
+ require.Contains(t, urls, "http://www.example.com/foo?id=ABC")
+ require.Contains(t, urls, "http://www.example.com/barz")
+}
+
func TestScrapeRate(t *testing.T) {
scr := flyscrape.Scraper{
ScrapeOptions: flyscrape.ScrapeOptions{