| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-27 19:25:04 +0200 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-27 19:25:04 +0200 |
| commit | b2881a3bc04063ba520fa9795fd459d32a1b8874 (patch) | |
| tree | c49a55a869de978f0702c07ff7df9ae3bc080026 | |
| parent | 5c16435e2218344a6e232ebb48cf022a32ba85d5 (diff) | |
rename allow and block fields
| -rw-r--r-- | js/template.js | 6 |
| -rw-r--r-- | scrape.go | 65 |
| -rw-r--r-- | scrape_test.go | 75 |
3 files changed, 102 insertions, 44 deletions
diff --git a/js/template.js b/js/template.js
index ac78b47..75a60c7 100644
--- a/js/template.js
+++ b/js/template.js
@@ -3,8 +3,10 @@ import { parse } from 'flyscrape';
 export const options = {
   url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from.
   depth: 1,                             // Specify how deep links should be followed. (default = 0, no follow)
-  allowDomains: [],                     // Specify the allowed domains. * for all. (default = domain from url)
-  denyDomains: [],                      // Specify the denied domains. (default = none)
+  allowedDomains: [],                   // Specify the allowed domains. ['*'] for all. (default = domain from url)
+  blockedDomains: [],                   // Specify the blocked domains. (default = none)
+  allowedURLs: [],                      // Specify the allowed URLs as regex. (default = all allowed)
+  blockedURLs: [],                      // Specify the blocked URLs as regex. (default = non blocked)
   rate: 100,                            // Specify the rate in requests per second. (default = 100)
 }
diff --git a/scrape.go b/scrape.go
--- a/scrape.go
+++ b/scrape.go
@@ -22,13 +22,14 @@ type ScrapeParams struct {
 }
 
 type ScrapeOptions struct {
-	URL          string   `json:"url"`
-	AllowDomains []string `json:"allowDomains"`
-	DenyDomains  []string `json:"denyDomains"`
-	AllowURLs    []string `json:"allowURLs"`
-	Proxy        string   `json:"proxy"`
-	Depth        int      `json:"depth"`
-	Rate         float64  `json:"rate"`
+	URL            string   `json:"url"`
+	AllowedDomains []string `json:"allowedDomains"`
+	BlockedDomains []string `json:"blockedDomains"`
+	AllowedURLs    []string `json:"allowedURLs"`
+	BlockedURLs    []string `json:"blockedURLs"`
+	Proxy          string   `json:"proxy"`
+	Depth          int      `json:"depth"`
+	Rate           float64  `json:"rate"`
 }
 
 type ScrapeResult struct {
@@ -57,11 +58,12 @@ type Scraper struct {
 	ScrapeFunc ScrapeFunc
 	FetchFunc  FetchFunc
 
-	visited     *hashmap.Map[string, struct{}]
-	wg          *sync.WaitGroup
-	jobs        chan target
-	results     chan ScrapeResult
-	allowURLsRE []*regexp.Regexp
+	visited       *hashmap.Map[string, struct{}]
+	wg            *sync.WaitGroup
+	jobs          chan target
+	results       chan ScrapeResult
+	allowedURLsRE []*regexp.Regexp
+	blockedURLsRE []*regexp.Regexp
 }
 
 func (s *Scraper) init() {
@@ -82,15 +84,23 @@ func (s *Scraper) init() {
 	}
 
 	if u, err := url.Parse(s.ScrapeOptions.URL); err == nil {
-		s.ScrapeOptions.AllowDomains = append(s.ScrapeOptions.AllowDomains, u.Host())
+		s.ScrapeOptions.AllowedDomains = append(s.ScrapeOptions.AllowedDomains, u.Host())
 	}
 
-	for _, pat := range s.ScrapeOptions.AllowURLs {
+	for _, pat := range s.ScrapeOptions.AllowedURLs {
 		re, err := regexp.Compile(pat)
 		if err != nil {
 			continue
 		}
-		s.allowURLsRE = append(s.allowURLsRE, re)
+		s.allowedURLsRE = append(s.allowedURLsRE, re)
+	}
+
+	for _, pat := range s.ScrapeOptions.BlockedURLs {
+		re, err := regexp.Compile(pat)
+		if err != nil {
+			continue
+		}
+		s.blockedURLsRE = append(s.blockedURLsRE, re)
 	}
 }
@@ -179,14 +189,14 @@ func (s *Scraper) isDomainAllowed(rawurl string) bool {
 	host := u.Host()
 
 	ok := false
-	for _, domain := range s.ScrapeOptions.AllowDomains {
+	for _, domain := range s.ScrapeOptions.AllowedDomains {
 		if domain == "*" || host == domain {
 			ok = true
 			break
 		}
 	}
 
-	for _, domain := range s.ScrapeOptions.DenyDomains {
+	for _, domain := range s.ScrapeOptions.BlockedDomains {
 		if host == domain {
 			ok = false
 			break
@@ -197,15 +207,32 @@ func (s *Scraper) isURLAllowed(rawurl string) bool {
-	if len(s.allowURLsRE) == 0 {
+	// allow root url
+	if rawurl == s.ScrapeOptions.URL {
+		return true
+	}
+
+	// allow if no filter is set
+	if len(s.allowedURLsRE) == 0 && len(s.blockedURLsRE) == 0 {
 		return true
 	}
 
 	ok := false
+	if len(s.allowedURLsRE) == 0 {
+		ok = true
+	}
 
-	for _, re := range s.allowURLsRE {
+	for _, re := range s.allowedURLsRE {
 		if re.MatchString(rawurl) {
 			ok = true
+			break
+		}
+	}
+
+	for _, re := range s.blockedURLsRE {
+		if re.MatchString(rawurl) {
+			ok = false
+			break
 		}
 	}
diff --git a/scrape_test.go b/scrape_test.go
index acfbbbf..c1a32b6 100644
--- a/scrape_test.go
+++ b/scrape_test.go
@@ -18,9 +18,9 @@ import (
 func TestScrapeFollowLinks(t *testing.T) {
 	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:          "http://www.example.com/foo/bar",
-			Depth:        1,
-			AllowDomains: []string{"www.google.com"},
+			URL:            "http://www.example.com/foo/bar",
+			Depth:          1,
+			AllowedDomains: []string{"www.google.com"},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
 			return "foobar", nil
@@ -47,9 +47,9 @@ func TestScrapeFollowLinks(t *testing.T) {
 func TestScrapeDepth(t *testing.T) {
 	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:          "http://www.example.com/",
-			Depth:        2,
-			AllowDomains: []string{"*"},
+			URL:            "http://www.example.com/",
+			Depth:          2,
+			AllowedDomains: []string{"*"},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
 			return "foobar", nil
@@ -78,12 +78,12 @@ func TestScrapeDepth(t *testing.T) {
 	require.Contains(t, urls, "http://www.duckduckgo.com/")
 }
 
-func TestScrapeAllowDomains(t *testing.T) {
+func TestScrapeAllowedDomains(t *testing.T) {
 	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:          "http://www.example.com/",
-			Depth:        1,
-			AllowDomains: []string{"www.google.com"},
+			URL:            "http://www.example.com/",
+			Depth:          1,
+			AllowedDomains: []string{"www.google.com"},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
 			return "foobar", nil
@@ -104,12 +104,12 @@ func TestScrapeAllowDomains(t *testing.T) {
 	require.Contains(t, urls, "http://www.google.com/")
 }
 
-func TestScrapeAllowDomainsAll(t *testing.T) {
+func TestScrapeAllowedDomainsAll(t *testing.T) {
 	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:          "http://www.example.com/",
-			Depth:        1,
-			AllowDomains: []string{"*"},
+			URL:            "http://www.example.com/",
+			Depth:          1,
+			AllowedDomains: []string{"*"},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
 			return "foobar", nil
@@ -131,13 +131,13 @@ func TestScrapeAllowDomainsAll(t *testing.T) {
 	require.Contains(t, urls, "http://www.google.com/")
 }
 
-func TestScrapeDenyDomains(t *testing.T) {
+func TestScrapeBlockedDomains(t *testing.T) {
 	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:          "http://www.example.com/",
-			Depth:        1,
-			AllowDomains: []string{"*"},
-			DenyDomains:  []string{"www.google.com"},
+			URL:            "http://www.example.com/",
+			Depth:          1,
+			AllowedDomains: []string{"*"},
+			BlockedDomains: []string{"www.google.com"},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
 			return "foobar", nil
@@ -158,12 +158,12 @@ func TestScrapeDenyDomains(t *testing.T) {
 	require.Contains(t, urls, "http://www.duckduckgo.com/")
 }
 
-func TestScrapeAllowURLs(t *testing.T) {
+func TestScrapeAllowedURLs(t *testing.T) {
 	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:       "http://www.example.com/",
-			Depth:     1,
-			AllowURLs: []string{`/foo\?id=\d+`, `/bar$`},
+			URL:         "http://www.example.com/",
+			Depth:       1,
+			AllowedURLs: []string{`/foo\?id=\d+`, `/bar$`},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
 			return "foobar", nil
@@ -187,6 +187,35 @@ func TestScrapeAllowURLs(t *testing.T) {
 	require.Contains(t, urls, "http://www.example.com/bar")
 }
 
+func TestScrapeBlockedURLs(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:         "http://www.example.com/",
+			Depth:       1,
+			BlockedURLs: []string{`/foo\?id=\d+`, `/bar$`},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="foo?id=123">123</a>
+                    <a href="foo?id=ABC">ABC</a>
+                    <a href="/bar">bar</a>
+                    <a href="/barz">barz</a>`, nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.example.com/foo?id=ABC")
+	require.Contains(t, urls, "http://www.example.com/barz")
+}
+
 func TestScrapeRate(t *testing.T) {
 	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
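The new tests exercise allowedURLs and blockedURLs separately. As an illustration of how the two filters compose in the reworked isURLAllowed (the root URL always passes, a non-empty allow list must match first, and the block list can then veto the match), here is a small sketch in the style of the tests above. It is not part of the commit: the module and testify import paths are assumed, and the URLs and regex patterns are invented for the example.

```go
package flyscrape_test

import (
	"testing"

	"github.com/philippta/flyscrape"
	"github.com/stretchr/testify/require"
)

// TestScrapeAllowedAndBlockedURLs is a hypothetical test, not part of the
// commit, showing AllowedURLs and BlockedURLs used together.
func TestScrapeAllowedAndBlockedURLs(t *testing.T) {
	scr := flyscrape.Scraper{
		ScrapeOptions: flyscrape.ScrapeOptions{
			URL:         "http://www.example.com/",
			Depth:       1,
			AllowedURLs: []string{`/foo`},      // only links matching /foo pass the allow list
			BlockedURLs: []string{`\?id=\d+$`}, // ...unless they end in a numeric id
		},
		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
			return "foobar", nil
		},
		FetchFunc: func(url string) (string, error) {
			return `<a href="foo?id=123">123</a>
			        <a href="foo?id=ABC">ABC</a>
			        <a href="/bar">bar</a>`, nil
		},
	}

	urls := make(map[string]struct{})
	for res := range scr.Scrape() {
		urls[res.URL] = struct{}{}
	}

	// The root URL is always allowed; /foo?id=ABC matches the allow list and
	// is not blocked; /foo?id=123 is vetoed by the block list; /bar never
	// matches the allow list.
	require.Len(t, urls, 2)
	require.Contains(t, urls, "http://www.example.com/")
	require.Contains(t, urls, "http://www.example.com/foo?id=ABC")
}
```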