| field | value | date |
|---|---|---|
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-27 19:10:49 +0200 |
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-27 19:10:49 +0200 |
| commit | 5c16435e2218344a6e232ebb48cf022a32ba85d5 | |
| tree | 3cfa1dbc1f489ba4509fc408a8c0afccca7f9c7c | |
| parent | 52107c13b4c2c4efa9269b187916f3195be5a10d | |
add tests and allow urls
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | cmd/flyscrape/run.go | 4 |
| -rw-r--r-- | cmd/flyscrape/watch.go | 10 |
| -rw-r--r-- | fetch.go | 39 |
| -rw-r--r-- | fetch_test.go | 56 |
| -rw-r--r-- | js/template.js | 4 |
| -rw-r--r-- | js_test.go | 36 |
| -rw-r--r-- | scrape.go | 63 |
| -rw-r--r-- | scrape_test.go | 234 |
8 files changed, 389 insertions, 57 deletions
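
In short, this commit turns `CachedFetch` into a decorator over any `FetchFunc` and adds `ProxiedFetch` as an alternative base fetcher (exposed as a `-proxy` flag on the `run` and `watch` commands), alongside the renamed `allowDomains`/`denyDomains` options and the new `allowURLs` and `proxy` options. As a minimal sketch of how the fetchers compose after this change, using only names that appear in the diff below (the `main` wrapper and the proxy address are illustrative, assuming the package is imported as `flyscrape` like in the tests):

```go
package main

import (
	"fmt"

	"flyscrape"
)

func main() {
	// Pick a base fetcher: plain HTTP, or HTTP routed through a proxy.
	var fetch flyscrape.FetchFunc = flyscrape.Fetch()
	// fetch = flyscrape.ProxiedFetch("http://localhost:8080") // placeholder proxy URL

	// CachedFetch now wraps an existing FetchFunc instead of fetching on its
	// own, so caching stacks on top of whichever base fetcher was chosen.
	fetch = flyscrape.CachedFetch(fetch)

	html, err := fetch("http://www.example.com/")
	if err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	fmt.Println(len(html), "bytes fetched")
}
```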
diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go
index cf8f8cf..8e83ca8 100644
--- a/cmd/flyscrape/run.go
+++ b/cmd/flyscrape/run.go
@@ -19,6 +19,7 @@ type RunCommand struct{}
 func (c *RunCommand) Run(args []string) error {
 	fs := flag.NewFlagSet("flyscrape-run", flag.ContinueOnError)
 	noPrettyPrint := fs.Bool("no-pretty-print", false, "no-pretty-print")
+	proxy := fs.String("proxy", "", "proxy")
 	fs.Usage = c.Usage
 
 	if err := fs.Parse(args); err != nil {
@@ -44,6 +45,9 @@ func (c *RunCommand) Run(args []string) error {
 		ScrapeOptions: opts,
 		ScrapeFunc:    scrape,
 	}
+	if *proxy != "" {
+		svc.FetchFunc = flyscrape.ProxiedFetch(*proxy)
+	}
 
 	count := 0
 	start := time.Now()
diff --git a/cmd/flyscrape/watch.go b/cmd/flyscrape/watch.go
index 02a3b45..777ae8a 100644
--- a/cmd/flyscrape/watch.go
+++ b/cmd/flyscrape/watch.go
@@ -18,6 +18,7 @@ type WatchCommand struct{}
 func (c *WatchCommand) Run(args []string) error {
 	fs := flag.NewFlagSet("flyscrape-watch", flag.ContinueOnError)
+	proxy := fs.String("proxy", "", "proxy")
 	fs.Usage = c.Usage
 
 	if err := fs.Parse(args); err != nil {
@@ -28,7 +29,14 @@ func (c *WatchCommand) Run(args []string) error {
 		return fmt.Errorf("too many arguments")
 	}
 
-	fetch := flyscrape.CachedFetch()
+	var fetch flyscrape.FetchFunc
+	if *proxy != "" {
+		fetch = flyscrape.ProxiedFetch(*proxy)
+	} else {
+		fetch = flyscrape.Fetch()
+	}
+
+	fetch = flyscrape.CachedFetch(fetch)
 
 	script := fs.Arg(0)
 	err := flyscrape.Watch(script, func(s string) error {
diff --git a/fetch.go b/fetch.go
--- a/fetch.go
+++ b/fetch.go
@@ -5,21 +5,29 @@ package flyscrape
 
 import (
+	"crypto/tls"
 	"io"
 	"net/http"
+	"net/url"
 
 	"github.com/cornelk/hashmap"
 )
 
-func CachedFetch() FetchFunc {
-	cache := hashmap.New[string, string]()
+func ProxiedFetch(proxyURL string) FetchFunc {
+	pu, err := url.Parse(proxyURL)
+	if err != nil {
+		panic("invalid proxy url")
+	}
 
-	return func(url string) (string, error) {
-		if html, ok := cache.Get(url); ok {
-			return html, nil
-		}
+	client := http.Client{
+		Transport: &http.Transport{
+			Proxy:           http.ProxyURL(pu),
+			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
+		},
+	}
 
-		resp, err := http.Get(url)
+	return func(url string) (string, error) {
+		resp, err := client.Get(url)
 		if err != nil {
 			return "", err
 		}
@@ -31,6 +39,23 @@ func CachedFetch() FetchFunc {
 		}
 
 		html := string(body)
+		return html, nil
+	}
+}
+
+func CachedFetch(fetch FetchFunc) FetchFunc {
+	cache := hashmap.New[string, string]()
+
+	return func(url string) (string, error) {
+		if html, ok := cache.Get(url); ok {
+			return html, nil
+		}
+
+		html, err := fetch(url)
+		if err != nil {
+			return "", err
+		}
+
+		cache.Set(url, html)
 		return html, nil
 	}
diff --git a/fetch_test.go b/fetch_test.go
new file mode 100644
index 0000000..5ee0222
--- /dev/null
+++ b/fetch_test.go
@@ -0,0 +1,56 @@
+package flyscrape_test
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"flyscrape"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestFetchFetch(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Write([]byte("foobar"))
+	}))
+
+	fetch := flyscrape.Fetch()
+
+	html, err := fetch(srv.URL)
+	require.NoError(t, err)
+	require.Equal(t, html, "foobar")
+}
+
+func TestFetchCachedFetch(t *testing.T) {
+	numcalled := 0
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		numcalled++
+		w.Write([]byte("foobar"))
+	}))
+
+	fetch := flyscrape.CachedFetch(flyscrape.Fetch())
+
+	html, err := fetch(srv.URL)
+	require.NoError(t, err)
+	require.Equal(t, html, "foobar")
+
+	html, err = fetch(srv.URL)
+	require.NoError(t, err)
+	require.Equal(t, html, "foobar")
+
+	require.Equal(t, 1, numcalled)
+}
+
+func TestFetchProxiedFetch(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		require.Equal(t, r.URL.String(), "http://example.com/foo")
+		w.Write([]byte("foobar"))
+	}))
+
+	fetch := flyscrape.ProxiedFetch(srv.URL)
+
+	html, err := fetch("http://example.com/foo")
+	require.NoError(t, err)
+	require.Equal(t, html, "foobar")
+}
diff --git a/js/template.js b/js/template.js
index 56fffa0..ac78b47 100644
--- a/js/template.js
+++ b/js/template.js
@@ -3,8 +3,8 @@ import { parse } from 'flyscrape';
 export const options = {
     url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from.
     depth: 1, // Specify how deep links should be followed. (default = 0, no follow)
-    allowedDomains: ['news.ycombinator.com'], // Specify the allowed domains. (default = domain from url)
-    blockedDomains: [], // Specify the blocked domains. (default = none)
+    allowDomains: [], // Specify the allowed domains. * for all. (default = domain from url)
+    denyDomains: [], // Specify the denied domains. (default = none)
     rate: 100, // Specify the rate in requests per second. (default = 100)
 }
diff --git a/js_test.go b/js_test.go
--- a/js_test.go
+++ b/js_test.go
@@ -16,8 +16,8 @@ var html = `
 <html>
     <body>
         <main>
-            <h1>Plugins</h1>
-            <p>The plugin API allows you to inject code into various parts of the build process.</p>
+            <h1>headline</h1>
+            <p>paragraph</p>
         </main>
     </body>
 </html>`
@@ -34,22 +34,44 @@ export default function({ html, url }) {
 	return {
 		headline: $("h1").text(),
-		body: $("p").text()
+		body: $("p").text(),
+		url: url,
 	}
 }
 `
 
-func TestV8(t *testing.T) {
+func TestJSScrape(t *testing.T) {
 	opts, run, err := flyscrape.Compile(script)
 	require.NoError(t, err)
 	require.NotNil(t, opts)
 	require.NotNil(t, run)
 
-	extract, err := run(flyscrape.ScrapeParams{
+	result, err := run(flyscrape.ScrapeParams{
 		HTML: html,
+		URL:  "http://localhost/",
 	})
 	require.NoError(t, err)
-	require.Equal(t, "Plugins", extract.(map[string]any)["headline"])
-	require.Equal(t, "The plugin API allows you to inject code into various parts of the build process.", extract.(map[string]any)["body"])
+
+	m, ok := result.(map[string]any)
+	require.True(t, ok)
+	require.Equal(t, "headline", m["headline"])
+	require.Equal(t, "paragraph", m["body"])
+	require.Equal(t, "http://localhost/", m["url"])
+}
+
+func TestJSCompileError(t *testing.T) {
+	opts, run, err := flyscrape.Compile("import foo;")
+	require.Error(t, err)
+	require.Empty(t, opts)
+	require.Nil(t, run)
+
+	var terr flyscrape.TransformError
+	require.ErrorAs(t, err, &terr)
+
+	require.Equal(t, terr, flyscrape.TransformError{
+		Line:   1,
+		Column: 10,
+		Text:   `Expected "from" but found ";"`,
+	})
 }
diff --git a/scrape.go b/scrape.go
--- a/scrape.go
+++ b/scrape.go
@@ -6,6 +6,7 @@ package flyscrape
 
 import (
 	"log"
+	"regexp"
 	"strings"
 	"sync"
 	"time"
@@ -21,11 +22,13 @@ type ScrapeParams struct {
 }
 
 type ScrapeOptions struct {
-	URL            string   `json:"url"`
-	AllowedDomains []string `json:"allowedDomains"`
-	BlockedDomains []string `json:"blockedDomains"`
-	Depth          int      `json:"depth"`
-	Rate           float64  `json:"rate"`
+	URL          string   `json:"url"`
+	AllowDomains []string `json:"allowDomains"`
+	DenyDomains  []string `json:"denyDomains"`
+	AllowURLs    []string `json:"allowURLs"`
+	Proxy        string   `json:"proxy"`
+	Depth        int      `json:"depth"`
+	Rate         float64  `json:"rate"`
 }
 
 type ScrapeResult struct {
@@ -54,10 +57,11 @@ type Scraper struct {
 	ScrapeFunc ScrapeFunc
 	FetchFunc  FetchFunc
 
-	visited *hashmap.Map[string, struct{}]
-	wg      *sync.WaitGroup
-	jobs    chan target
-	results chan ScrapeResult
+	visited     *hashmap.Map[string, struct{}]
+	wg          *sync.WaitGroup
+	jobs        chan target
+	results     chan ScrapeResult
+	allowURLsRE []*regexp.Regexp
 }
 
 func (s *Scraper) init() {
@@ -69,16 +73,24 @@ func (s *Scraper) init() {
 	if s.FetchFunc == nil {
 		s.FetchFunc = Fetch()
 	}
+	if s.ScrapeOptions.Proxy != "" {
+		s.FetchFunc = ProxiedFetch(s.ScrapeOptions.Proxy)
+	}
 	if s.ScrapeOptions.Rate == 0 {
 		s.ScrapeOptions.Rate = 100
 	}
 
-	if len(s.ScrapeOptions.AllowedDomains) == 0 {
-		u, err := url.Parse(s.ScrapeOptions.URL)
-		if err == nil {
-			s.ScrapeOptions.AllowedDomains = []string{u.Host()}
+	if u, err := url.Parse(s.ScrapeOptions.URL); err == nil {
+		s.ScrapeOptions.AllowDomains = append(s.ScrapeOptions.AllowDomains, u.Host())
+	}
+
+	for _, pat := range s.ScrapeOptions.AllowURLs {
+		re, err := regexp.Compile(pat)
+		if err != nil {
+			continue
 		}
+		s.allowURLsRE = append(s.allowURLsRE, re)
 	}
 }
@@ -116,7 +128,8 @@ func (s *Scraper) worker() {
 				continue
 			}
 
-			if !s.isURLAllowed(l) {
+			allowed := s.isDomainAllowed(l) && s.isURLAllowed(l)
+			if !allowed {
 				continue
 			}
@@ -157,7 +170,7 @@ func (s *Scraper) enqueueJob(url string, depth int) {
 	}
 }
 
-func (s *Scraper) isURLAllowed(rawurl string) bool {
+func (s *Scraper) isDomainAllowed(rawurl string) bool {
 	u, err := url.Parse(rawurl)
 	if err != nil {
 		return false
 	}
 
 	host := u.Host()
 	ok := false
 
-	for _, domain := range s.ScrapeOptions.AllowedDomains {
+	for _, domain := range s.ScrapeOptions.AllowDomains {
 		if domain == "*" || host == domain {
 			ok = true
 			break
 		}
 	}
 
-	for _, domain := range s.ScrapeOptions.BlockedDomains {
+	for _, domain := range s.ScrapeOptions.DenyDomains {
 		if host == domain {
 			ok = false
 			break
 		}
@@ -183,6 +196,22 @@
 	return ok
 }
 
+func (s *Scraper) isURLAllowed(rawurl string) bool {
+	if len(s.allowURLsRE) == 0 {
+		return true
+	}
+
+	ok := false
+
+	for _, re := range s.allowURLsRE {
+		if re.MatchString(rawurl) {
+			ok = true
+		}
+	}
+
+	return ok
+}
+
 func (s *Scraper) waitClose() {
 	s.wg.Wait()
 	close(s.jobs)
diff --git a/scrape_test.go b/scrape_test.go
index 602be9f..acfbbbf 100644
--- a/scrape_test.go
+++ b/scrape_test.go
@@ -5,46 +5,234 @@ package flyscrape_test
 
 import (
-	"sort"
+	"net/http"
+	"net/http/httptest"
 	"testing"
+	"time"
 
 	"flyscrape"
 
 	"github.com/stretchr/testify/require"
 )
 
-func TestScrape(t *testing.T) {
-	svc := flyscrape.Scraper{
+func TestScrapeFollowLinks(t *testing.T) {
+	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:            "http://example.com/foo/bar",
-			Depth:          1,
-			AllowedDomains: []string{"example.com", "www.google.com"},
+			URL:          "http://www.example.com/foo/bar",
+			Depth:        1,
+			AllowDomains: []string{"www.google.com"},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
-			return map[string]any{
-				"url": params.URL,
-			}, nil
+			return "foobar", nil
 		},
 		FetchFunc: func(url string) (string, error) {
-			return `<html>
-			<body>
-				<a href="/baz">Baz</a>
+			return `<a href="/baz">Baz</a>
 				<a href="baz">Baz</a>
-				<a href="http://www.google.com">Google</a>
-			</body>
-			</html>`, nil
+				<a href="http://www.google.com">Google</a>`, nil
 		},
 	}
 
-	var urls []string
-	for res := range svc.Scrape() {
-		urls = append(urls, res.URL)
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
 	}
 
-	sort.Strings(urls)
 	require.Len(t, urls, 4)
-	require.Equal(t, "http://example.com/baz", urls[0])
-	require.Equal(t, "http://example.com/foo/bar", urls[1])
-	require.Equal(t, "http://example.com/foo/baz", urls[2])
-	require.Equal(t, "http://www.google.com/", urls[3])
+	require.Contains(t, urls, "http://www.example.com/baz")
+	require.Contains(t, urls, "http://www.example.com/foo/bar")
+	require.Contains(t, urls, "http://www.example.com/foo/baz")
+	require.Contains(t, urls, "http://www.google.com/")
+}
+
+func TestScrapeDepth(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:          "http://www.example.com/",
+			Depth:        2,
+			AllowDomains: []string{"*"},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			switch url {
+			case "http://www.example.com/":
+				return `<a href="http://www.google.com">Google</a>`, nil
+			case "http://www.google.com/":
+				return `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+			case "http://www.duckduckgo.com/":
+				return `<a href="http://www.example.com">Example</a>`, nil
+			}
+			return "", nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.google.com/")
+	require.Contains(t, urls, "http://www.duckduckgo.com/")
+}
+
+func TestScrapeAllowDomains(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:          "http://www.example.com/",
+			Depth:        1,
+			AllowDomains: []string{"www.google.com"},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="http://www.google.com">Google</a>
+				<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 2)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.google.com/")
+}
+
+func TestScrapeAllowDomainsAll(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:          "http://www.example.com/",
+			Depth:        1,
+			AllowDomains: []string{"*"},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="http://www.google.com">Google</a>
+				<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.duckduckgo.com/")
+	require.Contains(t, urls, "http://www.google.com/")
+}
+
+func TestScrapeDenyDomains(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:          "http://www.example.com/",
+			Depth:        1,
+			AllowDomains: []string{"*"},
+			DenyDomains:  []string{"www.google.com"},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="http://www.google.com">Google</a>
+				<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 2)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.duckduckgo.com/")
+}
+
+func TestScrapeAllowURLs(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:       "http://www.example.com/",
+			Depth:     1,
+			AllowURLs: []string{`/foo\?id=\d+`, `/bar$`},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="foo?id=123">123</a>
+				<a href="foo?id=ABC">ABC</a>
+				<a href="/bar">bar</a>
+				<a href="/barz">barz</a>`, nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.example.com/foo?id=123")
+	require.Contains(t, urls, "http://www.example.com/bar")
+}
+
+func TestScrapeRate(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:   "http://www.example.com/",
+			Depth: 1,
+			Rate:  100, // every 10ms
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="foo">foo</a>`, nil
+		},
+	}
+
+	res := scr.Scrape()
+
+	start := time.Now()
+	<-res
+	first := time.Now().Add(-10 * time.Millisecond)
+	<-res
+	second := time.Now().Add(-20 * time.Millisecond)
+
+	require.Less(t, first.Sub(start), 2*time.Millisecond)
+	require.Less(t, second.Sub(start), 2*time.Millisecond)
+}
+
+func TestScrapeProxy(t *testing.T) {
+	proxyCalled := false
+	proxy := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		proxyCalled = true
+		w.Write([]byte(`<a href="http://www.google.com">Google</a>`))
+	}))
+
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:   "http://www.example.com/",
+			Proxy: proxy.URL,
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+	}
+
+	res := <-scr.Scrape()
+
+	require.True(t, proxyCalled)
+	require.Equal(t, "http://www.example.com/", res.URL)
+}