-rw-r--r--  cmd/flyscrape/run.go      4
-rw-r--r--  cmd/flyscrape/watch.go   10
-rw-r--r--  fetch.go                 39
-rw-r--r--  fetch_test.go            56
-rw-r--r--  js/template.js            4
-rw-r--r--  js_test.go               36
-rw-r--r--  scrape.go                63
-rw-r--r--  scrape_test.go          234
8 files changed, 389 insertions(+), 57 deletions(-)
diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go
index cf8f8cf..8e83ca8 100644
--- a/cmd/flyscrape/run.go
+++ b/cmd/flyscrape/run.go
@@ -19,6 +19,7 @@ type RunCommand struct{}
func (c *RunCommand) Run(args []string) error {
fs := flag.NewFlagSet("flyscrape-run", flag.ContinueOnError)
noPrettyPrint := fs.Bool("no-pretty-print", false, "no-pretty-print")
+ proxy := fs.String("proxy", "", "proxy")
fs.Usage = c.Usage
if err := fs.Parse(args); err != nil {
@@ -44,6 +45,9 @@ func (c *RunCommand) Run(args []string) error {
ScrapeOptions: opts,
ScrapeFunc: scrape,
}
+ if *proxy != "" {
+ svc.FetchFunc = flyscrape.ProxiedFetch(*proxy)
+ }
count := 0
start := time.Now()
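
The run command gains a -proxy flag that, when set, swaps the scraper's fetcher for a proxied one. A hypothetical invocation; the subcommand form, script name and proxy address are assumptions, not taken from this diff:

    flyscrape run -proxy http://localhost:8080 hackernews.js
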
diff --git a/cmd/flyscrape/watch.go b/cmd/flyscrape/watch.go
index 02a3b45..777ae8a 100644
--- a/cmd/flyscrape/watch.go
+++ b/cmd/flyscrape/watch.go
@@ -18,6 +18,7 @@ type WatchCommand struct{}
func (c *WatchCommand) Run(args []string) error {
fs := flag.NewFlagSet("flyscrape-watch", flag.ContinueOnError)
+ proxy := fs.String("proxy", "", "proxy")
fs.Usage = c.Usage
if err := fs.Parse(args); err != nil {
@@ -28,7 +29,14 @@ func (c *WatchCommand) Run(args []string) error {
return fmt.Errorf("too many arguments")
}
- fetch := flyscrape.CachedFetch()
+ var fetch flyscrape.FetchFunc
+ if *proxy != "" {
+ fetch = flyscrape.ProxiedFetch(*proxy)
+ } else {
+ fetch = flyscrape.Fetch()
+ }
+
+ fetch = flyscrape.CachedFetch(fetch)
script := fs.Arg(0)
err := flyscrape.Watch(script, func(s string) error {
diff --git a/fetch.go b/fetch.go
index 8303a76..f9d49d7 100644
--- a/fetch.go
+++ b/fetch.go
@@ -5,21 +5,29 @@
package flyscrape
import (
+ "crypto/tls"
"io"
"net/http"
+ "net/url"
"github.com/cornelk/hashmap"
)
-func CachedFetch() FetchFunc {
- cache := hashmap.New[string, string]()
+func ProxiedFetch(proxyURL string) FetchFunc {
+ pu, err := url.Parse(proxyURL)
+ if err != nil {
+ panic("invalid proxy url")
+ }
- return func(url string) (string, error) {
- if html, ok := cache.Get(url); ok {
- return html, nil
- }
+ client := http.Client{
+ Transport: &http.Transport{
+ Proxy: http.ProxyURL(pu),
+ TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
+ },
+ }
- resp, err := http.Get(url)
+ return func(url string) (string, error) {
+ resp, err := client.Get(url)
if err != nil {
return "", err
}
@@ -31,6 +39,23 @@ func CachedFetch() FetchFunc {
}
html := string(body)
+ return html, nil
+ }
+}
+
+func CachedFetch(fetch FetchFunc) FetchFunc {
+ cache := hashmap.New[string, string]()
+
+ return func(url string) (string, error) {
+ if html, ok := cache.Get(url); ok {
+ return html, nil
+ }
+
+ html, err := fetch(url)
+ if err != nil {
+ return "", err
+ }
+
cache.Set(url, html)
return html, nil
}
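
fetch.go now models fetching as composable FetchFunc decorators: ProxiedFetch returns a fetcher that routes requests through the given proxy (and panics on an unparsable proxy URL), while CachedFetch wraps any fetcher with an in-memory cache instead of owning its own HTTP client. A minimal sketch of composing the two by hand, mirroring what watch.go does; the proxy address and target URL are placeholders, and the module is assumed importable as "flyscrape" as in the tests below:

    package main

    import (
        "fmt"

        "flyscrape"
    )

    func main() {
        // A proxied fetcher wrapped in a cache: repeated fetches of the
        // same URL go through the network (and the proxy) only once.
        fetch := flyscrape.CachedFetch(flyscrape.ProxiedFetch("http://localhost:8080"))

        html, err := fetch("https://news.ycombinator.com/")
        if err != nil {
            panic(err)
        }
        fmt.Println(len(html))
    }

Because every decorator is itself a FetchFunc, further concerns such as retries or logging can be layered the same way without touching the Scraper.
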
diff --git a/fetch_test.go b/fetch_test.go
new file mode 100644
index 0000000..5ee0222
--- /dev/null
+++ b/fetch_test.go
@@ -0,0 +1,56 @@
+package flyscrape_test
+
+import (
+ "net/http"
+ "net/http/httptest"
+ "testing"
+
+ "flyscrape"
+
+ "github.com/stretchr/testify/require"
+)
+
+func TestFetchFetch(t *testing.T) {
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.Write([]byte("foobar"))
+ }))
+
+ fetch := flyscrape.Fetch()
+
+ html, err := fetch(srv.URL)
+ require.NoError(t, err)
+ require.Equal(t, html, "foobar")
+}
+
+func TestFetchCachedFetch(t *testing.T) {
+ numcalled := 0
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ numcalled++
+ w.Write([]byte("foobar"))
+ }))
+
+ fetch := flyscrape.CachedFetch(flyscrape.Fetch())
+
+ html, err := fetch(srv.URL)
+ require.NoError(t, err)
+ require.Equal(t, html, "foobar")
+
+ html, err = fetch(srv.URL)
+ require.NoError(t, err)
+ require.Equal(t, html, "foobar")
+
+ require.Equal(t, 1, numcalled)
+}
+
+func TestFetchProxiedFetch(t *testing.T) {
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ require.Equal(t, r.URL.String(), "http://example.com/foo")
+ w.Write([]byte("foobar"))
+ }))
+
+ fetch := flyscrape.ProxiedFetch(srv.URL)
+
+ html, err := fetch("http://example.com/foo")
+ require.NoError(t, err)
+ require.Equal(t, html, "foobar")
+}
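
Note that TestFetchProxiedFetch needs no separate upstream server: an HTTP client configured with a proxy sends the absolute request URI to it, so the httptest server standing in for the proxy sees "http://example.com/foo" in full and can both assert on it and serve the response itself.
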
diff --git a/js/template.js b/js/template.js
index 56fffa0..ac78b47 100644
--- a/js/template.js
+++ b/js/template.js
@@ -3,8 +3,8 @@ import { parse } from 'flyscrape';
export const options = {
url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from.
depth: 1, // Specify how deep links should be followed. (default = 0, no follow)
- allowedDomains: ['news.ycombinator.com'], // Specify the allowed domains. (default = domain from url)
- blockedDomains: [], // Specify the blocked domains. (default = none)
+ allowDomains: [], // Specify the allowed domains. * for all. (default = domain from url)
+ denyDomains: [], // Specify the denied domains. (default = none)
rate: 100, // Specify the rate in requests per second. (default = 100)
}
diff --git a/js_test.go b/js_test.go
index 7cde15d..d8ab305 100644
--- a/js_test.go
+++ b/js_test.go
@@ -16,8 +16,8 @@ var html = `
<html>
<body>
<main>
- <h1>Plugins</h1>
- <p>The plugin API allows you to inject code into various parts of the build process.</p>
+ <h1>headline</h1>
+ <p>paragraph</p>
</main>
</body>
</html>`
@@ -34,22 +34,44 @@ export default function({ html, url }) {
return {
headline: $("h1").text(),
- body: $("p").text()
+ body: $("p").text(),
+ url: url,
}
}
`
-func TestV8(t *testing.T) {
+func TestJSScrape(t *testing.T) {
opts, run, err := flyscrape.Compile(script)
require.NoError(t, err)
require.NotNil(t, opts)
require.NotNil(t, run)
- extract, err := run(flyscrape.ScrapeParams{
+ result, err := run(flyscrape.ScrapeParams{
HTML: html,
+ URL: "http://localhost/",
})
require.NoError(t, err)
- require.Equal(t, "Plugins", extract.(map[string]any)["headline"])
- require.Equal(t, "The plugin API allows you to inject code into various parts of the build process.", extract.(map[string]any)["body"])
+
+ m, ok := result.(map[string]any)
+ require.True(t, ok)
+ require.Equal(t, "headline", m["headline"])
+ require.Equal(t, "paragraph", m["body"])
+ require.Equal(t, "http://localhost/", m["url"])
+}
+
+func TestJSCompileError(t *testing.T) {
+ opts, run, err := flyscrape.Compile("import foo;")
+ require.Error(t, err)
+ require.Empty(t, opts)
+ require.Nil(t, run)
+
+ var terr flyscrape.TransformError
+ require.ErrorAs(t, err, &terr)
+
+ require.Equal(t, terr, flyscrape.TransformError{
+ Line: 1,
+ Column: 10,
+ Text: `Expected "from" but found ";"`,
+ })
}
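
Compile now reports script syntax problems as a typed TransformError carrying line, column and message, which callers can unwrap with errors.As instead of parsing strings. A minimal sketch of consuming it; the error-handling style is an assumption, only the TransformError fields come from the test above:

    package main

    import (
        "errors"
        "fmt"

        "flyscrape"
    )

    func main() {
        _, _, err := flyscrape.Compile("import foo;")
        if err != nil {
            var terr flyscrape.TransformError
            if errors.As(err, &terr) {
                // e.g. script:1:10: Expected "from" but found ";"
                fmt.Printf("script:%d:%d: %s\n", terr.Line, terr.Column, terr.Text)
            }
        }
    }
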
diff --git a/scrape.go b/scrape.go
index f245137..1f9ad97 100644
--- a/scrape.go
+++ b/scrape.go
@@ -6,6 +6,7 @@ package flyscrape
import (
"log"
+ "regexp"
"strings"
"sync"
"time"
@@ -21,11 +22,13 @@ type ScrapeParams struct {
}
type ScrapeOptions struct {
- URL string `json:"url"`
- AllowedDomains []string `json:"allowedDomains"`
- BlockedDomains []string `json:"blockedDomains"`
- Depth int `json:"depth"`
- Rate float64 `json:"rate"`
+ URL string `json:"url"`
+ AllowDomains []string `json:"allowDomains"`
+ DenyDomains []string `json:"denyDomains"`
+ AllowURLs []string `json:"allowURLs"`
+ Proxy string `json:"proxy"`
+ Depth int `json:"depth"`
+ Rate float64 `json:"rate"`
}
type ScrapeResult struct {
@@ -54,10 +57,11 @@ type Scraper struct {
ScrapeFunc ScrapeFunc
FetchFunc FetchFunc
- visited *hashmap.Map[string, struct{}]
- wg *sync.WaitGroup
- jobs chan target
- results chan ScrapeResult
+ visited *hashmap.Map[string, struct{}]
+ wg *sync.WaitGroup
+ jobs chan target
+ results chan ScrapeResult
+ allowURLsRE []*regexp.Regexp
}
func (s *Scraper) init() {
@@ -69,16 +73,24 @@ func (s *Scraper) init() {
if s.FetchFunc == nil {
s.FetchFunc = Fetch()
}
+ if s.ScrapeOptions.Proxy != "" {
+ s.FetchFunc = ProxiedFetch(s.ScrapeOptions.Proxy)
+ }
if s.ScrapeOptions.Rate == 0 {
s.ScrapeOptions.Rate = 100
}
- if len(s.ScrapeOptions.AllowedDomains) == 0 {
- u, err := url.Parse(s.ScrapeOptions.URL)
- if err == nil {
- s.ScrapeOptions.AllowedDomains = []string{u.Host()}
+ if u, err := url.Parse(s.ScrapeOptions.URL); err == nil {
+ s.ScrapeOptions.AllowDomains = append(s.ScrapeOptions.AllowDomains, u.Host())
+ }
+
+ for _, pat := range s.ScrapeOptions.AllowURLs {
+ re, err := regexp.Compile(pat)
+ if err != nil {
+ continue
}
+ s.allowURLsRE = append(s.allowURLsRE, re)
}
}
@@ -116,7 +128,8 @@ func (s *Scraper) worker() {
continue
}
- if !s.isURLAllowed(l) {
+ allowed := s.isDomainAllowed(l) && s.isURLAllowed(l)
+ if !allowed {
continue
}
@@ -157,7 +170,7 @@ func (s *Scraper) enqueueJob(url string, depth int) {
}
}
-func (s *Scraper) isURLAllowed(rawurl string) bool {
+func (s *Scraper) isDomainAllowed(rawurl string) bool {
u, err := url.Parse(rawurl)
if err != nil {
return false
@@ -166,14 +179,14 @@ func (s *Scraper) isURLAllowed(rawurl string) bool {
host := u.Host()
ok := false
- for _, domain := range s.ScrapeOptions.AllowedDomains {
+ for _, domain := range s.ScrapeOptions.AllowDomains {
if domain == "*" || host == domain {
ok = true
break
}
}
- for _, domain := range s.ScrapeOptions.BlockedDomains {
+ for _, domain := range s.ScrapeOptions.DenyDomains {
if host == domain {
ok = false
break
@@ -183,6 +196,22 @@ func (s *Scraper) isURLAllowed(rawurl string) bool {
return ok
}
+func (s *Scraper) isURLAllowed(rawurl string) bool {
+ if len(s.allowURLsRE) == 0 {
+ return true
+ }
+
+ ok := false
+
+ for _, re := range s.allowURLsRE {
+ if re.MatchString(rawurl) {
+ ok = true
+ }
+ }
+
+ return ok
+}
+
func (s *Scraper) waitClose() {
s.wg.Wait()
close(s.jobs)
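
Link filtering in scrape.go is now split into two checks: isDomainAllowed applies the allow list ("*" matches any host, and the start URL's host is always appended to it) minus the deny list, while isURLAllowed requires at least one allowURLs pattern to match and defaults to allow-all when the list is empty. A minimal sketch of driving the new options with an in-process FetchFunc stub, in the style of the tests below; the deny domain and URL pattern are placeholders:

    package main

    import (
        "fmt"

        "flyscrape"
    )

    func main() {
        scr := flyscrape.Scraper{
            ScrapeOptions: flyscrape.ScrapeOptions{
                URL:          "http://www.example.com/",
                Depth:        1,
                AllowDomains: []string{"*"},
                DenyDomains:  []string{"ads.example.com"},
                AllowURLs:    []string{`/articles/\d+$`},
            },
            ScrapeFunc: func(p flyscrape.ScrapeParams) (any, error) {
                return p.URL, nil
            },
            FetchFunc: func(url string) (string, error) {
                return `<a href="/articles/1">one</a> <a href="/about">about</a>`, nil
            },
        }

        // The start URL is always scraped; of the two links, only
        // /articles/1 passes the allowURLs filter and gets followed.
        for res := range scr.Scrape() {
            fmt.Println(res.URL)
        }
    }

Note that patterns failing to compile are silently skipped during init, so a typo in allowURLs loosens the filter rather than raising an error.
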
diff --git a/scrape_test.go b/scrape_test.go
index 602be9f..acfbbbf 100644
--- a/scrape_test.go
+++ b/scrape_test.go
@@ -5,46 +5,234 @@
package flyscrape_test
import (
- "sort"
+ "net/http"
+ "net/http/httptest"
"testing"
+ "time"
"flyscrape"
"github.com/stretchr/testify/require"
)
-func TestScrape(t *testing.T) {
- svc := flyscrape.Scraper{
+func TestScrapeFollowLinks(t *testing.T) {
+ scr := flyscrape.Scraper{
ScrapeOptions: flyscrape.ScrapeOptions{
- URL: "http://example.com/foo/bar",
- Depth: 1,
- AllowedDomains: []string{"example.com", "www.google.com"},
+ URL: "http://www.example.com/foo/bar",
+ Depth: 1,
+ AllowDomains: []string{"www.google.com"},
},
ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
- return map[string]any{
- "url": params.URL,
- }, nil
+ return "foobar", nil
},
FetchFunc: func(url string) (string, error) {
- return `<html>
- <body>
- <a href="/baz">Baz</a>
+ return `<a href="/baz">Baz</a>
<a href="baz">Baz</a>
- <a href="http://www.google.com">Google</a>
- </body>
- </html>`, nil
+ <a href="http://www.google.com">Google</a>`, nil
},
}
- var urls []string
- for res := range svc.Scrape() {
- urls = append(urls, res.URL)
+ urls := make(map[string]struct{})
+ for res := range scr.Scrape() {
+ urls[res.URL] = struct{}{}
}
- sort.Strings(urls)
require.Len(t, urls, 4)
- require.Equal(t, "http://example.com/baz", urls[0])
- require.Equal(t, "http://example.com/foo/bar", urls[1])
- require.Equal(t, "http://example.com/foo/baz", urls[2])
- require.Equal(t, "http://www.google.com/", urls[3])
+ require.Contains(t, urls, "http://www.example.com/baz")
+ require.Contains(t, urls, "http://www.example.com/foo/bar")
+ require.Contains(t, urls, "http://www.example.com/foo/baz")
+ require.Contains(t, urls, "http://www.google.com/")
+}
+
+func TestScrapeDepth(t *testing.T) {
+ scr := flyscrape.Scraper{
+ ScrapeOptions: flyscrape.ScrapeOptions{
+ URL: "http://www.example.com/",
+ Depth: 2,
+ AllowDomains: []string{"*"},
+ },
+ ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+ return "foobar", nil
+ },
+ FetchFunc: func(url string) (string, error) {
+ switch url {
+ case "http://www.example.com/":
+ return `<a href="http://www.google.com">Google</a>`, nil
+ case "http://www.google.com/":
+ return `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+ case "http://www.duckduckgo.com/":
+ return `<a href="http://www.example.com">Example</a>`, nil
+ }
+ return "", nil
+ },
+ }
+
+ urls := make(map[string]struct{})
+ for res := range scr.Scrape() {
+ urls[res.URL] = struct{}{}
+ }
+
+ require.Len(t, urls, 3)
+ require.Contains(t, urls, "http://www.example.com/")
+ require.Contains(t, urls, "http://www.google.com/")
+ require.Contains(t, urls, "http://www.duckduckgo.com/")
+}
+
+func TestScrapeAllowDomains(t *testing.T) {
+ scr := flyscrape.Scraper{
+ ScrapeOptions: flyscrape.ScrapeOptions{
+ URL: "http://www.example.com/",
+ Depth: 1,
+ AllowDomains: []string{"www.google.com"},
+ },
+ ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+ return "foobar", nil
+ },
+ FetchFunc: func(url string) (string, error) {
+ return `<a href="http://www.google.com">Google</a>
+ <a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+ },
+ }
+
+ urls := make(map[string]struct{})
+ for res := range scr.Scrape() {
+ urls[res.URL] = struct{}{}
+ }
+
+ require.Len(t, urls, 2)
+ require.Contains(t, urls, "http://www.example.com/")
+ require.Contains(t, urls, "http://www.google.com/")
+}
+
+func TestScrapeAllowDomainsAll(t *testing.T) {
+ scr := flyscrape.Scraper{
+ ScrapeOptions: flyscrape.ScrapeOptions{
+ URL: "http://www.example.com/",
+ Depth: 1,
+ AllowDomains: []string{"*"},
+ },
+ ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+ return "foobar", nil
+ },
+ FetchFunc: func(url string) (string, error) {
+ return `<a href="http://www.google.com">Google</a>
+ <a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+ },
+ }
+
+ urls := make(map[string]struct{})
+ for res := range scr.Scrape() {
+ urls[res.URL] = struct{}{}
+ }
+
+ require.Len(t, urls, 3)
+ require.Contains(t, urls, "http://www.example.com/")
+ require.Contains(t, urls, "http://www.duckduckgo.com/")
+ require.Contains(t, urls, "http://www.google.com/")
+}
+
+func TestScrapeDenyDomains(t *testing.T) {
+ scr := flyscrape.Scraper{
+ ScrapeOptions: flyscrape.ScrapeOptions{
+ URL: "http://www.example.com/",
+ Depth: 1,
+ AllowDomains: []string{"*"},
+ DenyDomains: []string{"www.google.com"},
+ },
+ ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+ return "foobar", nil
+ },
+ FetchFunc: func(url string) (string, error) {
+ return `<a href="http://www.google.com">Google</a>
+ <a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+ },
+ }
+
+ urls := make(map[string]struct{})
+ for res := range scr.Scrape() {
+ urls[res.URL] = struct{}{}
+ }
+
+ require.Len(t, urls, 2)
+ require.Contains(t, urls, "http://www.example.com/")
+ require.Contains(t, urls, "http://www.duckduckgo.com/")
+}
+
+func TestScrapeAllowURLs(t *testing.T) {
+ scr := flyscrape.Scraper{
+ ScrapeOptions: flyscrape.ScrapeOptions{
+ URL: "http://www.example.com/",
+ Depth: 1,
+ AllowURLs: []string{`/foo\?id=\d+`, `/bar$`},
+ },
+ ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+ return "foobar", nil
+ },
+ FetchFunc: func(url string) (string, error) {
+ return `<a href="foo?id=123">123</a>
+ <a href="foo?id=ABC">ABC</a>
+ <a href="/bar">bar</a>
+ <a href="/barz">barz</a>`, nil
+ },
+ }
+
+ urls := make(map[string]struct{})
+ for res := range scr.Scrape() {
+ urls[res.URL] = struct{}{}
+ }
+
+ require.Len(t, urls, 3)
+ require.Contains(t, urls, "http://www.example.com/")
+ require.Contains(t, urls, "http://www.example.com/foo?id=123")
+ require.Contains(t, urls, "http://www.example.com/bar")
+}
+
+func TestScrapeRate(t *testing.T) {
+ scr := flyscrape.Scraper{
+ ScrapeOptions: flyscrape.ScrapeOptions{
+ URL: "http://www.example.com/",
+ Depth: 1,
+ Rate: 100, // every 10ms
+ },
+ ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+ return "foobar", nil
+ },
+ FetchFunc: func(url string) (string, error) {
+ return `<a href="foo">foo</a>`, nil
+ },
+ }
+
+ res := scr.Scrape()
+
+ start := time.Now()
+ <-res
+ first := time.Now().Add(-10 * time.Millisecond)
+ <-res
+ second := time.Now().Add(-20 * time.Millisecond)
+
+ require.Less(t, first.Sub(start), 2*time.Millisecond)
+ require.Less(t, second.Sub(start), 2*time.Millisecond)
+}
+
+func TestScrapeProxy(t *testing.T) {
+ proxyCalled := false
+ proxy := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ proxyCalled = true
+ w.Write([]byte(`<a href="http://www.google.com">Google</a>`))
+ }))
+
+ scr := flyscrape.Scraper{
+ ScrapeOptions: flyscrape.ScrapeOptions{
+ URL: "http://www.example.com/",
+ Proxy: proxy.URL,
+ },
+ ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+ return "foobar", nil
+ },
+ }
+
+ res := <-scr.Scrape()
+
+ require.True(t, proxyCalled)
+ require.Equal(t, "http://www.example.com/", res.URL)
}