| field | value | date |
|---|---|---|
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-27 19:10:49 +0200 |
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-27 19:10:49 +0200 |
| commit | 5c16435e2218344a6e232ebb48cf022a32ba85d5 | |
| tree | 3cfa1dbc1f489ba4509fc408a8c0afccca7f9c7c | |
| parent | 52107c13b4c2c4efa9269b187916f3195be5a10d | |
add tests and allow urls
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | cmd/flyscrape/run.go | 4 |
| -rw-r--r-- | cmd/flyscrape/watch.go | 10 |
| -rw-r--r-- | fetch.go | 39 |
| -rw-r--r-- | fetch_test.go | 56 |
| -rw-r--r-- | js/template.js | 4 |
| -rw-r--r-- | js_test.go | 36 |
| -rw-r--r-- | scrape.go | 63 |
| -rw-r--r-- | scrape_test.go | 234 |
8 files changed, 389 insertions, 57 deletions
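
In short, this commit turns `CachedFetch` into a decorator over any `FetchFunc` and adds `ProxiedFetch` as an alternative base fetcher (exposed as a `-proxy` flag on the `run` and `watch` commands), alongside the renamed `allowDomains`/`denyDomains` options and the new `allowURLs` and `proxy` options. As a minimal sketch of how the fetchers compose after this change, using only names that appear in the diff below (the `main` wrapper and the proxy address are illustrative, assuming the package is imported as `flyscrape` like in the tests):

```go
package main

import (
	"fmt"

	"flyscrape"
)

func main() {
	// Pick a base fetcher: plain HTTP, or HTTP routed through a proxy.
	var fetch flyscrape.FetchFunc = flyscrape.Fetch()
	// fetch = flyscrape.ProxiedFetch("http://localhost:8080") // placeholder proxy URL

	// CachedFetch now wraps an existing FetchFunc instead of fetching on its
	// own, so caching stacks on top of whichever base fetcher was chosen.
	fetch = flyscrape.CachedFetch(fetch)

	html, err := fetch("http://www.example.com/")
	if err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	fmt.Println(len(html), "bytes fetched")
}
```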
diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go
index cf8f8cf..8e83ca8 100644
--- a/cmd/flyscrape/run.go
+++ b/cmd/flyscrape/run.go
@@ -19,6 +19,7 @@ type RunCommand struct{}
 func (c *RunCommand) Run(args []string) error {
 	fs := flag.NewFlagSet("flyscrape-run", flag.ContinueOnError)
 	noPrettyPrint := fs.Bool("no-pretty-print", false, "no-pretty-print")
+	proxy := fs.String("proxy", "", "proxy")
 	fs.Usage = c.Usage
 
 	if err := fs.Parse(args); err != nil {
@@ -44,6 +45,9 @@ func (c *RunCommand) Run(args []string) error {
 		ScrapeOptions: opts,
 		ScrapeFunc:    scrape,
 	}
+	if *proxy != "" {
+		svc.FetchFunc = flyscrape.ProxiedFetch(*proxy)
+	}
 
 	count := 0
 	start := time.Now()
diff --git a/cmd/flyscrape/watch.go b/cmd/flyscrape/watch.go
index 02a3b45..777ae8a 100644
--- a/cmd/flyscrape/watch.go
+++ b/cmd/flyscrape/watch.go
@@ -18,6 +18,7 @@ type WatchCommand struct{}
 func (c *WatchCommand) Run(args []string) error {
 	fs := flag.NewFlagSet("flyscrape-watch", flag.ContinueOnError)
+	proxy := fs.String("proxy", "", "proxy")
 	fs.Usage = c.Usage
 
 	if err := fs.Parse(args); err != nil {
@@ -28,7 +29,14 @@ func (c *WatchCommand) Run(args []string) error {
 		return fmt.Errorf("too many arguments")
 	}
 
-	fetch := flyscrape.CachedFetch()
+	var fetch flyscrape.FetchFunc
+	if *proxy != "" {
+		fetch = flyscrape.ProxiedFetch(*proxy)
+	} else {
+		fetch = flyscrape.Fetch()
+	}
+
+	fetch = flyscrape.CachedFetch(fetch)
 
 	script := fs.Arg(0)
 	err := flyscrape.Watch(script, func(s string) error {
diff --git a/fetch.go b/fetch.go
--- a/fetch.go
+++ b/fetch.go
@@ -5,21 +5,29 @@ package flyscrape
 
 import (
+	"crypto/tls"
 	"io"
 	"net/http"
+	"net/url"
 
 	"github.com/cornelk/hashmap"
 )
 
-func CachedFetch() FetchFunc {
-	cache := hashmap.New[string, string]()
+func ProxiedFetch(proxyURL string) FetchFunc {
+	pu, err := url.Parse(proxyURL)
+	if err != nil {
+		panic("invalid proxy url")
+	}
 
-	return func(url string) (string, error) {
-		if html, ok := cache.Get(url); ok {
-			return html, nil
-		}
+	client := http.Client{
+		Transport: &http.Transport{
+			Proxy:           http.ProxyURL(pu),
+			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
+		},
+	}
 
-		resp, err := http.Get(url)
+	return func(url string) (string, error) {
+		resp, err := client.Get(url)
 		if err != nil {
 			return "", err
 		}
@@ -31,6 +39,23 @@ func CachedFetch() FetchFunc {
 		}
 
 		html := string(body)
+		return html, nil
+	}
+}
+
+func CachedFetch(fetch FetchFunc) FetchFunc {
+	cache := hashmap.New[string, string]()
+
+	return func(url string) (string, error) {
+		if html, ok := cache.Get(url); ok {
+			return html, nil
+		}
+
+		html, err := fetch(url)
+		if err != nil {
+			return "", err
+		}
+
+		cache.Set(url, html)
 		return html, nil
 	}
diff --git a/fetch_test.go b/fetch_test.go
new file mode 100644
index 0000000..5ee0222
--- /dev/null
+++ b/fetch_test.go
@@ -0,0 +1,56 @@
+package flyscrape_test
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"flyscrape"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestFetchFetch(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Write([]byte("foobar"))
+	}))
+
+	fetch := flyscrape.Fetch()
+
+	html, err := fetch(srv.URL)
+	require.NoError(t, err)
+	require.Equal(t, html, "foobar")
+}
+
+func TestFetchCachedFetch(t *testing.T) {
+	numcalled := 0
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		numcalled++
+		w.Write([]byte("foobar"))
+	}))
+
+	fetch := flyscrape.CachedFetch(flyscrape.Fetch())
+
+	html, err := fetch(srv.URL)
+	require.NoError(t, err)
+	require.Equal(t, html, "foobar")
+
+	html, err = fetch(srv.URL)
+	require.NoError(t, err)
+	require.Equal(t, html, "foobar")
+
+	require.Equal(t, 1, numcalled)
+}
+
+func TestFetchProxiedFetch(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		require.Equal(t, r.URL.String(), "http://example.com/foo")
+		w.Write([]byte("foobar"))
+	}))
+
+	fetch := flyscrape.ProxiedFetch(srv.URL)
+
+	html, err := fetch("http://example.com/foo")
+	require.NoError(t, err)
+	require.Equal(t, html, "foobar")
+}
diff --git a/js/template.js b/js/template.js
index 56fffa0..ac78b47 100644
--- a/js/template.js
+++ b/js/template.js
@@ -3,8 +3,8 @@ import { parse } from 'flyscrape';
 export const options = {
     url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from.
     depth: 1, // Specify how deep links should be followed. (default = 0, no follow)
-    allowedDomains: ['news.ycombinator.com'], // Specify the allowed domains. (default = domain from url)
-    blockedDomains: [], // Specify the blocked domains. (default = none)
+    allowDomains: [], // Specify the allowed domains. * for all. (default = domain from url)
+    denyDomains: [], // Specify the denied domains. (default = none)
     rate: 100, // Specify the rate in requests per second. (default = 100)
 }
diff --git a/js_test.go b/js_test.go
--- a/js_test.go
+++ b/js_test.go
@@ -16,8 +16,8 @@ var html = `
 <html>
     <body>
         <main>
-            <h1>Plugins</h1>
-            <p>The plugin API allows you to inject code into various parts of the build process.</p>
+            <h1>headline</h1>
+            <p>paragraph</p>
         </main>
     </body>
 </html>`
@@ -34,22 +34,44 @@ export default function({ html, url }) {
 	return {
 		headline: $("h1").text(),
-		body: $("p").text()
+		body: $("p").text(),
+		url: url,
 	}
 }
 `
 
-func TestV8(t *testing.T) {
+func TestJSScrape(t *testing.T) {
 	opts, run, err := flyscrape.Compile(script)
 	require.NoError(t, err)
 	require.NotNil(t, opts)
 	require.NotNil(t, run)
 
-	extract, err := run(flyscrape.ScrapeParams{
+	result, err := run(flyscrape.ScrapeParams{
 		HTML: html,
+		URL:  "http://localhost/",
 	})
 	require.NoError(t, err)
-	require.Equal(t, "Plugins", extract.(map[string]any)["headline"])
-	require.Equal(t, "The plugin API allows you to inject code into various parts of the build process.", extract.(map[string]any)["body"])
+
+	m, ok := result.(map[string]any)
+	require.True(t, ok)
+	require.Equal(t, "headline", m["headline"])
+	require.Equal(t, "paragraph", m["body"])
+	require.Equal(t, "http://localhost/", m["url"])
+}
+
+func TestJSCompileError(t *testing.T) {
+	opts, run, err := flyscrape.Compile("import foo;")
+	require.Error(t, err)
+	require.Empty(t, opts)
+	require.Nil(t, run)
+
+	var terr flyscrape.TransformError
+	require.ErrorAs(t, err, &terr)
+
+	require.Equal(t, terr, flyscrape.TransformError{
+		Line:   1,
+		Column: 10,
+		Text:   `Expected "from" but found ";"`,
+	})
 }
diff --git a/scrape.go b/scrape.go
--- a/scrape.go
+++ b/scrape.go
@@ -6,6 +6,7 @@ package flyscrape
 
 import (
 	"log"
+	"regexp"
 	"strings"
 	"sync"
 	"time"
@@ -21,11 +22,13 @@ type ScrapeParams struct {
 }
 
 type ScrapeOptions struct {
-	URL            string   `json:"url"`
-	AllowedDomains []string `json:"allowedDomains"`
-	BlockedDomains []string `json:"blockedDomains"`
-	Depth          int      `json:"depth"`
-	Rate           float64  `json:"rate"`
+	URL          string   `json:"url"`
+	AllowDomains []string `json:"allowDomains"`
+	DenyDomains  []string `json:"denyDomains"`
+	AllowURLs    []string `json:"allowURLs"`
+	Proxy        string   `json:"proxy"`
+	Depth        int      `json:"depth"`
+	Rate         float64  `json:"rate"`
 }
 
 type ScrapeResult struct {
@@ -54,10 +57,11 @@ type Scraper struct {
 	ScrapeFunc ScrapeFunc
 	FetchFunc  FetchFunc
 
-	visited *hashmap.Map[string, struct{}]
-	wg      *sync.WaitGroup
-	jobs    chan target
-	results chan ScrapeResult
+	visited     *hashmap.Map[string, struct{}]
+	wg          *sync.WaitGroup
+	jobs        chan target
+	results     chan ScrapeResult
+	allowURLsRE []*regexp.Regexp
 }
 
 func (s *Scraper) init() {
@@ -69,16 +73,24 @@ func (s *Scraper) init() {
 	if s.FetchFunc == nil {
 		s.FetchFunc = Fetch()
 	}
+	if s.ScrapeOptions.Proxy != "" {
+		s.FetchFunc = ProxiedFetch(s.ScrapeOptions.Proxy)
+	}
 	if s.ScrapeOptions.Rate == 0 {
 		s.ScrapeOptions.Rate = 100
 	}
 
-	if len(s.ScrapeOptions.AllowedDomains) == 0 {
-		u, err := url.Parse(s.ScrapeOptions.URL)
-		if err == nil {
-			s.ScrapeOptions.AllowedDomains = []string{u.Host()}
+	if u, err := url.Parse(s.ScrapeOptions.URL); err == nil {
+		s.ScrapeOptions.AllowDomains = append(s.ScrapeOptions.AllowDomains, u.Host())
+	}
+
+	for _, pat := range s.ScrapeOptions.AllowURLs {
+		re, err := regexp.Compile(pat)
+		if err != nil {
+			continue
 		}
+		s.allowURLsRE = append(s.allowURLsRE, re)
 	}
 }
@@ -116,7 +128,8 @@ func (s *Scraper) worker() {
 				continue
 			}
 
-			if !s.isURLAllowed(l) {
+			allowed := s.isDomainAllowed(l) && s.isURLAllowed(l)
+			if !allowed {
 				continue
 			}
@@ -157,7 +170,7 @@ func (s *Scraper) enqueueJob(url string, depth int) {
 	}
 }
 
-func (s *Scraper) isURLAllowed(rawurl string) bool {
+func (s *Scraper) isDomainAllowed(rawurl string) bool {
 	u, err := url.Parse(rawurl)
 	if err != nil {
 		return false
 	}
 
 	host := u.Host()
 	ok := false
 
-	for _, domain := range s.ScrapeOptions.AllowedDomains {
+	for _, domain := range s.ScrapeOptions.AllowDomains {
 		if domain == "*" || host == domain {
 			ok = true
 			break
 		}
 	}
 
-	for _, domain := range s.ScrapeOptions.BlockedDomains {
+	for _, domain := range s.ScrapeOptions.DenyDomains {
 		if host == domain {
 			ok = false
 			break
 		}
@@ -183,6 +196,22 @@
 	return ok
 }
 
+func (s *Scraper) isURLAllowed(rawurl string) bool {
+	if len(s.allowURLsRE) == 0 {
+		return true
+	}
+
+	ok := false
+
+	for _, re := range s.allowURLsRE {
+		if re.MatchString(rawurl) {
+			ok = true
+		}
+	}
+
+	return ok
+}
+
 func (s *Scraper) waitClose() {
 	s.wg.Wait()
 	close(s.jobs)
diff --git a/scrape_test.go b/scrape_test.go
index 602be9f..acfbbbf 100644
--- a/scrape_test.go
+++ b/scrape_test.go
@@ -5,46 +5,234 @@ package flyscrape_test
 
 import (
-	"sort"
+	"net/http"
+	"net/http/httptest"
 	"testing"
+	"time"
 
 	"flyscrape"
 
 	"github.com/stretchr/testify/require"
 )
 
-func TestScrape(t *testing.T) {
-	svc := flyscrape.Scraper{
+func TestScrapeFollowLinks(t *testing.T) {
+	scr := flyscrape.Scraper{
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:            "http://example.com/foo/bar",
-			Depth:          1,
-			AllowedDomains: []string{"example.com", "www.google.com"},
+			URL:          "http://www.example.com/foo/bar",
+			Depth:        1,
+			AllowDomains: []string{"www.google.com"},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
-			return map[string]any{
-				"url": params.URL,
-			}, nil
+			return "foobar", nil
 		},
 		FetchFunc: func(url string) (string, error) {
-			return `<html>
-			<body>
-				<a href="/baz">Baz</a>
+			return `<a href="/baz">Baz</a>
 				<a href="baz">Baz</a>
-				<a href="http://www.google.com">Google</a>
-			</body>
-			</html>`, nil
+				<a href="http://www.google.com">Google</a>`, nil
 		},
 	}
 
-	var urls []string
-	for res := range svc.Scrape() {
-		urls = append(urls, res.URL)
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
 	}
 
-	sort.Strings(urls)
 	require.Len(t, urls, 4)
-	require.Equal(t, "http://example.com/baz", urls[0])
-	require.Equal(t, "http://example.com/foo/bar", urls[1])
-	require.Equal(t, "http://example.com/foo/baz", urls[2])
-	require.Equal(t, "http://www.google.com/", urls[3])
+	require.Contains(t, urls, "http://www.example.com/baz")
+	require.Contains(t, urls, "http://www.example.com/foo/bar")
+	require.Contains(t, urls, "http://www.example.com/foo/baz")
+	require.Contains(t, urls, "http://www.google.com/")
+}
+
+func TestScrapeDepth(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:          "http://www.example.com/",
+			Depth:        2,
+			AllowDomains: []string{"*"},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			switch url {
+			case "http://www.example.com/":
+				return `<a href="http://www.google.com">Google</a>`, nil
+			case "http://www.google.com/":
+				return `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+			case "http://www.duckduckgo.com/":
+				return `<a href="http://www.example.com">Example</a>`, nil
+			}
+			return "", nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.google.com/")
+	require.Contains(t, urls, "http://www.duckduckgo.com/")
+}
+
+func TestScrapeAllowDomains(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:          "http://www.example.com/",
+			Depth:        1,
+			AllowDomains: []string{"www.google.com"},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="http://www.google.com">Google</a>
+				<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 2)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.google.com/")
+}
+
+func TestScrapeAllowDomainsAll(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:          "http://www.example.com/",
+			Depth:        1,
+			AllowDomains: []string{"*"},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="http://www.google.com">Google</a>
+				<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.duckduckgo.com/")
+	require.Contains(t, urls, "http://www.google.com/")
+}
+
+func TestScrapeDenyDomains(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:          "http://www.example.com/",
+			Depth:        1,
+			AllowDomains: []string{"*"},
+			DenyDomains:  []string{"www.google.com"},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="http://www.google.com">Google</a>
+				<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 2)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.duckduckgo.com/")
+}
+
+func TestScrapeAllowURLs(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:       "http://www.example.com/",
+			Depth:     1,
+			AllowURLs: []string{`/foo\?id=\d+`, `/bar$`},
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="foo?id=123">123</a>
+				<a href="foo?id=ABC">ABC</a>
+				<a href="/bar">bar</a>
+				<a href="/barz">barz</a>`, nil
+		},
+	}
+
+	urls := make(map[string]struct{})
+	for res := range scr.Scrape() {
+		urls[res.URL] = struct{}{}
+	}
+
+	require.Len(t, urls, 3)
+	require.Contains(t, urls, "http://www.example.com/")
+	require.Contains(t, urls, "http://www.example.com/foo?id=123")
+	require.Contains(t, urls, "http://www.example.com/bar")
+}
+
+func TestScrapeRate(t *testing.T) {
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:   "http://www.example.com/",
+			Depth: 1,
+			Rate:  100, // every 10ms
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+		FetchFunc: func(url string) (string, error) {
+			return `<a href="foo">foo</a>`, nil
+		},
+	}
+
+	res := scr.Scrape()
+
+	start := time.Now()
+	<-res
+	first := time.Now().Add(-10 * time.Millisecond)
+	<-res
+	second := time.Now().Add(-20 * time.Millisecond)
+
+	require.Less(t, first.Sub(start), 2*time.Millisecond)
+	require.Less(t, second.Sub(start), 2*time.Millisecond)
+}
+
+func TestScrapeProxy(t *testing.T) {
+	proxyCalled := false
+	proxy := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		proxyCalled = true
+		w.Write([]byte(`<a href="http://www.google.com">Google</a>`))
+	}))
+
+	scr := flyscrape.Scraper{
+		ScrapeOptions: flyscrape.ScrapeOptions{
+			URL:   "http://www.example.com/",
+			Proxy: proxy.URL,
+		},
+		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
+			return "foobar", nil
+		},
+	}
+
+	res := <-scr.Scrape()
+
+	require.True(t, proxyCalled)
+	require.Equal(t, "http://www.example.com/", res.URL)
+}