 cmd/flyscrape/run.go          | 13
 examples/esbuild.github.io.js |  4
 js.go                         | 13
 scrape.go                     | 32
 scrape_test.go                |  5
 utils.go                      |  6
 6 files changed, 60 insertions(+), 13 deletions(-)
diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go
index 8ec9390..2577c25 100644
--- a/cmd/flyscrape/run.go
+++ b/cmd/flyscrape/run.go
@@ -15,6 +15,7 @@ type RunCommand struct{}
 func (c *RunCommand) Run(args []string) error {
 	fs := flag.NewFlagSet("flyscrape-run", flag.ContinueOnError)
 	concurrent := fs.Int("concurrent", 0, "concurrency")
+	noPrettyPrint := fs.Bool("no-pretty-print", false, "no-pretty-print")
 	fs.Usage = c.Usage
 
 	if err := fs.Parse(args); err != nil {
@@ -45,7 +46,11 @@ func (c *RunCommand) Run(args []string) error {
 	count := 0
 	start := time.Now()
 	for result := range svc.Scrape() {
-		flyscrape.PrettyPrint(result)
+		if *noPrettyPrint {
+			flyscrape.Print(result)
+		} else {
+			flyscrape.PrettyPrint(result)
+		}
 		count++
 	}
 	log.Printf("Scraped %d websites in %v\n", count, time.Since(start))
@@ -66,6 +71,9 @@ Arguments:
     -concurrent NUM
         Determines the number of concurrent requests.
 
+    -no-pretty-print
+        Disables pretty printing of scrape results.
+
 Examples:
 
     # Run the script.
@@ -74,5 +82,8 @@ Examples:
     # Run the script with 10 concurrent requests.
     $ flyscrape run -concurrent 10 example.js
+
+    # Run the script with pretty printing disabled.
+    $ flyscrape run -no-pretty-print example.js
 
 `[1:])
 }
diff --git a/examples/esbuild.github.io.js b/examples/esbuild.github.io.js
index 7a00478..735a61e 100644
--- a/examples/esbuild.github.io.js
+++ b/examples/esbuild.github.io.js
@@ -3,6 +3,10 @@ import { parse } from "flyscrape";
 export const options = {
     url: "https://esbuild.github.io/plugins/",
     depth: 1,
+    allowedDomains: [
+        "esbuild.github.io",
+        "nodejs.org",
+    ],
 }
 
 export default function({ html }) {
diff --git a/js.go b/js.go
--- a/js.go
+++ b/js.go
@@ -58,18 +58,13 @@ func vm(src string) (ScrapeOptions, ScrapeFunc, error) {
 	}
 
 	var opts ScrapeOptions
-
-	url, err := ctx.RunScript("options.url", "main.js")
+	optsJSON, err := ctx.RunScript("JSON.stringify(options)", "main.js")
 	if err != nil {
-		return ScrapeOptions{}, nil, fmt.Errorf("reading options.url: %w", err)
+		return ScrapeOptions{}, nil, fmt.Errorf("reading options: %w", err)
 	}
-	opts.URL = url.String()
-
-	depth, err := ctx.RunScript("options.depth", "main.js")
-	if err != nil {
-		return ScrapeOptions{}, nil, fmt.Errorf("reading options.depth: %w", err)
+	if err := json.Unmarshal([]byte(optsJSON.String()), &opts); err != nil {
+		return ScrapeOptions{}, nil, fmt.Errorf("decoding options json: %w", err)
 	}
-	opts.Depth = int(depth.Integer())
 
 	scrape := func(params ScrapeParams) (any, error) {
 		suffix := randSeq(10)
diff --git a/scrape.go b/scrape.go
--- a/scrape.go
+++ b/scrape.go
@@ -17,7 +17,7 @@ type ScrapeParams struct {
 
 type ScrapeOptions struct {
 	URL            string   `json:"url"`
-	AllowedDomains []string `json:"allowed_domains"`
+	AllowedDomains []string `json:"allowedDomains"`
 	Depth          int      `json:"depth"`
 }
 
@@ -62,6 +62,12 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
 	if s.FetchFunc == nil {
 		s.FetchFunc = Fetch()
 	}
+	if len(s.ScrapeOptions.AllowedDomains) == 0 {
+		u, err := url.Parse(s.ScrapeOptions.URL)
+		if err == nil {
+			s.ScrapeOptions.AllowedDomains = []string{u.Host}
+		}
+	}
 
 	jobs := make(chan target, 1024)
 	results := make(chan result)
@@ -107,6 +113,10 @@ func (s *Scraper) worker(id int, jobs chan target, results chan<- result) {
 				continue
 			}
 
+			if !s.isURLAllowed(l) {
+				continue
+			}
+
 			s.wg.Add(1)
 			select {
 			case jobs <- target{url: l, depth: j.depth - 1}:
@@ -138,6 +148,26 @@ func (s *Scraper) process(job target) result {
 	return result{url: job.url, data: data, links: links}
 }
 
+func (s *Scraper) isURLAllowed(rawurl string) bool {
+	u, err := url.Parse(rawurl)
+	if err != nil {
+		return false
+	}
+
+	host := u.Host
+
+	for _, domain := range s.ScrapeOptions.AllowedDomains {
+		if domain == "*" {
+			return true
+		}
+		if host == domain {
+			return true
+		}
+	}
+
+	return false
+}
+
 func Links(html string, origin string) []string {
 	var links []string
 	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
diff --git a/scrape_test.go b/scrape_test.go
index 5d6e578..643b10d 100644
--- a/scrape_test.go
+++ b/scrape_test.go
@@ -13,8 +13,9 @@ func TestScrape(t *testing.T) {
 	svc := flyscrape.Scraper{
 		Concurrency: 10,
 		ScrapeOptions: flyscrape.ScrapeOptions{
-			URL:   "http://example.com/foo/bar",
-			Depth: 1,
+			URL:            "http://example.com/foo/bar",
+			Depth:          1,
+			AllowedDomains: []string{"example.com", "www.google.com"},
 		},
 		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
 			return map[string]any{
diff --git a/utils.go b/utils.go
--- a/utils.go
+++ b/utils.go
@@ -11,3 +11,9 @@ func PrettyPrint(v any) {
 	enc.SetIndent("", "  ")
 	enc.Encode(v)
 }
+
+func Print(v any) {
+	enc := json.NewEncoder(os.Stdout)
+	enc.SetEscapeHTML(false)
+	enc.Encode(v)
+}
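
A note on the js.go change: instead of pulling options.url and options.depth out of V8 one property at a time, the script's options object is now serialized with JSON.stringify inside the VM and decoded into ScrapeOptions with a single json.Unmarshal, so a new field like allowedDomains only needs a matching struct tag. A minimal sketch of the decoding step, with the JSON literal standing in for the string that ctx.RunScript returns:

package main

import (
	"encoding/json"
	"fmt"
)

// Mirrors ScrapeOptions from scrape.go. The allowedDomains tag must match
// the camelCase key that JSON.stringify(options) produces.
type ScrapeOptions struct {
	URL            string   `json:"url"`
	AllowedDomains []string `json:"allowedDomains"`
	Depth          int      `json:"depth"`
}

func main() {
	// Stands in for optsJSON.String(), the value returned by
	// ctx.RunScript("JSON.stringify(options)", "main.js").
	optsJSON := `{"url":"https://esbuild.github.io/plugins/","depth":1,"allowedDomains":["esbuild.github.io","nodejs.org"]}`

	var opts ScrapeOptions
	if err := json.Unmarshal([]byte(optsJSON), &opts); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", opts)
	// {URL:https://esbuild.github.io/plugins/ AllowedDomains:[esbuild.github.io nodejs.org] Depth:1}
}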
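
The new isURLAllowed check in scrape.go compares each discovered link's host against AllowedDomains, with "*" matching any host; and since Scrape() seeds an empty list with the host of the start URL, crawling stays on the starting domain by default. A standalone sketch of the same matching rules, written as a free function rather than a Scraper method:

package main

import (
	"fmt"
	"net/url"
)

// Same rules as Scraper.isURLAllowed above: unparseable URLs are rejected,
// "*" allows any host, otherwise the link's host must match exactly.
func isURLAllowed(rawurl string, allowedDomains []string) bool {
	u, err := url.Parse(rawurl)
	if err != nil {
		return false
	}
	for _, domain := range allowedDomains {
		if domain == "*" || u.Host == domain {
			return true
		}
	}
	return false
}

func main() {
	domains := []string{"esbuild.github.io", "nodejs.org"}
	fmt.Println(isURLAllowed("https://esbuild.github.io/api/", domains)) // true
	fmt.Println(isURLAllowed("https://example.com/", domains))           // false
	fmt.Println(isURLAllowed("https://example.com/", []string{"*"}))     // true
}

Note that the comparison is exact: a subdomain such as docs.nodejs.org, or a host carrying an explicit port, would not match nodejs.org.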
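
Lastly, utils.go: the new Print emits compact single-line JSON with HTML escaping disabled, which is what run -no-pretty-print now selects. PrettyPrint's full body is not visible in this diff, so the sketch below assumes it also writes to os.Stdout and differs only by the two-space indent:

package main

import (
	"encoding/json"
	"os"
)

func main() {
	v := map[string]any{"url": "https://example.com?a=1&b=2", "title": "Example"}

	// Compact output, as flyscrape.Print produces it:
	// {"title":"Example","url":"https://example.com?a=1&b=2"}
	enc := json.NewEncoder(os.Stdout)
	enc.SetEscapeHTML(false) // keeps & and similar characters unescaped
	enc.Encode(v)

	// Indented output, analogous to flyscrape.PrettyPrint:
	pretty := json.NewEncoder(os.Stdout)
	pretty.SetIndent("", "  ")
	pretty.Encode(v)
}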