summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--cmd/flyscrape/run.go13
-rw-r--r--examples/esbuild.github.io.js4
-rw-r--r--js.go13
-rw-r--r--scrape.go32
-rw-r--r--scrape_test.go5
-rw-r--r--utils.go6
6 files changed, 60 insertions, 13 deletions
diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go
index 8ec9390..2577c25 100644
--- a/cmd/flyscrape/run.go
+++ b/cmd/flyscrape/run.go
@@ -15,6 +15,7 @@ type RunCommand struct{}
func (c *RunCommand) Run(args []string) error {
fs := flag.NewFlagSet("flyscrape-run", flag.ContinueOnError)
concurrent := fs.Int("concurrent", 0, "concurrency")
+ noPrettyPrint := fs.Bool("no-pretty-print", false, "no-pretty-print")
fs.Usage = c.Usage
if err := fs.Parse(args); err != nil {
@@ -45,7 +46,11 @@ func (c *RunCommand) Run(args []string) error {
count := 0
start := time.Now()
for result := range svc.Scrape() {
- flyscrape.PrettyPrint(result)
+ if *noPrettyPrint {
+ flyscrape.Print(result)
+ } else {
+ flyscrape.PrettyPrint(result)
+ }
count++
}
log.Printf("Scraped %d websites in %v\n", count, time.Since(start))
@@ -66,6 +71,9 @@ Arguments:
-concurrent NUM
Determines the number of concurrent requests.
+ -no-pretty-print
+ Disables pretty printing of scrape results.
+
Examples:
@@ -74,5 +82,8 @@ Examples:
# Run the script with 10 concurrent requests.
$ flyscrape run -concurrent 10 example.js
+
+ # Run the script with pretty printing disabled.
+ $ flyscrape run -no-pretty-print example.js
`[1:])
}
diff --git a/examples/esbuild.github.io.js b/examples/esbuild.github.io.js
index 7a00478..735a61e 100644
--- a/examples/esbuild.github.io.js
+++ b/examples/esbuild.github.io.js
@@ -3,6 +3,10 @@ import { parse } from "flyscrape";
export const options = {
url: "https://esbuild.github.io/plugins/",
depth: 1,
+ allowedDomains: [
+ "esbuild.github.io",
+ "nodejs.org",
+ ],
}
export default function({ html }) {
diff --git a/js.go b/js.go
index 526f27b..242f15f 100644
--- a/js.go
+++ b/js.go
@@ -58,18 +58,13 @@ func vm(src string) (ScrapeOptions, ScrapeFunc, error) {
}
var opts ScrapeOptions
-
- url, err := ctx.RunScript("options.url", "main.js")
+ optsJSON, err := ctx.RunScript("JSON.stringify(options)", "main.js")
if err != nil {
- return ScrapeOptions{}, nil, fmt.Errorf("reading options.url: %w", err)
+ return ScrapeOptions{}, nil, fmt.Errorf("reading options: %w", err)
}
- opts.URL = url.String()
-
- depth, err := ctx.RunScript("options.depth", "main.js")
- if err != nil {
- return ScrapeOptions{}, nil, fmt.Errorf("reading options.depth: %w", err)
+ if err := json.Unmarshal([]byte(optsJSON.String()), &opts); err != nil {
+ return ScrapeOptions{}, nil, fmt.Errorf("decoding options json: %w", err)
}
- opts.Depth = int(depth.Integer())
scrape := func(params ScrapeParams) (any, error) {
suffix := randSeq(10)
diff --git a/scrape.go b/scrape.go
index 0c31518..3ecfe7b 100644
--- a/scrape.go
+++ b/scrape.go
@@ -17,7 +17,7 @@ type ScrapeParams struct {
type ScrapeOptions struct {
URL string `json:"url"`
- AllowedDomains []string `json:"allowed_domains"`
+ AllowedDomains []string `json:"allowedDomains"`
Depth int `json:"depth"`
}
@@ -62,6 +62,12 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
if s.FetchFunc == nil {
s.FetchFunc = Fetch()
}
+ if len(s.ScrapeOptions.AllowedDomains) == 0 {
+ u, err := url.Parse(s.ScrapeOptions.URL)
+ if err == nil {
+ s.ScrapeOptions.AllowedDomains = []string{u.Hostname()}
+ }
+ }
jobs := make(chan target, 1024)
results := make(chan result)
@@ -107,6 +113,10 @@ func (s *Scraper) worker(id int, jobs chan target, results chan<- result) {
continue
}
+ if !s.isURLAllowed(l) {
+ continue
+ }
+
s.wg.Add(1)
select {
case jobs <- target{url: l, depth: j.depth - 1}:
@@ -138,6 +148,26 @@ func (s *Scraper) process(job target) result {
return result{url: job.url, data: data, links: links}
}
+func (s *Scraper) isURLAllowed(rawurl string) bool {
+ u, err := url.Parse(rawurl)
+ if err != nil {
+ return false
+ }
+
+ host := u.Hostname()
+
+ for _, domain := range s.ScrapeOptions.AllowedDomains {
+ if domain == "*" {
+ return true
+ }
+ if host == domain {
+ return true
+ }
+ }
+
+ return false
+}
+
func Links(html string, origin string) []string {
var links []string
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
diff --git a/scrape_test.go b/scrape_test.go
index 5d6e578..643b10d 100644
--- a/scrape_test.go
+++ b/scrape_test.go
@@ -13,8 +13,9 @@ func TestScrape(t *testing.T) {
svc := flyscrape.Scraper{
Concurrency: 10,
ScrapeOptions: flyscrape.ScrapeOptions{
- URL: "http://example.com/foo/bar",
- Depth: 1,
+ URL: "http://example.com/foo/bar",
+ Depth: 1,
+ AllowedDomains: []string{"example.com", "www.google.com"},
},
ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
return map[string]any{
diff --git a/utils.go b/utils.go
index b3aca0b..7e59540 100644
--- a/utils.go
+++ b/utils.go
@@ -11,3 +11,9 @@ func PrettyPrint(v any) {
enc.SetIndent("", " ")
enc.Encode(v)
}
+
+func Print(v any) {
+ enc := json.NewEncoder(os.Stdout)
+ enc.SetEscapeHTML(false)
+ enc.Encode(v)
+}