-rw-r--r--  cmd/flyscrape/run.go  |   5
-rw-r--r--  js/template.js        |   8
-rw-r--r--  js_test.go            |  23
-rw-r--r--  scrape.go             | 168
-rw-r--r--  scrape_test.go        |  22
5 files changed, 105 insertions(+), 121 deletions(-)
diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go
index 9a2a7bb..2d76a35 100644
--- a/cmd/flyscrape/run.go
+++ b/cmd/flyscrape/run.go
@@ -14,7 +14,6 @@ type RunCommand struct{}
func (c *RunCommand) Run(args []string) error {
fs := flag.NewFlagSet("flyscrape-run", flag.ContinueOnError)
- concurrent := fs.Int("concurrent", 0, "concurrency")
noPrettyPrint := fs.Bool("no-pretty-print", false, "no-pretty-print")
fs.Usage = c.Usage
@@ -40,7 +39,6 @@ func (c *RunCommand) Run(args []string) error {
svc := flyscrape.Scraper{
ScrapeOptions: opts,
ScrapeFunc: scrape,
- Concurrency: *concurrent,
}
count := 0
@@ -76,9 +74,6 @@ Usage:
Arguments:
- -concurrent NUM
- Determines the number of concurrent requests.
-
-no-pretty-print
Disables pretty printing of scrape results.
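Note: with the -concurrent flag gone, throughput is governed solely by the rate option, and the Scraper is built without a Concurrency field. A minimal sketch of the resulting construction in run.go, assuming opts and scrape still come from compiling the user script as before:

    svc := flyscrape.Scraper{
        ScrapeOptions: opts,   // options parsed from the user script
        ScrapeFunc:    scrape, // scrape function compiled from the user script
    }

    for res := range svc.Scrape() {
        // print or persist each ScrapeResult as it arrives
    }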
diff --git a/js/template.js b/js/template.js
index f75df28..56fffa0 100644
--- a/js/template.js
+++ b/js/template.js
@@ -1,9 +1,9 @@
-import { parse } from "flyscrape";
+import { parse } from 'flyscrape';
export const options = {
- url: "https://news.ycombinator.com/", // Specify the URL to start scraping from.
+ url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from.
depth: 1, // Specify how deep links should be followed. (default = 0, no follow)
- allowedDomains: ["news.ycombinator.com"], // Specify the allowed domains. (default = domain from url)
+ allowedDomains: ['news.ycombinator.com'], // Specify the allowed domains. (default = domain from url)
blockedDomains: [], // Specify the blocked domains. (default = none)
rate: 100, // Specify the rate in requests per second. (default = 100)
}
@@ -13,7 +13,7 @@ export default function({ html, url }) {
const title = $('title');
const entries = $('.athing').toArray();
- if (entries.length == 0) {
+ if (!entries.length) {
return null; // Omits scraped pages without entries.
}
diff --git a/js_test.go b/js_test.go
index 34c4183..bf7bc46 100644
--- a/js_test.go
+++ b/js_test.go
@@ -1,7 +1,6 @@
package flyscrape_test
import (
- "os"
"testing"
"flyscrape"
@@ -19,11 +18,25 @@ var html = `
</body>
</html>`
-func TestV8(t *testing.T) {
- data, err := os.ReadFile("examples/esbuild.github.io.js")
- require.NoError(t, err)
+var script = `
+import { parse } from "flyscrape";
+
+export const options = {
+ url: "https://localhost/",
+}
- opts, run, err := flyscrape.Compile(string(data))
+export default function({ html, url }) {
+ const $ = parse(html);
+
+ return {
+ headline: $("h1").text(),
+ body: $("p").text()
+ }
+}
+`
+
+func TestV8(t *testing.T) {
+ opts, run, err := flyscrape.Compile(script)
require.NoError(t, err)
require.NotNil(t, opts)
require.NotNil(t, run)
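Note: inlining the script keeps the test self-contained instead of depending on examples/esbuild.github.io.js. A possible follow-up assertion, assuming the opts returned by Compile are the ScrapeOptions parsed from the script's exported options:

    require.Equal(t, "https://localhost/", opts.URL)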
diff --git a/scrape.go b/scrape.go
index ac75c73..be26e3c 100644
--- a/scrape.go
+++ b/scrape.go
@@ -32,133 +32,125 @@ type ScrapeResult struct {
Timestamp time.Time `json:"timestamp"`
}
-type (
- ScrapeFunc func(ScrapeParams) (any, error)
- FetchFunc func(url string) (string, error)
-)
+func (s *ScrapeResult) omit() bool {
+ return s.Error == nil && s.Data == nil
+}
+
+type ScrapeFunc func(ScrapeParams) (any, error)
+
+type FetchFunc func(url string) (string, error)
+
+type target struct {
+ url string
+ depth int
+}
type Scraper struct {
ScrapeOptions ScrapeOptions
ScrapeFunc ScrapeFunc
FetchFunc FetchFunc
- Concurrency int
visited *hashmap.Map[string, struct{}]
wg *sync.WaitGroup
+ jobs chan target
+ results chan ScrapeResult
}
-type target struct {
- url string
- depth int
-}
-
-type result struct {
- url string
- data any
- links []string
- err error
-}
+func (s *Scraper) init() {
+ s.visited = hashmap.New[string, struct{}]()
+ s.wg = &sync.WaitGroup{}
+ s.jobs = make(chan target, 1024)
+ s.results = make(chan ScrapeResult)
-func (s *Scraper) Scrape() <-chan ScrapeResult {
- if s.Concurrency == 0 {
- s.Concurrency = 1
- }
if s.FetchFunc == nil {
s.FetchFunc = Fetch()
}
+
if s.ScrapeOptions.Rate == 0 {
s.ScrapeOptions.Rate = 100
}
+
if len(s.ScrapeOptions.AllowedDomains) == 0 {
u, err := url.Parse(s.ScrapeOptions.URL)
if err == nil {
s.ScrapeOptions.AllowedDomains = []string{u.Host}
}
}
+}
- jobs := make(chan target, 1024)
- results := make(chan result)
- scraperesults := make(chan ScrapeResult)
- s.visited = hashmap.New[string, struct{}]()
- s.wg = &sync.WaitGroup{}
+func (s *Scraper) Scrape() <-chan ScrapeResult {
+ s.init()
+ s.enqueueJob(s.ScrapeOptions.URL, s.ScrapeOptions.Depth)
- go s.worker(jobs, results)
+ go s.worker()
+ go s.waitClose()
- s.wg.Add(1)
- s.visited.Set(s.ScrapeOptions.URL, struct{}{})
- jobs <- target{url: s.ScrapeOptions.URL, depth: s.ScrapeOptions.Depth}
+ return s.results
+}
- go func() {
- s.wg.Wait()
- close(jobs)
- close(results)
- }()
+func (s *Scraper) worker() {
+ var (
+ rate = time.Duration(float64(time.Second) / s.ScrapeOptions.Rate)
+ leakyjobs = leakychan(s.jobs, rate)
+ )
- go func() {
- for res := range results {
- scraperesults <- ScrapeResult{
- URL: res.url,
- Data: res.data,
- Links: res.links,
- Error: res.err,
- Timestamp: time.Now().UTC(),
+ for job := range leakyjobs {
+ go func(job target) {
+ defer s.wg.Done()
+
+ res := s.process(job)
+ if !res.omit() {
+ s.results <- res
}
- }
- close(scraperesults)
- }()
- return scraperesults
-}
-
-func (s *Scraper) worker(jobs chan target, results chan<- result) {
- rate := time.Duration(float64(time.Second) / s.ScrapeOptions.Rate)
- for j := range leakychan(jobs, rate) {
- j := j
- go func() {
- res := s.process(j)
-
- if j.depth > 0 {
- for _, l := range res.links {
- if _, ok := s.visited.Get(l); ok {
- continue
- }
-
- if !s.isURLAllowed(l) {
- continue
- }
-
- s.wg.Add(1)
- select {
- case jobs <- target{url: l, depth: j.depth - 1}:
- s.visited.Set(l, struct{}{})
- default:
- log.Println("queue is full, can't add url:", l)
- s.wg.Done()
- }
- }
+ if job.depth <= 0 {
+ return
}
- if res.err != nil || res.data != nil {
- results <- res
+ for _, l := range res.Links {
+ if _, ok := s.visited.Get(l); ok {
+ continue
+ }
+
+ if !s.isURLAllowed(l) {
+ continue
+ }
+
+ s.enqueueJob(l, job.depth-1)
}
- s.wg.Done()
- }()
+ }(job)
}
}
-func (s *Scraper) process(job target) result {
+func (s *Scraper) process(job target) (res ScrapeResult) {
+ res.URL = job.url
+ res.Timestamp = time.Now()
+
html, err := s.FetchFunc(job.url)
if err != nil {
- return result{url: job.url, err: err}
+ res.Error = err
+ return
}
- links := Links(html, job.url)
- data, err := s.ScrapeFunc(ScrapeParams{HTML: html, URL: job.url})
+ res.Links = links(html, job.url)
+ res.Data, err = s.ScrapeFunc(ScrapeParams{HTML: html, URL: job.url})
if err != nil {
- return result{url: job.url, links: links, err: err}
+ res.Error = err
+ return
}
- return result{url: job.url, data: data, links: links}
+ return
+}
+
+func (s *Scraper) enqueueJob(url string, depth int) {
+ s.wg.Add(1)
+ select {
+ case s.jobs <- target{url: url, depth: depth}:
+ s.visited.Set(url, struct{}{})
+ default:
+ log.Println("queue is full, can't add url:", url)
+ s.wg.Done()
+ }
}
func (s *Scraper) isURLAllowed(rawurl string) bool {
@@ -187,7 +179,13 @@ func (s *Scraper) isURLAllowed(rawurl string) bool {
return ok
}
-func Links(html string, origin string) []string {
+func (s *Scraper) waitClose() {
+ s.wg.Wait()
+ close(s.jobs)
+ close(s.results)
+}
+
+func links(html string, origin string) []string {
var links []string
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
if err != nil {
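Note: the refactor moves the jobs and results channels into the Scraper itself, replaces the internal result struct with ScrapeResult, and throttles the worker with a leaky channel derived from the rate option instead of a fixed Concurrency. leakychan itself is not part of this diff; a minimal sketch of what such a rate-limiting forwarder could look like (name and signature here are hypothetical, and "time" is assumed to be imported):

    // leakychanSketch releases at most one queued job per interval.
    // Hypothetical stand-in for the leakychan used by worker().
    func leakychanSketch[T any](in <-chan T, interval time.Duration) <-chan T {
        out := make(chan T)
        ticker := time.NewTicker(interval)
        go func() {
            defer ticker.Stop()
            defer close(out)
            for v := range in {
                <-ticker.C // wait for the next tick before forwarding
                out <- v
            }
        }()
        return out
    }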
diff --git a/scrape_test.go b/scrape_test.go
index 643b10d..ffd8b70 100644
--- a/scrape_test.go
+++ b/scrape_test.go
@@ -11,7 +11,6 @@ import (
func TestScrape(t *testing.T) {
svc := flyscrape.Scraper{
- Concurrency: 10,
ScrapeOptions: flyscrape.ScrapeOptions{
URL: "http://example.com/foo/bar",
Depth: 1,
@@ -45,24 +44,3 @@ func TestScrape(t *testing.T) {
require.Equal(t, "http://example.com/foo/baz", urls[2])
require.Equal(t, "http://www.google.com/", urls[3])
}
-
-func TestFindLinks(t *testing.T) {
- origin := "http://example.com/foo/bar"
- html := `
- <html>
- <body>
- <a href="/baz">Baz</a>
- <a href="baz">Baz</a>
- <a href="http://www.google.com">Google</a>
- <a href="javascript:void(0)">Google</a>
- <a href="/foo#hello">Anchor</a>
- </body>
- </html>`
-
- links := flyscrape.Links(html, origin)
- require.Len(t, links, 4)
- require.Equal(t, "http://example.com/baz", links[0])
- require.Equal(t, "http://example.com/foo/baz", links[1])
- require.Equal(t, "http://www.google.com/", links[2])
- require.Equal(t, "http://example.com/foo", links[3])
-}
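Note: with Links unexported, link extraction is now covered through TestScrape. A rough sketch of how such a test can drive the scraper end to end with a stubbed fetcher, assuming only the fields shown in scrape.go above (the HTML fixture is illustrative):

    svc := flyscrape.Scraper{
        ScrapeOptions: flyscrape.ScrapeOptions{
            URL:   "http://example.com/foo/bar",
            Depth: 1,
        },
        ScrapeFunc: func(p flyscrape.ScrapeParams) (any, error) {
            return map[string]any{"url": p.URL}, nil
        },
        FetchFunc: func(url string) (string, error) {
            return `<html><body><a href="/baz">Baz</a></body></html>`, nil
        },
    }

    var urls []string
    for res := range svc.Scrape() {
        urls = append(urls, res.URL)
    }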