From 1a9af21755a78bb8689bd1f3830239f81dadc324 Mon Sep 17 00:00:00 2001
From: Philipp Tanlak
Date: Thu, 17 Aug 2023 20:31:44 +0200
Subject: refactor

---
 cmd/flyscrape/run.go |   5 --
 js/template.js       |   8 +--
 js_test.go           |  23 +++++--
 scrape.go            | 168 +++++++++++++++++++++++++--------------------------
 scrape_test.go       |  22 -------
 5 files changed, 105 insertions(+), 121 deletions(-)

diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go
index 9a2a7bb..2d76a35 100644
--- a/cmd/flyscrape/run.go
+++ b/cmd/flyscrape/run.go
@@ -14,7 +14,6 @@ type RunCommand struct{}
 
 func (c *RunCommand) Run(args []string) error {
     fs := flag.NewFlagSet("flyscrape-run", flag.ContinueOnError)
-    concurrent := fs.Int("concurrent", 0, "concurrency")
     noPrettyPrint := fs.Bool("no-pretty-print", false, "no-pretty-print")
     fs.Usage = c.Usage
 
@@ -40,7 +39,6 @@ func (c *RunCommand) Run(args []string) error {
     svc := flyscrape.Scraper{
         ScrapeOptions: opts,
         ScrapeFunc:    scrape,
-        Concurrency:   *concurrent,
     }
 
     count := 0
@@ -76,9 +74,6 @@ Usage:
 
 Arguments:
 
-    -concurrent NUM
-        Determines the number of concurrent requests.
-
     -no-pretty-print
         Disables pretty printing of scrape results.
 
diff --git a/js/template.js b/js/template.js
index f75df28..56fffa0 100644
--- a/js/template.js
+++ b/js/template.js
@@ -1,9 +1,9 @@
-import { parse } from "flyscrape";
+import { parse } from 'flyscrape';
 
 export const options = {
-  url: "https://news.ycombinator.com/",     // Specify the URL to start scraping from.
+  url: 'https://news.ycombinator.com/',     // Specify the URL to start scraping from.
   depth: 1,                                 // Specify how deep links should be followed. (default = 0, no follow)
-  allowedDomains: ["news.ycombinator.com"], // Specify the allowed domains. (default = domain from url)
+  allowedDomains: ['news.ycombinator.com'], // Specify the allowed domains. (default = domain from url)
   blockedDomains: [],                       // Specify the blocked domains. (default = none)
   rate: 100,                                // Specify the rate in requests per second. (default = 100)
 }
@@ -13,7 +13,7 @@ export default function({ html, url }) {
   const title = $('title');
   const entries = $('.athing').toArray();
 
-  if (entries.length == 0) {
+  if (!entries.length) {
     return null; // Omits scraped pages without entries.
   }
 
diff --git a/js_test.go b/js_test.go
index 34c4183..bf7bc46 100644
--- a/js_test.go
+++ b/js_test.go
@@ -1,7 +1,6 @@
 package flyscrape_test
 
 import (
-    "os"
     "testing"
 
     "flyscrape"
@@ -19,11 +18,25 @@ var html = `
 `
 
-func TestV8(t *testing.T) {
-    data, err := os.ReadFile("examples/esbuild.github.io.js")
-    require.NoError(t, err)
+var script = `
+import { parse } from "flyscrape";
+
+export const options = {
+  url: "https://localhost/",
+}
 
-    opts, run, err := flyscrape.Compile(string(data))
+export default function({ html, url }) {
+  const $ = parse(html);
+
+  return {
+    headline: $("h1").text(),
+    body: $("p").text()
+  }
+}
+`
+
+func TestV8(t *testing.T) {
+    opts, run, err := flyscrape.Compile(script)
     require.NoError(t, err)
     require.NotNil(t, opts)
     require.NotNil(t, run)
diff --git a/scrape.go b/scrape.go
index ac75c73..be26e3c 100644
--- a/scrape.go
+++ b/scrape.go
@@ -32,133 +32,125 @@ type ScrapeResult struct {
     Timestamp time.Time `json:"timestamp"`
 }
 
-type (
-    ScrapeFunc func(ScrapeParams) (any, error)
-    FetchFunc  func(url string) (string, error)
-)
+func (s *ScrapeResult) omit() bool {
+    return s.Error == nil && s.Data == nil
+}
+
+type ScrapeFunc func(ScrapeParams) (any, error)
+
+type FetchFunc func(url string) (string, error)
+
+type target struct {
+    url   string
+    depth int
+}
 
 type Scraper struct {
     ScrapeOptions ScrapeOptions
     ScrapeFunc    ScrapeFunc
     FetchFunc     FetchFunc
-    Concurrency   int
 
     visited *hashmap.Map[string, struct{}]
     wg      *sync.WaitGroup
+    jobs    chan target
+    results chan ScrapeResult
 }
 
-type target struct {
-    url   string
-    depth int
-}
-
-type result struct {
-    url   string
-    data  any
-    links []string
-    err   error
-}
+func (s *Scraper) init() {
+    s.visited = hashmap.New[string, struct{}]()
+    s.wg = &sync.WaitGroup{}
+    s.jobs = make(chan target, 1024)
+    s.results = make(chan ScrapeResult)
 
-func (s *Scraper) Scrape() <-chan ScrapeResult {
-    if s.Concurrency == 0 {
-        s.Concurrency = 1
-    }
     if s.FetchFunc == nil {
         s.FetchFunc = Fetch()
     }
+
     if s.ScrapeOptions.Rate == 0 {
         s.ScrapeOptions.Rate = 100
     }
+
     if len(s.ScrapeOptions.AllowedDomains) == 0 {
         u, err := url.Parse(s.ScrapeOptions.URL)
         if err == nil {
             s.ScrapeOptions.AllowedDomains = []string{u.Host}
         }
     }
+}
 
-    jobs := make(chan target, 1024)
-    results := make(chan result)
-    scraperesults := make(chan ScrapeResult)
-    s.visited = hashmap.New[string, struct{}]()
-    s.wg = &sync.WaitGroup{}
+func (s *Scraper) Scrape() <-chan ScrapeResult {
+    s.init()
+    s.enqueueJob(s.ScrapeOptions.URL, s.ScrapeOptions.Depth)
 
-    go s.worker(jobs, results)
+    go s.worker()
+    go s.waitClose()
 
-    s.wg.Add(1)
-    s.visited.Set(s.ScrapeOptions.URL, struct{}{})
-    jobs <- target{url: s.ScrapeOptions.URL, depth: s.ScrapeOptions.Depth}
+    return s.results
+}
 
-    go func() {
-        s.wg.Wait()
-        close(jobs)
-        close(results)
-    }()
+func (s *Scraper) worker() {
+    var (
+        rate      = time.Duration(float64(time.Second) / s.ScrapeOptions.Rate)
+        leakyjobs = leakychan(s.jobs, rate)
+    )
 
-    go func() {
-        for res := range results {
-            scraperesults <- ScrapeResult{
-                URL:       res.url,
-                Data:      res.data,
-                Links:     res.links,
-                Error:     res.err,
-                Timestamp: time.Now().UTC(),
+    for job := range leakyjobs {
+        go func(job target) {
+            defer s.wg.Done()
+
+            res := s.process(job)
+            if !res.omit() {
+                s.results <- res
             }
-        }
-        close(scraperesults)
-    }()
 
-    return scraperesults
-}
-
-func (s *Scraper) worker(jobs chan target, results chan<- result) {
-    rate := time.Duration(float64(time.Second) / s.ScrapeOptions.Rate)
-    for j := range leakychan(jobs, rate) {
-        j := j
-        go func() {
-            res := s.process(j)
-
-            if j.depth > 0 {
-                for _, l := range res.links {
-                    if _, ok := s.visited.Get(l); ok {
-                        continue
-                    }
-
-                    if !s.isURLAllowed(l) {
-                        continue
-                    }
-
-                    s.wg.Add(1)
-                    select {
-                    case jobs <- target{url: l, depth: j.depth - 1}:
-                        s.visited.Set(l, struct{}{})
-                    default:
-                        log.Println("queue is full, can't add url:", l)
-                        s.wg.Done()
-                    }
-                }
+            if job.depth <= 0 {
+                return
             }
 
-            if res.err != nil || res.data != nil {
-                results <- res
+            for _, l := range res.Links {
+                if _, ok := s.visited.Get(l); ok {
+                    continue
+                }
+
+                if !s.isURLAllowed(l) {
+                    continue
+                }
+
+                s.enqueueJob(l, job.depth-1)
             }
-            s.wg.Done()
-        }()
+        }(job)
     }
 }
 
-func (s *Scraper) process(job target) result {
+func (s *Scraper) process(job target) (res ScrapeResult) {
+    res.URL = job.url
+    res.Timestamp = time.Now()
+
     html, err := s.FetchFunc(job.url)
     if err != nil {
-        return result{url: job.url, err: err}
+        res.Error = err
+        return
     }
 
-    links := Links(html, job.url)
-    data, err := s.ScrapeFunc(ScrapeParams{HTML: html, URL: job.url})
+    res.Links = links(html, job.url)
+    res.Data, err = s.ScrapeFunc(ScrapeParams{HTML: html, URL: job.url})
     if err != nil {
-        return result{url: job.url, links: links, err: err}
+        res.Error = err
+        return
     }
 
-    return result{url: job.url, data: data, links: links}
+    return
+}
+
+func (s *Scraper) enqueueJob(url string, depth int) {
+    s.wg.Add(1)
+    select {
+    case s.jobs <- target{url: url, depth: depth}:
+        s.visited.Set(url, struct{}{})
+    default:
+        log.Println("queue is full, can't add url:", url)
+        s.wg.Done()
+    }
 }
 
 func (s *Scraper) isURLAllowed(rawurl string) bool {
@@ -187,7 +179,13 @@ func (s *Scraper) isURLAllowed(rawurl string) bool {
     return ok
 }
 
-func Links(html string, origin string) []string {
+func (s *Scraper) waitClose() {
+    s.wg.Wait()
+    close(s.jobs)
+    close(s.results)
+}
+
+func links(html string, origin string) []string {
     var links []string
     doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
     if err != nil {
diff --git a/scrape_test.go b/scrape_test.go
index 643b10d..ffd8b70 100644
--- a/scrape_test.go
+++ b/scrape_test.go
@@ -11,7 +11,6 @@ import (
 
 func TestScrape(t *testing.T) {
     svc := flyscrape.Scraper{
-        Concurrency: 10,
         ScrapeOptions: flyscrape.ScrapeOptions{
             URL:   "http://example.com/foo/bar",
             Depth: 1,
@@ -45,24 +44,3 @@ func TestScrape(t *testing.T) {
     require.Equal(t, "http://example.com/foo/baz", urls[2])
     require.Equal(t, "http://www.google.com/", urls[3])
 }
-
-func TestFindLinks(t *testing.T) {
-    origin := "http://example.com/foo/bar"
-    html := `
-
-
-        Baz
-        Baz
-        Google
-        Google
-        Anchor
-
-    `
-
-    links := flyscrape.Links(html, origin)
-    require.Len(t, links, 4)
-    require.Equal(t, "http://example.com/baz", links[0])
-    require.Equal(t, "http://example.com/foo/baz", links[1])
-    require.Equal(t, "http://www.google.com/", links[2])
-    require.Equal(t, "http://example.com/foo", links[3])
-}
-- 
cgit v1.2.3
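
Editor's note, not part of the patch: a minimal sketch of how a caller drives the refactored Scraper after this commit, based only on what the diff shows (the ScrapeOptions and ScrapeFunc fields, the Scrape() result channel, and the default FetchFunc installed by init() when none is set). The import path "flyscrape" follows the tests; the option values and the inline scrape function are illustrative assumptions, not code from the repository.

    package main

    import (
        "encoding/json"
        "fmt"

        "flyscrape"
    )

    func main() {
        svc := flyscrape.Scraper{
            ScrapeOptions: flyscrape.ScrapeOptions{
                URL:   "https://news.ycombinator.com/", // illustrative start URL
                Depth: 1,                               // follow links one level deep
            },
            // Stand-in scrape function; the CLI normally compiles one from a JS file.
            ScrapeFunc: func(p flyscrape.ScrapeParams) (any, error) {
                return map[string]string{"url": p.URL}, nil
            },
            // FetchFunc left nil: init() falls back to the package's Fetch().
        }

        // Scrape() starts the worker goroutines and closes the result channel
        // via waitClose() once the job queue drains.
        for res := range svc.Scrape() {
            b, _ := json.Marshal(res)
            fmt.Println(string(b))
        }
    }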