| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-17 20:12:42 +0200 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-17 20:12:42 +0200 |
| commit | 8812c84f32c74ac2f44af1abdb7e4e6f7cbf10b0 | |
| tree | 966d5fefdcf24e1331c4d6d562e79d099482382b | |
| parent | c36bb2ca2a82338a822c6962f3373809b4bed814 | |
change rate to requests per second
| -rw-r--r-- | examples/esbuild.github.io.js | 1 |
| -rw-r--r-- | js/template.js | 1 |
| -rw-r--r-- | scrape.go | 85 |
3 files changed, 60 insertions, 27 deletions
```diff
diff --git a/examples/esbuild.github.io.js b/examples/esbuild.github.io.js
index 735a61e..7839791 100644
--- a/examples/esbuild.github.io.js
+++ b/examples/esbuild.github.io.js
@@ -3,6 +3,7 @@ import { parse } from "flyscrape";
 export const options = {
     url: "https://esbuild.github.io/plugins/",
     depth: 1,
+    rate: 100,
     allowedDomains: [
         "esbuild.github.io",
         "nodejs.org",
diff --git a/js/template.js b/js/template.js
index d33adc5..a2b4518 100644
--- a/js/template.js
+++ b/js/template.js
@@ -4,6 +4,7 @@ export const options = {
     url: "https://news.ycombinator.com/",     // Specify the URL to start scraping from.
     depth: 1,                                 // Specify how deep links should be followed (0 = no follow).
     allowedDomains: ["news.ycombinator.com"], // Specify the allowed domains to follow.
+    rate: 100,                                // Specify the request rate in requests per second.
 }
 
 export default function({ html, url }) {
diff --git a/scrape.go b/scrape.go
--- a/scrape.go
+++ b/scrape.go
@@ -4,6 +4,7 @@ import (
 	"log"
 	"strings"
 	"sync"
+	"time"
 
 	"github.com/PuerkitoBio/goquery"
 	"github.com/cornelk/hashmap"
@@ -19,6 +20,7 @@ type ScrapeOptions struct {
 	URL            string   `json:"url"`
 	AllowedDomains []string `json:"allowedDomains"`
 	Depth          int      `json:"depth"`
+	Rate           float64  `json:"rate"`
 }
 
 type ScrapeResult struct {
@@ -62,6 +64,9 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
 	if s.FetchFunc == nil {
 		s.FetchFunc = Fetch()
 	}
+	if s.ScrapeOptions.Rate == 0 {
+		s.ScrapeOptions.Rate = 100
+	}
 	if len(s.ScrapeOptions.AllowedDomains) == 0 {
 		u, err := url.Parse(s.ScrapeOptions.URL)
 		if err == nil {
@@ -75,11 +80,10 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
 	s.visited = hashmap.New[string, struct{}]()
 	s.wg = &sync.WaitGroup{}
 
-	for i := 0; i < s.Concurrency; i++ {
-		go s.worker(i, jobs, results)
-	}
+	go s.worker(jobs, results)
 
 	s.wg.Add(1)
+	s.visited.Set(s.ScrapeOptions.URL, struct{}{})
 	jobs <- target{url: s.ScrapeOptions.URL, depth: s.ScrapeOptions.Depth}
 
 	go func() {
@@ -103,33 +107,37 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
 	return scraperesults
 }
 
-func (s *Scraper) worker(id int, jobs chan target, results chan<- result) {
-	for j := range jobs {
-		res := s.process(j)
-
-		if j.depth > 0 {
-			for _, l := range res.links {
-				if _, ok := s.visited.Get(l); ok {
-					continue
-				}
-
-				if !s.isURLAllowed(l) {
-					continue
-				}
-
-				s.wg.Add(1)
-				select {
-				case jobs <- target{url: l, depth: j.depth - 1}:
-					s.visited.Set(l, struct{}{})
-				default:
-					log.Println("queue is full, can't add url:", l)
-					s.wg.Done()
+func (s *Scraper) worker(jobs chan target, results chan<- result) {
+	rate := time.Duration(float64(time.Second) / s.ScrapeOptions.Rate)
+	for j := range leakychan(jobs, rate) {
+		j := j
+		go func() {
+			res := s.process(j)
+
+			if j.depth > 0 {
+				for _, l := range res.links {
+					if _, ok := s.visited.Get(l); ok {
+						continue
+					}
+
+					if !s.isURLAllowed(l) {
+						continue
+					}
+
+					s.wg.Add(1)
+					select {
+					case jobs <- target{url: l, depth: j.depth - 1}:
+						s.visited.Set(l, struct{}{})
+					default:
+						log.Println("queue is full, can't add url:", l)
+						s.wg.Done()
+					}
 				}
 			}
-		}
 
-		results <- res
-		s.wg.Done()
+			results <- res
+			s.wg.Done()
+		}()
 	}
 }
 
@@ -208,3 +216,26 @@ func isValidLink(link *url.URL) bool {
 
 	return true
 }
+
+func leakychan[T any](in chan T, rate time.Duration) chan T {
+	ticker := time.NewTicker(rate)
+	sem := make(chan struct{}, 1)
+	c := make(chan T)
+
+	go func() {
+		for range ticker.C {
+			sem <- struct{}{}
+		}
+	}()
+
+	go func() {
+		for v := range in {
+			<-sem
+			c <- v
+		}
+		ticker.Stop()
+		close(c)
+	}()
+
+	return c
+}
```
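For readers skimming the diff: the new `rate` option is converted to a per-request interval (`time.Duration(float64(time.Second) / rate)`, so `rate: 100` means one request every 10ms), and `leakychan` enforces that interval by refilling a one-slot token channel from a ticker and forwarding a queued job only once a token is available. The sketch below is a minimal, self-contained illustration of that pattern; the `main` driver and its rate of 5 requests per second are made up for demonstration, only `leakychan` itself mirrors the helper added above.

```go
package main

import (
	"fmt"
	"time"
)

// leakychan mirrors the helper added in this commit: a ticker refills a
// one-slot token channel at the configured interval, and each value read
// from `in` waits for a token before being forwarded, capping throughput
// at one item per interval.
func leakychan[T any](in chan T, rate time.Duration) chan T {
	ticker := time.NewTicker(rate)
	sem := make(chan struct{}, 1)
	c := make(chan T)

	// Refill the single token on every tick.
	go func() {
		for range ticker.C {
			sem <- struct{}{}
		}
	}()

	// Forward values only after a token has been taken.
	go func() {
		for v := range in {
			<-sem
			c <- v
		}
		ticker.Stop()
		close(c)
	}()

	return c
}

func main() {
	// Illustrative only: 5 requests per second -> 200ms between jobs.
	const reqPerSec = 5.0
	interval := time.Duration(float64(time.Second) / reqPerSec)

	jobs := make(chan int, 16)
	for i := 0; i < 10; i++ {
		jobs <- i
	}
	close(jobs)

	start := time.Now()
	for j := range leakychan(jobs, interval) {
		fmt.Printf("%v\tjob %d\n", time.Since(start).Round(time.Millisecond), j)
	}
}
```

With the default of `rate: 100` applied in `Scrape`, the computed interval works out to 10ms between dequeued jobs, and each dequeued job is processed in its own goroutine, so a slow page no longer holds up the rest of the queue.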