author    Philipp Tanlak <philipp.tanlak@gmail.com>    2023-08-17 20:12:42 +0200
committer Philipp Tanlak <philipp.tanlak@gmail.com>    2023-08-17 20:12:42 +0200
commit    8812c84f32c74ac2f44af1abdb7e4e6f7cbf10b0 (patch)
tree      966d5fefdcf24e1331c4d6d562e79d099482382b
parent    c36bb2ca2a82338a822c6962f3373809b4bed814 (diff)
change rate to requests per second
-rw-r--r--  examples/esbuild.github.io.js   1
-rw-r--r--  js/template.js                  1
-rw-r--r--  scrape.go                      85
3 files changed, 60 insertions, 27 deletions
diff --git a/examples/esbuild.github.io.js b/examples/esbuild.github.io.js
index 735a61e..7839791 100644
--- a/examples/esbuild.github.io.js
+++ b/examples/esbuild.github.io.js
@@ -3,6 +3,7 @@ import { parse } from "flyscrape";
export const options = {
url: "https://esbuild.github.io/plugins/",
depth: 1,
+ rate: 100,
allowedDomains: [
"esbuild.github.io",
"nodejs.org",
diff --git a/js/template.js b/js/template.js
index d33adc5..a2b4518 100644
--- a/js/template.js
+++ b/js/template.js
@@ -4,6 +4,7 @@ export const options = {
url: "https://news.ycombinator.com/", // Specify the URL to start scraping from.
depth: 1, // Specify how deep links should be followed (0 = no follow).
allowedDomains: ["news.ycombinator.com"], // Specify the allowed domains to follow.
+ rate: 100, // Specify the request rate in requests per second.
}
export default function({ html, url }) {
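
The `rate` option added to the templates above ends up on the Go side as ScrapeOptions.Rate via its `json:"rate"` struct tag (see the scrape.go hunk below). However flyscrape actually bridges the JS options object to Go internally, the tags imply a JSON-shaped mapping; the sketch below only illustrates the field names with a hand-written payload, it is not the project's real plumbing.

package main

import (
	"encoding/json"
	"fmt"
)

// Mirrors ScrapeOptions from scrape.go after this change.
type ScrapeOptions struct {
	URL            string   `json:"url"`
	AllowedDomains []string `json:"allowedDomains"`
	Depth          int      `json:"depth"`
	Rate           float64  `json:"rate"`
}

func main() {
	// Hypothetical payload matching the template's options.
	raw := `{
		"url": "https://news.ycombinator.com/",
		"depth": 1,
		"allowedDomains": ["news.ycombinator.com"],
		"rate": 100
	}`

	var opts ScrapeOptions
	if err := json.Unmarshal([]byte(raw), &opts); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", opts) // Rate is requests per second.
}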
diff --git a/scrape.go b/scrape.go
index 3ecfe7b..6ff92dc 100644
--- a/scrape.go
+++ b/scrape.go
@@ -4,6 +4,7 @@ import (
"log"
"strings"
"sync"
+ "time"
"github.com/PuerkitoBio/goquery"
"github.com/cornelk/hashmap"
@@ -19,6 +20,7 @@ type ScrapeOptions struct {
URL string `json:"url"`
AllowedDomains []string `json:"allowedDomains"`
Depth int `json:"depth"`
+ Rate float64 `json:"rate"`
}
type ScrapeResult struct {
@@ -62,6 +64,9 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
if s.FetchFunc == nil {
s.FetchFunc = Fetch()
}
+ if s.ScrapeOptions.Rate == 0 {
+ s.ScrapeOptions.Rate = 100
+ }
if len(s.ScrapeOptions.AllowedDomains) == 0 {
u, err := url.Parse(s.ScrapeOptions.URL)
if err == nil {
@@ -75,11 +80,10 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
s.visited = hashmap.New[string, struct{}]()
s.wg = &sync.WaitGroup{}
- for i := 0; i < s.Concurrency; i++ {
- go s.worker(i, jobs, results)
- }
+ go s.worker(jobs, results)
s.wg.Add(1)
+ s.visited.Set(s.ScrapeOptions.URL, struct{}{})
jobs <- target{url: s.ScrapeOptions.URL, depth: s.ScrapeOptions.Depth}
go func() {
@@ -103,33 +107,37 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
return scraperesults
}
-func (s *Scraper) worker(id int, jobs chan target, results chan<- result) {
- for j := range jobs {
- res := s.process(j)
-
- if j.depth > 0 {
- for _, l := range res.links {
- if _, ok := s.visited.Get(l); ok {
- continue
- }
-
- if !s.isURLAllowed(l) {
- continue
- }
-
- s.wg.Add(1)
- select {
- case jobs <- target{url: l, depth: j.depth - 1}:
- s.visited.Set(l, struct{}{})
- default:
- log.Println("queue is full, can't add url:", l)
- s.wg.Done()
+func (s *Scraper) worker(jobs chan target, results chan<- result) {
+ rate := time.Duration(float64(time.Second) / s.ScrapeOptions.Rate)
+ for j := range leakychan(jobs, rate) {
+ j := j
+ go func() {
+ res := s.process(j)
+
+ if j.depth > 0 {
+ for _, l := range res.links {
+ if _, ok := s.visited.Get(l); ok {
+ continue
+ }
+
+ if !s.isURLAllowed(l) {
+ continue
+ }
+
+ s.wg.Add(1)
+ select {
+ case jobs <- target{url: l, depth: j.depth - 1}:
+ s.visited.Set(l, struct{}{})
+ default:
+ log.Println("queue is full, can't add url:", l)
+ s.wg.Done()
+ }
}
}
- }
- results <- res
- s.wg.Done()
+ results <- res
+ s.wg.Done()
+ }()
}
}
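
The rewritten worker turns the requests-per-second rate into a fixed delay between dequeued jobs and then handles each job in its own goroutine, so a slow response does not hold back the ticker. The conversion in isolation (a standalone sketch, not part of the diff):

package main

import (
	"fmt"
	"time"
)

func main() {
	// Same conversion the new worker uses: requests per second -> delay per job.
	rate := 100.0 // the default set in Scrape() when Rate is 0
	interval := time.Duration(float64(time.Second) / rate)
	fmt.Println(interval) // 10ms: one job is released from the queue every 10ms
}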
@@ -208,3 +216,26 @@ func isValidLink(link *url.URL) bool
return true
}
+
+func leakychan[T any](in chan T, rate time.Duration) chan T {
+ ticker := time.NewTicker(rate)
+ sem := make(chan struct{}, 1)
+ c := make(chan T)
+
+ go func() {
+ for range ticker.C {
+ sem <- struct{}{}
+ }
+ }()
+
+ go func() {
+ for v := range in {
+ <-sem
+ c <- v
+ }
+ ticker.Stop()
+ close(c)
+ }()
+
+ return c
+}
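
To see the pacing in isolation, here is a standalone sketch that copies leakychan from the hunk above and drains a small buffered channel through it; with a 200ms interval the values come out roughly five per second. The timings printed are approximate.

package main

import (
	"fmt"
	"time"
)

// Copy of the leakychan helper added in this commit: values from in are
// forwarded to the returned channel at most once per rate interval.
func leakychan[T any](in chan T, rate time.Duration) chan T {
	ticker := time.NewTicker(rate)
	sem := make(chan struct{}, 1)
	c := make(chan T)

	go func() {
		for range ticker.C {
			sem <- struct{}{}
		}
	}()

	go func() {
		for v := range in {
			<-sem
			c <- v
		}
		ticker.Stop()
		close(c)
	}()

	return c
}

func main() {
	in := make(chan int, 10)
	for i := 0; i < 5; i++ {
		in <- i
	}
	close(in)

	start := time.Now()
	// 200ms per value, i.e. a rate of 5 requests per second.
	for v := range leakychan(in, 200*time.Millisecond) {
		fmt.Printf("%d after %v\n", v, time.Since(start).Round(10*time.Millisecond))
	}
}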