author    Philipp Tanlak <philipp.tanlak@gmail.com>    2023-08-17 20:12:42 +0200
committer Philipp Tanlak <philipp.tanlak@gmail.com>    2023-08-17 20:12:42 +0200
commit    8812c84f32c74ac2f44af1abdb7e4e6f7cbf10b0 (patch)
tree      966d5fefdcf24e1331c4d6d562e79d099482382b
parent    c36bb2ca2a82338a822c6962f3373809b4bed814 (diff)
change rate to requests per second
-rw-r--r--  examples/esbuild.github.io.js   1
-rw-r--r--  js/template.js                  1
-rw-r--r--  scrape.go                      85
3 files changed, 60 insertions, 27 deletions
diff --git a/examples/esbuild.github.io.js b/examples/esbuild.github.io.js
index 735a61e..7839791 100644
--- a/examples/esbuild.github.io.js
+++ b/examples/esbuild.github.io.js
@@ -3,6 +3,7 @@ import { parse } from "flyscrape";
export const options = {
url: "https://esbuild.github.io/plugins/",
depth: 1,
+ rate: 100,
allowedDomains: [
"esbuild.github.io",
"nodejs.org",
diff --git a/js/template.js b/js/template.js
index d33adc5..a2b4518 100644
--- a/js/template.js
+++ b/js/template.js
@@ -4,6 +4,7 @@ export const options = {
url: "https://news.ycombinator.com/", // Specify the URL to start scraping from.
depth: 1, // Specify how deep links should be followed (0 = no follow).
allowedDomains: ["news.ycombinator.com"], // Specify the allowed domains to follow.
+ rate: 100, // Specify the request rate in requests per second.
}
export default function({ html, url }) {
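
The `rate` option added to the templates above ends up on the Go side as ScrapeOptions.Rate via its `json:"rate"` struct tag (see the scrape.go hunk below). However flyscrape actually bridges the JS options object to Go internally, the tags imply a JSON-shaped mapping; the sketch below only illustrates the field names with a hand-written payload, it is not the project's real plumbing.

package main

import (
	"encoding/json"
	"fmt"
)

// Mirrors ScrapeOptions from scrape.go after this change.
type ScrapeOptions struct {
	URL            string   `json:"url"`
	AllowedDomains []string `json:"allowedDomains"`
	Depth          int      `json:"depth"`
	Rate           float64  `json:"rate"`
}

func main() {
	// Hypothetical payload matching the template's options.
	raw := `{
		"url": "https://news.ycombinator.com/",
		"depth": 1,
		"allowedDomains": ["news.ycombinator.com"],
		"rate": 100
	}`

	var opts ScrapeOptions
	if err := json.Unmarshal([]byte(raw), &opts); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", opts) // Rate is requests per second.
}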
diff --git a/scrape.go b/scrape.go
index 3ecfe7b..6ff92dc 100644
--- a/scrape.go
+++ b/scrape.go
@@ -4,6 +4,7 @@ import (
"log"
"strings"
"sync"
+ "time"
"github.com/PuerkitoBio/goquery"
"github.com/cornelk/hashmap"
@@ -19,6 +20,7 @@ type ScrapeOptions struct {
URL string `json:"url"`
AllowedDomains []string `json:"allowedDomains"`
Depth int `json:"depth"`
+ Rate float64 `json:"rate"`
}
type ScrapeResult struct {
@@ -62,6 +64,9 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
if s.FetchFunc == nil {
s.FetchFunc = Fetch()
}
+ if s.ScrapeOptions.Rate == 0 {
+ s.ScrapeOptions.Rate = 100
+ }
if len(s.ScrapeOptions.AllowedDomains) == 0 {
u, err := url.Parse(s.ScrapeOptions.URL)
if err == nil {
@@ -75,11 +80,10 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
s.visited = hashmap.New[string, struct{}]()
s.wg = &sync.WaitGroup{}
- for i := 0; i < s.Concurrency; i++ {
- go s.worker(i, jobs, results)
- }
+ go s.worker(jobs, results)
s.wg.Add(1)
+ s.visited.Set(s.ScrapeOptions.URL, struct{}{})
jobs <- target{url: s.ScrapeOptions.URL, depth: s.ScrapeOptions.Depth}
go func() {
@@ -103,33 +107,37 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
return scraperesults
}
-func (s *Scraper) worker(id int, jobs chan target, results chan<- result) {
- for j := range jobs {
- res := s.process(j)
-
- if j.depth > 0 {
- for _, l := range res.links {
- if _, ok := s.visited.Get(l); ok {
- continue
- }
-
- if !s.isURLAllowed(l) {
- continue
- }
-
- s.wg.Add(1)
- select {
- case jobs <- target{url: l, depth: j.depth - 1}:
- s.visited.Set(l, struct{}{})
- default:
- log.Println("queue is full, can't add url:", l)
- s.wg.Done()
+func (s *Scraper) worker(jobs chan target, results chan<- result) {
+ rate := time.Duration(float64(time.Second) / s.ScrapeOptions.Rate)
+ for j := range leakychan(jobs, rate) {
+ j := j
+ go func() {
+ res := s.process(j)
+
+ if j.depth > 0 {
+ for _, l := range res.links {
+ if _, ok := s.visited.Get(l); ok {
+ continue
+ }
+
+ if !s.isURLAllowed(l) {
+ continue
+ }
+
+ s.wg.Add(1)
+ select {
+ case jobs <- target{url: l, depth: j.depth - 1}:
+ s.visited.Set(l, struct{}{})
+ default:
+ log.Println("queue is full, can't add url:", l)
+ s.wg.Done()
+ }
}
}
- }
- results <- res
- s.wg.Done()
+ results <- res
+ s.wg.Done()
+ }()
}
}
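
The rewritten worker turns the requests-per-second rate into a fixed delay between dequeued jobs and then handles each job in its own goroutine, so a slow response does not hold back the ticker. The conversion in isolation (a standalone sketch, not part of the diff):

package main

import (
	"fmt"
	"time"
)

func main() {
	// Same conversion the new worker uses: requests per second -> delay per job.
	rate := 100.0 // the default set in Scrape() when Rate is 0
	interval := time.Duration(float64(time.Second) / rate)
	fmt.Println(interval) // 10ms: one job is released from the queue every 10ms
}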
@@ -208,3 +216,26 @@ func isValidLink(link *url.URL) bool
return true
}
+
+func leakychan[T any](in chan T, rate time.Duration) chan T {
+ ticker := time.NewTicker(rate)
+ sem := make(chan struct{}, 1)
+ c := make(chan T)
+
+ go func() {
+ for range ticker.C {
+ sem <- struct{}{}
+ }
+ }()
+
+ go func() {
+ for v := range in {
+ <-sem
+ c <- v
+ }
+ ticker.Stop()
+ close(c)
+ }()
+
+ return c
+}
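
To see the pacing in isolation, here is a standalone sketch that copies leakychan from the hunk above and drains a small buffered channel through it; with a 200ms interval the values come out roughly five per second. The timings printed are approximate.

package main

import (
	"fmt"
	"time"
)

// Copy of the leakychan helper added in this commit: values from in are
// forwarded to the returned channel at most once per rate interval.
func leakychan[T any](in chan T, rate time.Duration) chan T {
	ticker := time.NewTicker(rate)
	sem := make(chan struct{}, 1)
	c := make(chan T)

	go func() {
		for range ticker.C {
			sem <- struct{}{}
		}
	}()

	go func() {
		for v := range in {
			<-sem
			c <- v
		}
		ticker.Stop()
		close(c)
	}()

	return c
}

func main() {
	in := make(chan int, 10)
	for i := 0; i < 5; i++ {
		in <- i
	}
	close(in)

	start := time.Now()
	// 200ms per value, i.e. a rate of 5 requests per second.
	for v := range leakychan(in, 200*time.Millisecond) {
		fmt.Printf("%d after %v\n", v, time.Since(start).Round(10*time.Millisecond))
	}
}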