From 1fc497fbdc79a43c62ac2e8eaf4827752dbeef8e Mon Sep 17 00:00:00 2001 From: Philipp Tanlak Date: Thu, 5 Oct 2023 14:53:37 +0200 Subject: Refactor codebase into modules --- modules/urlfilter/urlfilter.go | 35 +++++++++++++++++------ modules/urlfilter/urlfilter_test.go | 55 +++++++++++++++++++++++-------------- 2 files changed, 62 insertions(+), 28 deletions(-) (limited to 'modules/urlfilter') diff --git a/modules/urlfilter/urlfilter.go b/modules/urlfilter/urlfilter.go index 00a4bd2..1297c35 100644 --- a/modules/urlfilter/urlfilter.go +++ b/modules/urlfilter/urlfilter.go @@ -11,7 +11,7 @@ import ( ) func init() { - flyscrape.RegisterModule(new(Module)) + flyscrape.RegisterModule(Module{}) } type Module struct { @@ -23,7 +23,18 @@ type Module struct { blockedURLsRE []*regexp.Regexp } -func (m *Module) OnLoad(v flyscrape.Visitor) { +func (Module) ModuleInfo() flyscrape.ModuleInfo { + return flyscrape.ModuleInfo{ + ID: "urlfilter", + New: func() flyscrape.Module { return new(Module) }, + } +} + +func (m *Module) Provision(v flyscrape.Context) { + if m.disabled() { + return + } + for _, pat := range m.AllowedURLs { re, err := regexp.Compile(pat) if err != nil { @@ -41,9 +52,13 @@ func (m *Module) OnLoad(v flyscrape.Visitor) { } } -func (m *Module) CanRequest(rawurl string, depth int) bool { +func (m *Module) ValidateRequest(r *flyscrape.Request) bool { + if m.disabled() { + return true + } + // allow root url - if rawurl == m.URL { + if r.URL == m.URL { return true } @@ -58,14 +73,14 @@ func (m *Module) CanRequest(rawurl string, depth int) bool { } for _, re := range m.allowedURLsRE { - if re.MatchString(rawurl) { + if re.MatchString(r.URL) { ok = true break } } for _, re := range m.blockedURLsRE { - if re.MatchString(rawurl) { + if re.MatchString(r.URL) { ok = false break } @@ -74,7 +89,11 @@ func (m *Module) CanRequest(rawurl string, depth int) bool { return ok } +func (m *Module) disabled() bool { + return len(m.AllowedURLs) == 0 && len(m.BlockedURLs) == 0 +} + var ( - _ flyscrape.CanRequest = (*Module)(nil) - _ flyscrape.OnLoad = (*Module)(nil) + _ flyscrape.RequestValidator = (*Module)(nil) + _ flyscrape.Provisioner = (*Module)(nil) ) diff --git a/modules/urlfilter/urlfilter_test.go b/modules/urlfilter/urlfilter_test.go index e383a32..9ebb8a5 100644 --- a/modules/urlfilter/urlfilter_test.go +++ b/modules/urlfilter/urlfilter_test.go @@ -5,16 +5,22 @@ package urlfilter_test import ( + "net/http" + "sync" "testing" "github.com/philippta/flyscrape" "github.com/philippta/flyscrape/modules/followlinks" + "github.com/philippta/flyscrape/modules/hook" "github.com/philippta/flyscrape/modules/starturl" "github.com/philippta/flyscrape/modules/urlfilter" "github.com/stretchr/testify/require" ) func TestURLFilterAllowed(t *testing.T) { + var urls []string + var mu sync.Mutex + scraper := flyscrape.NewScraper() scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"}) scraper.LoadModule(&followlinks.Module{}) @@ -22,16 +28,19 @@ func TestURLFilterAllowed(t *testing.T) { URL: "http://www.example.com/", AllowedURLs: []string{`/foo\?id=\d+`, `/bar$`}, }) - - scraper.SetTransport(flyscrape.MockTransport(200, ` - 123 - ABC - bar - barz`)) - - var urls []string - scraper.OnRequest(func(req *flyscrape.Request) { - urls = append(urls, req.URL) + scraper.LoadModule(hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` + 123 + ABC + bar + barz`) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, }) scraper.Run() @@ -43,6 +52,9 @@ func TestURLFilterAllowed(t *testing.T) { } func TestURLFilterBlocked(t *testing.T) { + var urls []string + var mu sync.Mutex + scraper := flyscrape.NewScraper() scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"}) scraper.LoadModule(&followlinks.Module{}) @@ -50,16 +62,19 @@ func TestURLFilterBlocked(t *testing.T) { URL: "http://www.example.com/", BlockedURLs: []string{`/foo\?id=\d+`, `/bar$`}, }) - - scraper.SetTransport(flyscrape.MockTransport(200, ` - 123 - ABC - bar - barz`)) - - var urls []string - scraper.OnRequest(func(req *flyscrape.Request) { - urls = append(urls, req.URL) + scraper.LoadModule(hook.Module{ + AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { + return flyscrape.MockTransport(200, ` + 123 + ABC + bar + barz`) + }, + ReceiveResponseFn: func(r *flyscrape.Response) { + mu.Lock() + urls = append(urls, r.Request.URL) + mu.Unlock() + }, }) scraper.Run() -- cgit v1.2.3