diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-09-24 23:36:00 +0200 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-09-24 23:36:00 +0200 |
| commit | bd9e7f7acfd855d4685aa4544169c0e29cdbf205 (patch) | |
| tree | c5218c65359c0c2dee5a8db2670f30db677f068a /modules | |
| parent | 08df9258a532b653c243e077e82491dbe62ad854 (diff) | |
clean up modules
Diffstat (limited to 'modules')
| -rw-r--r-- | modules/depth/depth.go | 9 | ||||
| -rw-r--r-- | modules/depth/depth_test.go | 14 | ||||
| -rw-r--r-- | modules/domainfilter/domainfilter.go | 5 | ||||
| -rw-r--r-- | modules/domainfilter/domainfilter_test.go | 8 | ||||
| -rw-r--r-- | modules/followlinks/followlinks.go | 11 | ||||
| -rw-r--r-- | modules/followlinks/followlinks_test.go | 2 | ||||
| -rw-r--r-- | modules/jsonprinter/jsonprinter.go | 22 | ||||
| -rw-r--r-- | modules/jsonprinter/jsonprinter_test.go | 47 | ||||
| -rw-r--r-- | modules/ratelimit/ratelimit.go | 5 | ||||
| -rw-r--r-- | modules/ratelimit/ratelimit_test.go | 2 | ||||
| -rw-r--r-- | modules/starturl/starturl.go | 9 | ||||
| -rw-r--r-- | modules/starturl/starturl_test.go | 2 | ||||
| -rw-r--r-- | modules/urlfilter/urlfilter.go | 5 |
13 files changed, 34 insertions, 107 deletions
diff --git a/modules/depth/depth.go b/modules/depth/depth.go index 5efedc8..0cfbc71 100644 --- a/modules/depth/depth.go +++ b/modules/depth/depth.go @@ -16,15 +16,8 @@ type Module struct { Depth int `json:"depth"` } -func (m *Module) ID() string { - return "depth" -} - func (m *Module) CanRequest(url string, depth int) bool { return depth <= m.Depth } -var ( - _ flyscrape.Module = (*Module)(nil) - _ flyscrape.CanRequest = (*Module)(nil) -) +var _ flyscrape.CanRequest = (*Module)(nil) diff --git a/modules/depth/depth_test.go b/modules/depth/depth_test.go index 309e628..c9afd6f 100644 --- a/modules/depth/depth_test.go +++ b/modules/depth/depth_test.go @@ -17,17 +17,17 @@ import ( func TestDepth(t *testing.T) { scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"}) + scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"}) scraper.LoadModule(&followlinks.Module{}) scraper.LoadModule(&depth.Module{Depth: 2}) scraper.SetTransport(func(r *http.Request) (*http.Response, error) { switch r.URL.String() { - case "http://www.example.com/": + case "http://www.example.com": return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`) - case "http://www.google.com/": + case "http://www.google.com": return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`) - case "http://www.duckduckgo.com/": + case "http://www.duckduckgo.com": return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`) } return flyscrape.MockResponse(200, "") @@ -41,7 +41,7 @@ func TestDepth(t *testing.T) { scraper.Run() require.Len(t, urls, 3) - require.Contains(t, urls, "http://www.example.com/") - require.Contains(t, urls, "http://www.google.com/") - require.Contains(t, urls, "http://www.duckduckgo.com/") + require.Contains(t, urls, "http://www.example.com") + require.Contains(t, urls, "http://www.google.com") + require.Contains(t, urls, "http://www.duckduckgo.com") } diff --git a/modules/domainfilter/domainfilter.go b/modules/domainfilter/domainfilter.go index b892882..ba9ebe6 100644 --- a/modules/domainfilter/domainfilter.go +++ b/modules/domainfilter/domainfilter.go @@ -19,10 +19,6 @@ type Module struct { BlockedDomains []string `json:"blockedDomains"` } -func (m *Module) ID() string { - return "domainfilter" -} - func (m *Module) OnLoad(v flyscrape.Visitor) { if u, err := url.Parse(m.URL); err == nil { m.AllowedDomains = append(m.AllowedDomains, u.Host()) @@ -56,7 +52,6 @@ func (m *Module) CanRequest(rawurl string, depth int) bool { } var ( - _ flyscrape.Module = (*Module)(nil) _ flyscrape.CanRequest = (*Module)(nil) _ flyscrape.OnLoad = (*Module)(nil) ) diff --git a/modules/domainfilter/domainfilter_test.go b/modules/domainfilter/domainfilter_test.go index 97bdc9c..884a89f 100644 --- a/modules/domainfilter/domainfilter_test.go +++ b/modules/domainfilter/domainfilter_test.go @@ -36,7 +36,7 @@ func TestDomainfilterAllowed(t *testing.T) { require.Len(t, urls, 2) require.Contains(t, urls, "http://www.example.com") - require.Contains(t, urls, "http://www.google.com/") + require.Contains(t, urls, "http://www.google.com") } func TestDomainfilterAllowedAll(t *testing.T) { @@ -61,8 +61,8 @@ func TestDomainfilterAllowedAll(t *testing.T) { require.Len(t, urls, 3) require.Contains(t, urls, "http://www.example.com") - require.Contains(t, urls, "http://www.duckduckgo.com/") - require.Contains(t, urls, "http://www.google.com/") + require.Contains(t, urls, "http://www.duckduckgo.com") + require.Contains(t, urls, "http://www.google.com") } func TestDomainfilterBlocked(t *testing.T) { @@ -88,5 +88,5 @@ func TestDomainfilterBlocked(t *testing.T) { require.Len(t, urls, 2) require.Contains(t, urls, "http://www.example.com") - require.Contains(t, urls, "http://www.duckduckgo.com/") + require.Contains(t, urls, "http://www.duckduckgo.com") } diff --git a/modules/followlinks/followlinks.go b/modules/followlinks/followlinks.go index dde0e90..99d6cee 100644 --- a/modules/followlinks/followlinks.go +++ b/modules/followlinks/followlinks.go @@ -14,17 +14,10 @@ func init() { type Module struct{} -func (m *Module) ID() string { - return "followlinks" -} - func (m *Module) OnResponse(resp *flyscrape.Response) { - for _, link := range flyscrape.ParseLinks(resp.HTML, resp.URL) { + for _, link := range flyscrape.ParseLinks(string(resp.Body), resp.Request.URL) { resp.Visit(link) } } -var ( - _ flyscrape.Module = (*Module)(nil) - _ flyscrape.OnResponse = (*Module)(nil) -) +var _ flyscrape.OnResponse = (*Module)(nil) diff --git a/modules/followlinks/followlinks_test.go b/modules/followlinks/followlinks_test.go index 03c3a6b..18c8ceb 100644 --- a/modules/followlinks/followlinks_test.go +++ b/modules/followlinks/followlinks_test.go @@ -34,6 +34,6 @@ func TestFollowLinks(t *testing.T) { require.Contains(t, urls, "http://www.example.com/baz") require.Contains(t, urls, "http://www.example.com/foo/bar") require.Contains(t, urls, "http://www.example.com/foo/baz") - require.Contains(t, urls, "http://www.google.com/") + require.Contains(t, urls, "http://www.google.com") require.Contains(t, urls, "http://www.google.com/baz") } diff --git a/modules/jsonprinter/jsonprinter.go b/modules/jsonprinter/jsonprinter.go index 3936277..3026f29 100644 --- a/modules/jsonprinter/jsonprinter.go +++ b/modules/jsonprinter/jsonprinter.go @@ -6,6 +6,7 @@ package jsonprinter import ( "fmt" + "time" "github.com/philippta/flyscrape" ) @@ -18,10 +19,6 @@ type Module struct { first bool } -func (m *Module) ID() string { - return "jsonprinter" -} - func (m *Module) OnResponse(resp *flyscrape.Response) { if resp.Error == nil && resp.Data == nil { return @@ -33,15 +30,28 @@ func (m *Module) OnResponse(resp *flyscrape.Response) { fmt.Println(",") } - fmt.Print(flyscrape.PrettyPrint(resp.ScrapeResult, " ")) + o := output{ + URL: resp.Request.URL, + Data: resp.Data, + Error: resp.Error, + Timestamp: time.Now(), + } + + fmt.Print(flyscrape.PrettyPrint(o, " ")) } func (m *Module) OnComplete() { fmt.Println("\n]") } +type output struct { + URL string `json:"url,omitempty"` + Data any `json:"data,omitempty"` + Error error `json:"error,omitempty"` + Timestamp time.Time `json:"timestamp,omitempty"` +} + var ( - _ flyscrape.Module = (*Module)(nil) _ flyscrape.OnResponse = (*Module)(nil) _ flyscrape.OnComplete = (*Module)(nil) ) diff --git a/modules/jsonprinter/jsonprinter_test.go b/modules/jsonprinter/jsonprinter_test.go deleted file mode 100644 index 29cc438..0000000 --- a/modules/jsonprinter/jsonprinter_test.go +++ /dev/null @@ -1,47 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at http://mozilla.org/MPL/2.0/. - -package jsonprinter_test - -import ( - "net/http" - "testing" - - "github.com/philippta/flyscrape" - "github.com/philippta/flyscrape/modules/depth" - "github.com/philippta/flyscrape/modules/followlinks" - "github.com/philippta/flyscrape/modules/starturl" - "github.com/stretchr/testify/require" -) - -func TestDepth(t *testing.T) { - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"}) - scraper.LoadModule(&followlinks.Module{}) - scraper.LoadModule(&depth.Module{Depth: 2}) - - scraper.SetTransport(func(r *http.Request) (*http.Response, error) { - switch r.URL.String() { - case "http://www.example.com/": - return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`) - case "http://www.google.com/": - return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`) - case "http://www.duckduckgo.com/": - return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`) - } - return flyscrape.MockResponse(200, "") - }) - - var urls []string - scraper.OnRequest(func(req *flyscrape.Request) { - urls = append(urls, req.URL) - }) - - scraper.Run() - - require.Len(t, urls, 3) - require.Contains(t, urls, "http://www.example.com/") - require.Contains(t, urls, "http://www.google.com/") - require.Contains(t, urls, "http://www.duckduckgo.com/") -} diff --git a/modules/ratelimit/ratelimit.go b/modules/ratelimit/ratelimit.go index b02f5d5..be622f6 100644 --- a/modules/ratelimit/ratelimit.go +++ b/modules/ratelimit/ratelimit.go @@ -21,10 +21,6 @@ type Module struct { semaphore chan struct{} } -func (m *Module) ID() string { - return "ratelimit" -} - func (m *Module) OnLoad(v flyscrape.Visitor) { rate := time.Duration(float64(time.Second) / m.Rate) @@ -47,7 +43,6 @@ func (m *Module) OnComplete() { } var ( - _ flyscrape.Module = (*Module)(nil) _ flyscrape.OnRequest = (*Module)(nil) _ flyscrape.OnLoad = (*Module)(nil) _ flyscrape.OnComplete = (*Module)(nil) diff --git a/modules/ratelimit/ratelimit_test.go b/modules/ratelimit/ratelimit_test.go index c166371..5e91f8f 100644 --- a/modules/ratelimit/ratelimit_test.go +++ b/modules/ratelimit/ratelimit_test.go @@ -17,7 +17,7 @@ import ( func TestRatelimit(t *testing.T) { scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"}) + scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"}) scraper.LoadModule(&followlinks.Module{}) scraper.LoadModule(&ratelimit.Module{ Rate: 100, diff --git a/modules/starturl/starturl.go b/modules/starturl/starturl.go index b2e6c47..109d28f 100644 --- a/modules/starturl/starturl.go +++ b/modules/starturl/starturl.go @@ -16,15 +16,8 @@ type Module struct { URL string `json:"url"` } -func (m *Module) ID() string { - return "starturl" -} - func (m *Module) OnLoad(v flyscrape.Visitor) { v.Visit(m.URL) } -var ( - _ flyscrape.Module = (*Module)(nil) - _ flyscrape.OnLoad = (*Module)(nil) -) +var _ flyscrape.OnLoad = (*Module)(nil) diff --git a/modules/starturl/starturl_test.go b/modules/starturl/starturl_test.go index 647e197..6fab776 100644 --- a/modules/starturl/starturl_test.go +++ b/modules/starturl/starturl_test.go @@ -12,7 +12,7 @@ import ( "github.com/stretchr/testify/require" ) -func TestFollowLinks(t *testing.T) { +func TestStartURL(t *testing.T) { scraper := flyscrape.NewScraper() scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"}) scraper.SetTransport(flyscrape.MockTransport(200, "")) diff --git a/modules/urlfilter/urlfilter.go b/modules/urlfilter/urlfilter.go index 14576f0..00a4bd2 100644 --- a/modules/urlfilter/urlfilter.go +++ b/modules/urlfilter/urlfilter.go @@ -23,10 +23,6 @@ type Module struct { blockedURLsRE []*regexp.Regexp } -func (m *Module) ID() string { - return "urlfilter" -} - func (m *Module) OnLoad(v flyscrape.Visitor) { for _, pat := range m.AllowedURLs { re, err := regexp.Compile(pat) @@ -79,7 +75,6 @@ func (m *Module) CanRequest(rawurl string, depth int) bool { } var ( - _ flyscrape.Module = (*Module)(nil) _ flyscrape.CanRequest = (*Module)(nil) _ flyscrape.OnLoad = (*Module)(nil) ) |