summary | refs | log | tree | commit | diff
path: root/modules
diff options
context:
space:
mode:
Diffstat (limited to 'modules')
-rw-r--r-- modules/depth/depth.go 9
-rw-r--r-- modules/depth/depth_test.go 14
-rw-r--r-- modules/domainfilter/domainfilter.go 5
-rw-r--r-- modules/domainfilter/domainfilter_test.go 8
-rw-r--r-- modules/followlinks/followlinks.go 11
-rw-r--r-- modules/followlinks/followlinks_test.go 2
-rw-r--r-- modules/jsonprinter/jsonprinter.go 22
-rw-r--r-- modules/jsonprinter/jsonprinter_test.go 47
-rw-r--r-- modules/ratelimit/ratelimit.go 5
-rw-r--r-- modules/ratelimit/ratelimit_test.go 2
-rw-r--r-- modules/starturl/starturl.go 9
-rw-r--r-- modules/starturl/starturl_test.go 2
-rw-r--r-- modules/urlfilter/urlfilter.go 5
13 files changed, 34 insertions, 107 deletions
diff --git a/modules/depth/depth.go b/modules/depth/depth.go
index 5efedc8..0cfbc71 100644
--- a/modules/depth/depth.go
+++ b/modules/depth/depth.go
@@ -16,15 +16,8 @@ type Module struct {
Depth int `json:"depth"`
}
-func (m *Module) ID() string {
- return "depth"
-}
-
func (m *Module) CanRequest(url string, depth int) bool {
return depth <= m.Depth
}
-var (
- _ flyscrape.Module = (*Module)(nil)
- _ flyscrape.CanRequest = (*Module)(nil)
-)
+var _ flyscrape.CanRequest = (*Module)(nil)
diff --git a/modules/depth/depth_test.go b/modules/depth/depth_test.go
index 309e628..c9afd6f 100644
--- a/modules/depth/depth_test.go
+++ b/modules/depth/depth_test.go
@@ -17,17 +17,17 @@ import (
func TestDepth(t *testing.T) {
scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
+ scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
scraper.LoadModule(&followlinks.Module{})
scraper.LoadModule(&depth.Module{Depth: 2})
scraper.SetTransport(func(r *http.Request) (*http.Response, error) {
switch r.URL.String() {
- case "http://www.example.com/":
+ case "http://www.example.com":
return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`)
- case "http://www.google.com/":
+ case "http://www.google.com":
return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
- case "http://www.duckduckgo.com/":
+ case "http://www.duckduckgo.com":
return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`)
}
return flyscrape.MockResponse(200, "")
@@ -41,7 +41,7 @@ func TestDepth(t *testing.T) {
scraper.Run()
require.Len(t, urls, 3)
- require.Contains(t, urls, "http://www.example.com/")
- require.Contains(t, urls, "http://www.google.com/")
- require.Contains(t, urls, "http://www.duckduckgo.com/")
+ require.Contains(t, urls, "http://www.example.com")
+ require.Contains(t, urls, "http://www.google.com")
+ require.Contains(t, urls, "http://www.duckduckgo.com")
}
diff --git a/modules/domainfilter/domainfilter.go b/modules/domainfilter/domainfilter.go
index b892882..ba9ebe6 100644
--- a/modules/domainfilter/domainfilter.go
+++ b/modules/domainfilter/domainfilter.go
@@ -19,10 +19,6 @@ type Module struct {
BlockedDomains []string `json:"blockedDomains"`
}
-func (m *Module) ID() string {
- return "domainfilter"
-}
-
func (m *Module) OnLoad(v flyscrape.Visitor) {
if u, err := url.Parse(m.URL); err == nil {
m.AllowedDomains = append(m.AllowedDomains, u.Host())
@@ -56,7 +52,6 @@ func (m *Module) CanRequest(rawurl string, depth int) bool {
}
var (
- _ flyscrape.Module = (*Module)(nil)
_ flyscrape.CanRequest = (*Module)(nil)
_ flyscrape.OnLoad = (*Module)(nil)
)
diff --git a/modules/domainfilter/domainfilter_test.go b/modules/domainfilter/domainfilter_test.go
index 97bdc9c..884a89f 100644
--- a/modules/domainfilter/domainfilter_test.go
+++ b/modules/domainfilter/domainfilter_test.go
@@ -36,7 +36,7 @@ func TestDomainfilterAllowed(t *testing.T) {
require.Len(t, urls, 2)
require.Contains(t, urls, "http://www.example.com")
- require.Contains(t, urls, "http://www.google.com/")
+ require.Contains(t, urls, "http://www.google.com")
}
func TestDomainfilterAllowedAll(t *testing.T) {
@@ -61,8 +61,8 @@ func TestDomainfilterAllowedAll(t *testing.T) {
require.Len(t, urls, 3)
require.Contains(t, urls, "http://www.example.com")
- require.Contains(t, urls, "http://www.duckduckgo.com/")
- require.Contains(t, urls, "http://www.google.com/")
+ require.Contains(t, urls, "http://www.duckduckgo.com")
+ require.Contains(t, urls, "http://www.google.com")
}
func TestDomainfilterBlocked(t *testing.T) {
@@ -88,5 +88,5 @@ func TestDomainfilterBlocked(t *testing.T) {
require.Len(t, urls, 2)
require.Contains(t, urls, "http://www.example.com")
- require.Contains(t, urls, "http://www.duckduckgo.com/")
+ require.Contains(t, urls, "http://www.duckduckgo.com")
}
diff --git a/modules/followlinks/followlinks.go b/modules/followlinks/followlinks.go
index dde0e90..99d6cee 100644
--- a/modules/followlinks/followlinks.go
+++ b/modules/followlinks/followlinks.go
@@ -14,17 +14,10 @@ func init() {
type Module struct{}
-func (m *Module) ID() string {
- return "followlinks"
-}
-
func (m *Module) OnResponse(resp *flyscrape.Response) {
- for _, link := range flyscrape.ParseLinks(resp.HTML, resp.URL) {
+ for _, link := range flyscrape.ParseLinks(string(resp.Body), resp.Request.URL) {
resp.Visit(link)
}
}
-var (
- _ flyscrape.Module = (*Module)(nil)
- _ flyscrape.OnResponse = (*Module)(nil)
-)
+var _ flyscrape.OnResponse = (*Module)(nil)
diff --git a/modules/followlinks/followlinks_test.go b/modules/followlinks/followlinks_test.go
index 03c3a6b..18c8ceb 100644
--- a/modules/followlinks/followlinks_test.go
+++ b/modules/followlinks/followlinks_test.go
@@ -34,6 +34,6 @@ func TestFollowLinks(t *testing.T) {
require.Contains(t, urls, "http://www.example.com/baz")
require.Contains(t, urls, "http://www.example.com/foo/bar")
require.Contains(t, urls, "http://www.example.com/foo/baz")
- require.Contains(t, urls, "http://www.google.com/")
+ require.Contains(t, urls, "http://www.google.com")
require.Contains(t, urls, "http://www.google.com/baz")
}
diff --git a/modules/jsonprinter/jsonprinter.go b/modules/jsonprinter/jsonprinter.go
index 3936277..3026f29 100644
--- a/modules/jsonprinter/jsonprinter.go
+++ b/modules/jsonprinter/jsonprinter.go
@@ -6,6 +6,7 @@ package jsonprinter
import (
"fmt"
+ "time"
"github.com/philippta/flyscrape"
)
@@ -18,10 +19,6 @@ type Module struct {
first bool
}
-func (m *Module) ID() string {
- return "jsonprinter"
-}
-
func (m *Module) OnResponse(resp *flyscrape.Response) {
if resp.Error == nil && resp.Data == nil {
return
@@ -33,15 +30,28 @@ func (m *Module) OnResponse(resp *flyscrape.Response) {
fmt.Println(",")
}
- fmt.Print(flyscrape.PrettyPrint(resp.ScrapeResult, " "))
+ o := output{
+ URL: resp.Request.URL,
+ Data: resp.Data,
+ Error: resp.Error,
+ Timestamp: time.Now(),
+ }
+
+ fmt.Print(flyscrape.PrettyPrint(o, " "))
}
func (m *Module) OnComplete() {
fmt.Println("\n]")
}
+type output struct {
+ URL string `json:"url,omitempty"`
+ Data any `json:"data,omitempty"`
+ Error error `json:"error,omitempty"`
+ Timestamp time.Time `json:"timestamp,omitempty"`
+}
+
var (
- _ flyscrape.Module = (*Module)(nil)
_ flyscrape.OnResponse = (*Module)(nil)
_ flyscrape.OnComplete = (*Module)(nil)
)
diff --git a/modules/jsonprinter/jsonprinter_test.go b/modules/jsonprinter/jsonprinter_test.go
deleted file mode 100644
index 29cc438..0000000
--- a/modules/jsonprinter/jsonprinter_test.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License, v. 2.0. If a copy of the MPL was not distributed with this
-// file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-package jsonprinter_test
-
-import (
- "net/http"
- "testing"
-
- "github.com/philippta/flyscrape"
- "github.com/philippta/flyscrape/modules/depth"
- "github.com/philippta/flyscrape/modules/followlinks"
- "github.com/philippta/flyscrape/modules/starturl"
- "github.com/stretchr/testify/require"
-)
-
-func TestDepth(t *testing.T) {
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
- scraper.LoadModule(&followlinks.Module{})
- scraper.LoadModule(&depth.Module{Depth: 2})
-
- scraper.SetTransport(func(r *http.Request) (*http.Response, error) {
- switch r.URL.String() {
- case "http://www.example.com/":
- return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`)
- case "http://www.google.com/":
- return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
- case "http://www.duckduckgo.com/":
- return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`)
- }
- return flyscrape.MockResponse(200, "")
- })
-
- var urls []string
- scraper.OnRequest(func(req *flyscrape.Request) {
- urls = append(urls, req.URL)
- })
-
- scraper.Run()
-
- require.Len(t, urls, 3)
- require.Contains(t, urls, "http://www.example.com/")
- require.Contains(t, urls, "http://www.google.com/")
- require.Contains(t, urls, "http://www.duckduckgo.com/")
-}
diff --git a/modules/ratelimit/ratelimit.go b/modules/ratelimit/ratelimit.go
index b02f5d5..be622f6 100644
--- a/modules/ratelimit/ratelimit.go
+++ b/modules/ratelimit/ratelimit.go
@@ -21,10 +21,6 @@ type Module struct {
semaphore chan struct{}
}
-func (m *Module) ID() string {
- return "ratelimit"
-}
-
func (m *Module) OnLoad(v flyscrape.Visitor) {
rate := time.Duration(float64(time.Second) / m.Rate)
@@ -47,7 +43,6 @@ func (m *Module) OnComplete() {
}
var (
- _ flyscrape.Module = (*Module)(nil)
_ flyscrape.OnRequest = (*Module)(nil)
_ flyscrape.OnLoad = (*Module)(nil)
_ flyscrape.OnComplete = (*Module)(nil)
diff --git a/modules/ratelimit/ratelimit_test.go b/modules/ratelimit/ratelimit_test.go
index c166371..5e91f8f 100644
--- a/modules/ratelimit/ratelimit_test.go
+++ b/modules/ratelimit/ratelimit_test.go
@@ -17,7 +17,7 @@ import (
func TestRatelimit(t *testing.T) {
scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
+ scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
scraper.LoadModule(&followlinks.Module{})
scraper.LoadModule(&ratelimit.Module{
Rate: 100,
diff --git a/modules/starturl/starturl.go b/modules/starturl/starturl.go
index b2e6c47..109d28f 100644
--- a/modules/starturl/starturl.go
+++ b/modules/starturl/starturl.go
@@ -16,15 +16,8 @@ type Module struct {
URL string `json:"url"`
}
-func (m *Module) ID() string {
- return "starturl"
-}
-
func (m *Module) OnLoad(v flyscrape.Visitor) {
v.Visit(m.URL)
}
-var (
- _ flyscrape.Module = (*Module)(nil)
- _ flyscrape.OnLoad = (*Module)(nil)
-)
+var _ flyscrape.OnLoad = (*Module)(nil)
diff --git a/modules/starturl/starturl_test.go b/modules/starturl/starturl_test.go
index 647e197..6fab776 100644
--- a/modules/starturl/starturl_test.go
+++ b/modules/starturl/starturl_test.go
@@ -12,7 +12,7 @@ import (
"github.com/stretchr/testify/require"
)
-func TestFollowLinks(t *testing.T) {
+func TestStartURL(t *testing.T) {
scraper := flyscrape.NewScraper()
scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"})
scraper.SetTransport(flyscrape.MockTransport(200, ""))
diff --git a/modules/urlfilter/urlfilter.go b/modules/urlfilter/urlfilter.go
index 14576f0..00a4bd2 100644
--- a/modules/urlfilter/urlfilter.go
+++ b/modules/urlfilter/urlfilter.go
@@ -23,10 +23,6 @@ type Module struct {
blockedURLsRE []*regexp.Regexp
}
-func (m *Module) ID() string {
- return "urlfilter"
-}
-
func (m *Module) OnLoad(v flyscrape.Visitor) {
for _, pat := range m.AllowedURLs {
re, err := regexp.Compile(pat)
@@ -79,7 +75,6 @@ func (m *Module) CanRequest(rawurl string, depth int) bool {
}
var (
- _ flyscrape.Module = (*Module)(nil)
_ flyscrape.CanRequest = (*Module)(nil)
_ flyscrape.OnLoad = (*Module)(nil)
)