-rw-r--r--  README.md                                   2
-rw-r--r--  cmd/flyscrape/dev.go                        6
-rw-r--r--  cmd/flyscrape/run.go                        4
-rw-r--r--  js.go                                      14
-rw-r--r--  js/template.js                              2
-rw-r--r--  js_test.go                                 26
-rw-r--r--  module.go                                  33
-rw-r--r--  modules/depth/depth.go                      9
-rw-r--r--  modules/depth/depth_test.go                14
-rw-r--r--  modules/domainfilter/domainfilter.go        5
-rw-r--r--  modules/domainfilter/domainfilter_test.go   8
-rw-r--r--  modules/followlinks/followlinks.go         11
-rw-r--r--  modules/followlinks/followlinks_test.go     2
-rw-r--r--  modules/jsonprinter/jsonprinter.go         22
-rw-r--r--  modules/jsonprinter/jsonprinter_test.go    47
-rw-r--r--  modules/ratelimit/ratelimit.go              5
-rw-r--r--  modules/ratelimit/ratelimit_test.go         2
-rw-r--r--  modules/starturl/starturl.go                9
-rw-r--r--  modules/starturl/starturl_test.go           2
-rw-r--r--  modules/urlfilter/urlfilter.go              5
-rw-r--r--  scrape.go                                 143
-rw-r--r--  utils.go                                    4
22 files changed, 144 insertions, 231 deletions
diff --git a/README.md b/README.md
index 3d06b1c..3021c7f 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ Below is an example scraping script that showcases the capabilities of **flyscrape**.
```javascript
import { parse } from 'flyscrape';
-export const options = {
+export const config = {
url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from.
depth: 1, // Specify how deep links should be followed. (default = 0, no follow)
allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
diff --git a/cmd/flyscrape/dev.go b/cmd/flyscrape/dev.go
index 169e6d3..95c627e 100644
--- a/cmd/flyscrape/dev.go
+++ b/cmd/flyscrape/dev.go
@@ -39,7 +39,7 @@ func (c *DevCommand) Run(args []string) error {
script := fs.Arg(0)
err := flyscrape.Watch(script, func(s string) error {
- opts, scrape, err := flyscrape.Compile(s)
+ cfg, scrape, err := flyscrape.Compile(s)
if err != nil {
screen.Clear()
screen.MoveTopLeft()
@@ -58,7 +58,7 @@ func (c *DevCommand) Run(args []string) error {
scraper := flyscrape.NewScraper()
scraper.ScrapeFunc = scrape
- flyscrape.LoadModules(scraper, opts)
+ flyscrape.LoadModules(scraper, cfg)
scraper.Run()
@@ -69,7 +69,7 @@ func (c *DevCommand) Run(args []string) error {
log.Println(resp.Error)
return
}
- fmt.Println(flyscrape.PrettyPrint(resp.ScrapeResult, ""))
+ fmt.Println(flyscrape.PrettyPrint(resp.Data, ""))
})
return nil
diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go
index 22f41fd..4580e6d 100644
--- a/cmd/flyscrape/run.go
+++ b/cmd/flyscrape/run.go
@@ -34,14 +34,14 @@ func (c *RunCommand) Run(args []string) error {
return fmt.Errorf("failed to read script %q: %w", script, err)
}
- opts, scrape, err := flyscrape.Compile(string(src))
+ cfg, scrape, err := flyscrape.Compile(string(src))
if err != nil {
return fmt.Errorf("failed to compile script: %w", err)
}
scraper := flyscrape.NewScraper()
scraper.ScrapeFunc = scrape
- flyscrape.LoadModules(scraper, opts)
+ flyscrape.LoadModules(scraper, cfg)
count := 0
start := time.Now()
diff --git a/js.go b/js.go
index ce0efc1..5343754 100644
--- a/js.go
+++ b/js.go
@@ -16,7 +16,7 @@ import (
v8 "rogchap.com/v8go"
)
-type Options []byte
+type Config []byte
type TransformError struct {
Line int
@@ -28,7 +28,7 @@ func (err TransformError) Error() string {
return fmt.Sprintf("%d:%d: %s", err.Line, err.Column, err.Text)
}
-func Compile(src string) (Options, ScrapeFunc, error) {
+func Compile(src string) (Config, ScrapeFunc, error) {
src, err := build(src)
if err != nil {
return nil, nil, err
@@ -58,7 +58,7 @@ func build(src string) (string, error) {
return string(res.Code), nil
}
-func vm(src string) (Options, ScrapeFunc, error) {
+func vm(src string) (Config, ScrapeFunc, error) {
ctx := v8.NewContext()
ctx.RunScript("var module = {}", "main.js")
@@ -72,12 +72,12 @@ func vm(src string) (Options, ScrapeFunc, error) {
return nil, nil, fmt.Errorf("running user script: %w", err)
}
- cfg, err := ctx.RunScript("JSON.stringify(options)", "main.js")
+ cfg, err := ctx.RunScript("JSON.stringify(config)", "main.js")
if err != nil {
- return nil, nil, fmt.Errorf("reading options: %w", err)
+ return nil, nil, fmt.Errorf("reading config: %w", err)
}
if !cfg.IsString() {
- return nil, nil, fmt.Errorf("options is not a string")
+ return nil, nil, fmt.Errorf("config is not a string")
}
scrape := func(params ScrapeParams) (any, error) {
@@ -97,7 +97,7 @@ func vm(src string) (Options, ScrapeFunc, error) {
return obj, nil
}
- return Options(cfg.String()), scrape, nil
+ return Config(cfg.String()), scrape, nil
}
func randSeq(n int) string {
diff --git a/js/template.js b/js/template.js
index 82196f0..1a030e5 100644
--- a/js/template.js
+++ b/js/template.js
@@ -1,6 +1,6 @@
import { parse } from 'flyscrape';
-export const options = {
+export const config = {
url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from.
depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
diff --git a/js_test.go b/js_test.go
index 7496c68..9366a15 100644
--- a/js_test.go
+++ b/js_test.go
@@ -25,7 +25,7 @@ var html = `
var script = `
import { parse } from "flyscrape";
-export const options = {
+export const config = {
url: "https://localhost/",
}
@@ -41,9 +41,9 @@ export default function({ html, url }) {
`
func TestJSScrape(t *testing.T) {
- opts, run, err := flyscrape.Compile(script)
+ cfg, run, err := flyscrape.Compile(script)
require.NoError(t, err)
- require.NotNil(t, opts)
+ require.NotNil(t, cfg)
require.NotNil(t, run)
result, err := run(flyscrape.ScrapeParams{
@@ -61,9 +61,9 @@ func TestJSScrape(t *testing.T) {
}
func TestJSCompileError(t *testing.T) {
- opts, run, err := flyscrape.Compile("import foo;")
+ cfg, run, err := flyscrape.Compile("import foo;")
require.Error(t, err)
- require.Empty(t, opts)
+ require.Empty(t, cfg)
require.Nil(t, run)
var terr flyscrape.TransformError
@@ -76,31 +76,31 @@ func TestJSCompileError(t *testing.T) {
})
}
-func TestJSOptions(t *testing.T) {
+func TestJSConfig(t *testing.T) {
js := `
- export const options = {
+ export const config = {
url: 'http://localhost/',
depth: 5,
allowedDomains: ['example.com'],
}
export default function() {}
`
- rawOpts, _, err := flyscrape.Compile(js)
+ rawCfg, _, err := flyscrape.Compile(js)
require.NoError(t, err)
- type options struct {
+ type config struct {
URL string `json:"url"`
Depth int `json:"depth"`
AllowedDomains []string `json:"allowedDomains"`
}
- var opts options
- err = json.Unmarshal(rawOpts, &opts)
+ var cfg config
+ err = json.Unmarshal(rawCfg, &cfg)
require.NoError(t, err)
- require.Equal(t, options{
+ require.Equal(t, config{
URL: "http://localhost/",
Depth: 5,
AllowedDomains: []string{"example.com"},
- }, opts)
+ }, cfg)
}
diff --git a/module.go b/module.go
index bc90c02..1839b76 100644
--- a/module.go
+++ b/module.go
@@ -2,14 +2,10 @@ package flyscrape
import (
"encoding/json"
- "fmt"
"net/http"
- "sync"
)
-type Module interface {
- ID() string
-}
+type Module any
type Transport interface {
Transport(*http.Request) (*http.Response, error)
@@ -34,32 +30,15 @@ type OnComplete interface {
OnComplete()
}
-func RegisterModule(m Module) {
- id := m.ID()
- if id == "" {
- panic("module id is missing")
- }
-
- globalModulesMu.Lock()
- defer globalModulesMu.Unlock()
-
- if _, ok := globalModules[id]; ok {
- panic(fmt.Sprintf("module %s already registered", id))
- }
- globalModules[id] = m
+func RegisterModule(mod Module) {
+ globalModules = append(globalModules, mod)
}
-func LoadModules(s *Scraper, opts Options) {
- globalModulesMu.RLock()
- defer globalModulesMu.RUnlock()
-
+func LoadModules(s *Scraper, cfg Config) {
for _, mod := range globalModules {
- json.Unmarshal(opts, mod)
+ json.Unmarshal(cfg, mod)
s.LoadModule(mod)
}
}
-var (
- globalModules = map[string]Module{}
- globalModulesMu sync.RWMutex
-)
+var globalModules = []Module{}
diff --git a/modules/depth/depth.go b/modules/depth/depth.go
index 5efedc8..0cfbc71 100644
--- a/modules/depth/depth.go
+++ b/modules/depth/depth.go
@@ -16,15 +16,8 @@ type Module struct {
Depth int `json:"depth"`
}
-func (m *Module) ID() string {
- return "depth"
-}
-
func (m *Module) CanRequest(url string, depth int) bool {
return depth <= m.Depth
}
-var (
- _ flyscrape.Module = (*Module)(nil)
- _ flyscrape.CanRequest = (*Module)(nil)
-)
+var _ flyscrape.CanRequest = (*Module)(nil)
diff --git a/modules/depth/depth_test.go b/modules/depth/depth_test.go
index 309e628..c9afd6f 100644
--- a/modules/depth/depth_test.go
+++ b/modules/depth/depth_test.go
@@ -17,17 +17,17 @@ import (
func TestDepth(t *testing.T) {
scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
+ scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
scraper.LoadModule(&followlinks.Module{})
scraper.LoadModule(&depth.Module{Depth: 2})
scraper.SetTransport(func(r *http.Request) (*http.Response, error) {
switch r.URL.String() {
- case "http://www.example.com/":
+ case "http://www.example.com":
return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`)
- case "http://www.google.com/":
+ case "http://www.google.com":
return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
- case "http://www.duckduckgo.com/":
+ case "http://www.duckduckgo.com":
return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`)
}
return flyscrape.MockResponse(200, "")
@@ -41,7 +41,7 @@ func TestDepth(t *testing.T) {
scraper.Run()
require.Len(t, urls, 3)
- require.Contains(t, urls, "http://www.example.com/")
- require.Contains(t, urls, "http://www.google.com/")
- require.Contains(t, urls, "http://www.duckduckgo.com/")
+ require.Contains(t, urls, "http://www.example.com")
+ require.Contains(t, urls, "http://www.google.com")
+ require.Contains(t, urls, "http://www.duckduckgo.com")
}
diff --git a/modules/domainfilter/domainfilter.go b/modules/domainfilter/domainfilter.go
index b892882..ba9ebe6 100644
--- a/modules/domainfilter/domainfilter.go
+++ b/modules/domainfilter/domainfilter.go
@@ -19,10 +19,6 @@ type Module struct {
BlockedDomains []string `json:"blockedDomains"`
}
-func (m *Module) ID() string {
- return "domainfilter"
-}
-
func (m *Module) OnLoad(v flyscrape.Visitor) {
if u, err := url.Parse(m.URL); err == nil {
m.AllowedDomains = append(m.AllowedDomains, u.Host())
@@ -56,7 +52,6 @@ func (m *Module) CanRequest(rawurl string, depth int) bool {
}
var (
- _ flyscrape.Module = (*Module)(nil)
_ flyscrape.CanRequest = (*Module)(nil)
_ flyscrape.OnLoad = (*Module)(nil)
)
diff --git a/modules/domainfilter/domainfilter_test.go b/modules/domainfilter/domainfilter_test.go
index 97bdc9c..884a89f 100644
--- a/modules/domainfilter/domainfilter_test.go
+++ b/modules/domainfilter/domainfilter_test.go
@@ -36,7 +36,7 @@ func TestDomainfilterAllowed(t *testing.T) {
require.Len(t, urls, 2)
require.Contains(t, urls, "http://www.example.com")
- require.Contains(t, urls, "http://www.google.com/")
+ require.Contains(t, urls, "http://www.google.com")
}
func TestDomainfilterAllowedAll(t *testing.T) {
@@ -61,8 +61,8 @@ func TestDomainfilterAllowedAll(t *testing.T) {
require.Len(t, urls, 3)
require.Contains(t, urls, "http://www.example.com")
- require.Contains(t, urls, "http://www.duckduckgo.com/")
- require.Contains(t, urls, "http://www.google.com/")
+ require.Contains(t, urls, "http://www.duckduckgo.com")
+ require.Contains(t, urls, "http://www.google.com")
}
func TestDomainfilterBlocked(t *testing.T) {
@@ -88,5 +88,5 @@ func TestDomainfilterBlocked(t *testing.T) {
require.Len(t, urls, 2)
require.Contains(t, urls, "http://www.example.com")
- require.Contains(t, urls, "http://www.duckduckgo.com/")
+ require.Contains(t, urls, "http://www.duckduckgo.com")
}
diff --git a/modules/followlinks/followlinks.go b/modules/followlinks/followlinks.go
index dde0e90..99d6cee 100644
--- a/modules/followlinks/followlinks.go
+++ b/modules/followlinks/followlinks.go
@@ -14,17 +14,10 @@ func init() {
type Module struct{}
-func (m *Module) ID() string {
- return "followlinks"
-}
-
func (m *Module) OnResponse(resp *flyscrape.Response) {
- for _, link := range flyscrape.ParseLinks(resp.HTML, resp.URL) {
+ for _, link := range flyscrape.ParseLinks(string(resp.Body), resp.Request.URL) {
resp.Visit(link)
}
}
-var (
- _ flyscrape.Module = (*Module)(nil)
- _ flyscrape.OnResponse = (*Module)(nil)
-)
+var _ flyscrape.OnResponse = (*Module)(nil)
diff --git a/modules/followlinks/followlinks_test.go b/modules/followlinks/followlinks_test.go
index 03c3a6b..18c8ceb 100644
--- a/modules/followlinks/followlinks_test.go
+++ b/modules/followlinks/followlinks_test.go
@@ -34,6 +34,6 @@ func TestFollowLinks(t *testing.T) {
require.Contains(t, urls, "http://www.example.com/baz")
require.Contains(t, urls, "http://www.example.com/foo/bar")
require.Contains(t, urls, "http://www.example.com/foo/baz")
- require.Contains(t, urls, "http://www.google.com/")
+ require.Contains(t, urls, "http://www.google.com")
require.Contains(t, urls, "http://www.google.com/baz")
}
diff --git a/modules/jsonprinter/jsonprinter.go b/modules/jsonprinter/jsonprinter.go
index 3936277..3026f29 100644
--- a/modules/jsonprinter/jsonprinter.go
+++ b/modules/jsonprinter/jsonprinter.go
@@ -6,6 +6,7 @@ package jsonprinter
import (
"fmt"
+ "time"
"github.com/philippta/flyscrape"
)
@@ -18,10 +19,6 @@ type Module struct {
first bool
}
-func (m *Module) ID() string {
- return "jsonprinter"
-}
-
func (m *Module) OnResponse(resp *flyscrape.Response) {
if resp.Error == nil && resp.Data == nil {
return
@@ -33,15 +30,28 @@ func (m *Module) OnResponse(resp *flyscrape.Response) {
fmt.Println(",")
}
- fmt.Print(flyscrape.PrettyPrint(resp.ScrapeResult, " "))
+ o := output{
+ URL: resp.Request.URL,
+ Data: resp.Data,
+ Error: resp.Error,
+ Timestamp: time.Now(),
+ }
+
+ fmt.Print(flyscrape.PrettyPrint(o, " "))
}
func (m *Module) OnComplete() {
fmt.Println("\n]")
}
+type output struct {
+ URL string `json:"url,omitempty"`
+ Data any `json:"data,omitempty"`
+ Error error `json:"error,omitempty"`
+ Timestamp time.Time `json:"timestamp,omitempty"`
+}
+
var (
- _ flyscrape.Module = (*Module)(nil)
_ flyscrape.OnResponse = (*Module)(nil)
_ flyscrape.OnComplete = (*Module)(nil)
)
diff --git a/modules/jsonprinter/jsonprinter_test.go b/modules/jsonprinter/jsonprinter_test.go
deleted file mode 100644
index 29cc438..0000000
--- a/modules/jsonprinter/jsonprinter_test.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License, v. 2.0. If a copy of the MPL was not distributed with this
-// file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-package jsonprinter_test
-
-import (
- "net/http"
- "testing"
-
- "github.com/philippta/flyscrape"
- "github.com/philippta/flyscrape/modules/depth"
- "github.com/philippta/flyscrape/modules/followlinks"
- "github.com/philippta/flyscrape/modules/starturl"
- "github.com/stretchr/testify/require"
-)
-
-func TestDepth(t *testing.T) {
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
- scraper.LoadModule(&followlinks.Module{})
- scraper.LoadModule(&depth.Module{Depth: 2})
-
- scraper.SetTransport(func(r *http.Request) (*http.Response, error) {
- switch r.URL.String() {
- case "http://www.example.com/":
- return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`)
- case "http://www.google.com/":
- return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
- case "http://www.duckduckgo.com/":
- return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`)
- }
- return flyscrape.MockResponse(200, "")
- })
-
- var urls []string
- scraper.OnRequest(func(req *flyscrape.Request) {
- urls = append(urls, req.URL)
- })
-
- scraper.Run()
-
- require.Len(t, urls, 3)
- require.Contains(t, urls, "http://www.example.com/")
- require.Contains(t, urls, "http://www.google.com/")
- require.Contains(t, urls, "http://www.duckduckgo.com/")
-}
diff --git a/modules/ratelimit/ratelimit.go b/modules/ratelimit/ratelimit.go
index b02f5d5..be622f6 100644
--- a/modules/ratelimit/ratelimit.go
+++ b/modules/ratelimit/ratelimit.go
@@ -21,10 +21,6 @@ type Module struct {
semaphore chan struct{}
}
-func (m *Module) ID() string {
- return "ratelimit"
-}
-
func (m *Module) OnLoad(v flyscrape.Visitor) {
rate := time.Duration(float64(time.Second) / m.Rate)
@@ -47,7 +43,6 @@ func (m *Module) OnComplete() {
}
var (
- _ flyscrape.Module = (*Module)(nil)
_ flyscrape.OnRequest = (*Module)(nil)
_ flyscrape.OnLoad = (*Module)(nil)
_ flyscrape.OnComplete = (*Module)(nil)
diff --git a/modules/ratelimit/ratelimit_test.go b/modules/ratelimit/ratelimit_test.go
index c166371..5e91f8f 100644
--- a/modules/ratelimit/ratelimit_test.go
+++ b/modules/ratelimit/ratelimit_test.go
@@ -17,7 +17,7 @@ import (
func TestRatelimit(t *testing.T) {
scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
+ scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
scraper.LoadModule(&followlinks.Module{})
scraper.LoadModule(&ratelimit.Module{
Rate: 100,
diff --git a/modules/starturl/starturl.go b/modules/starturl/starturl.go
index b2e6c47..109d28f 100644
--- a/modules/starturl/starturl.go
+++ b/modules/starturl/starturl.go
@@ -16,15 +16,8 @@ type Module struct {
URL string `json:"url"`
}
-func (m *Module) ID() string {
- return "starturl"
-}
-
func (m *Module) OnLoad(v flyscrape.Visitor) {
v.Visit(m.URL)
}
-var (
- _ flyscrape.Module = (*Module)(nil)
- _ flyscrape.OnLoad = (*Module)(nil)
-)
+var _ flyscrape.OnLoad = (*Module)(nil)
diff --git a/modules/starturl/starturl_test.go b/modules/starturl/starturl_test.go
index 647e197..6fab776 100644
--- a/modules/starturl/starturl_test.go
+++ b/modules/starturl/starturl_test.go
@@ -12,7 +12,7 @@ import (
"github.com/stretchr/testify/require"
)
-func TestFollowLinks(t *testing.T) {
+func TestStartURL(t *testing.T) {
scraper := flyscrape.NewScraper()
scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"})
scraper.SetTransport(flyscrape.MockTransport(200, ""))
diff --git a/modules/urlfilter/urlfilter.go b/modules/urlfilter/urlfilter.go
index 14576f0..00a4bd2 100644
--- a/modules/urlfilter/urlfilter.go
+++ b/modules/urlfilter/urlfilter.go
@@ -23,10 +23,6 @@ type Module struct {
blockedURLsRE []*regexp.Regexp
}
-func (m *Module) ID() string {
- return "urlfilter"
-}
-
func (m *Module) OnLoad(v flyscrape.Visitor) {
for _, pat := range m.AllowedURLs {
re, err := regexp.Compile(pat)
@@ -79,7 +75,6 @@ func (m *Module) CanRequest(rawurl string, depth int) bool {
}
var (
- _ flyscrape.Module = (*Module)(nil)
_ flyscrape.CanRequest = (*Module)(nil)
_ flyscrape.OnLoad = (*Module)(nil)
)
diff --git a/scrape.go b/scrape.go
index 42b3c10..4186247 100644
--- a/scrape.go
+++ b/scrape.go
@@ -8,13 +8,15 @@ import (
"io"
"log"
"net/http"
+ "net/http/cookiejar"
+ "net/url"
"strings"
"sync"
- "time"
+
+ gourl "net/url"
"github.com/PuerkitoBio/goquery"
"github.com/cornelk/hashmap"
- "github.com/nlnwa/whatwg-url/url"
)
type ScrapeParams struct {
@@ -22,18 +24,6 @@ type ScrapeParams struct {
URL string
}
-type ScrapeResult struct {
- URL string `json:"url"`
- Data any `json:"data,omitempty"`
- Links []string `json:"-"`
- Error error `json:"error,omitempty"`
- Timestamp time.Time `json:"timestamp"`
-}
-
-func (s *ScrapeResult) omit() bool {
- return s.Error == nil && s.Data == nil
-}
-
type ScrapeFunc func(ScrapeParams) (any, error)
type FetchFunc func(url string) (string, error)
@@ -43,32 +33,39 @@ type Visitor interface {
MarkVisited(url string)
}
-type (
- Request struct {
- URL string
- Depth int
- }
+type Request struct {
+ Method string
+ URL string
+ Headers http.Header
+ Cookies http.CookieJar
+ Depth int
+}
- Response struct {
- ScrapeResult
- HTML string
- Visit func(url string)
- }
+type Response struct {
+ StatusCode int
+ Headers http.Header
+ Body []byte
+ Data any
+ Error error
+ Request *Request
- target struct {
- url string
- depth int
- }
-)
+ Visit func(url string)
+}
+
+type target struct {
+ url string
+ depth int
+}
type Scraper struct {
ScrapeFunc ScrapeFunc
- opts Options
- wg sync.WaitGroup
- jobs chan target
- visited *hashmap.Map[string, struct{}]
- modules *hashmap.Map[string, Module]
+ cfg Config
+ wg sync.WaitGroup
+ jobs chan target
+ visited *hashmap.Map[string, struct{}]
+ modules *hashmap.Map[string, Module]
+ cookieJar *cookiejar.Jar
canRequestHandlers []func(url string, depth int) bool
onRequestHandlers []func(*Request)
@@ -78,6 +75,7 @@ type Scraper struct {
}
func NewScraper() *Scraper {
+ jar, _ := cookiejar.New(nil)
s := &Scraper{
jobs: make(chan target, 1024),
visited: hashmap.New[string, struct{}](),
@@ -86,6 +84,7 @@ func NewScraper() *Scraper {
r.Header.Set("User-Agent", "flyscrape/0.1")
return http.DefaultClient.Do(r)
},
+ cookieJar: jar,
}
return s
}
@@ -165,58 +164,66 @@ func (s *Scraper) worker() {
}
}
- res, html := s.process(job)
- for _, handler := range s.onResponseHandlers {
- handler(&Response{
- ScrapeResult: res,
- HTML: html,
- Visit: func(url string) {
- s.enqueueJob(url, job.depth+1)
- },
- })
- }
+ s.process(job.url, job.depth)
}(job)
}
}
-func (s *Scraper) process(job target) (res ScrapeResult, html string) {
- res.URL = job.url
- res.Timestamp = time.Now()
+func (s *Scraper) process(url string, depth int) {
+ request := &Request{
+ Method: http.MethodGet,
+ URL: url,
+ Headers: http.Header{},
+ Cookies: s.cookieJar,
+ }
+
+ response := &Response{
+ Request: request,
+ Visit: func(url string) {
+ s.enqueueJob(url, depth+1)
+ },
+ }
+
+ defer func() {
+ for _, handler := range s.onResponseHandlers {
+ handler(response)
+ }
+ }()
- req, err := http.NewRequest(http.MethodGet, job.url, nil)
+ req, err := http.NewRequest(request.Method, request.URL, nil)
if err != nil {
- res.Error = err
+ response.Error = err
return
}
+ req.Header = request.Headers
for _, handler := range s.onRequestHandlers {
- handler(&Request{URL: job.url, Depth: job.depth})
+ handler(request)
}
resp, err := s.transport(req)
if err != nil {
- res.Error = err
+ response.Error = err
return
}
defer resp.Body.Close()
- body, err := io.ReadAll(resp.Body)
+ response.StatusCode = resp.StatusCode
+ response.Headers = resp.Header
+
+ response.Body, err = io.ReadAll(resp.Body)
if err != nil {
- res.Error = err
+ response.Error = err
return
}
- html = string(body)
-
if s.ScrapeFunc != nil {
- res.Data, err = s.ScrapeFunc(ScrapeParams{HTML: html, URL: job.url})
+ response.Data, err = s.ScrapeFunc(ScrapeParams{HTML: string(response.Body), URL: request.URL})
if err != nil {
- res.Error = err
+ response.Error = err
return
}
}
-
- return
}
func (s *Scraper) enqueueJob(url string, depth int) {
@@ -241,18 +248,22 @@ func ParseLinks(html string, origin string) []string {
return nil
}
- urlParser := url.NewParser(url.WithPercentEncodeSinglePercentSign())
+ originurl, err := url.Parse(origin)
+ if err != nil {
+ return nil
+ }
uniqueLinks := make(map[string]bool)
doc.Find("a").Each(func(i int, s *goquery.Selection) {
link, _ := s.Attr("href")
- parsedLink, err := urlParser.ParseRef(origin, link)
+ parsedLink, err := originurl.Parse(link)
+
if err != nil || !isValidLink(parsedLink) {
return
}
- absLink := parsedLink.Href(true)
+ absLink := parsedLink.String()
if !uniqueLinks[absLink] {
links = append(links, absLink)
@@ -263,12 +274,8 @@ func ParseLinks(html string, origin string) []string {
return links
}
-func isValidLink(link *url.Url) bool {
- if link.Scheme() != "" && link.Scheme() != "http" && link.Scheme() != "https" {
- return false
- }
-
- if strings.HasPrefix(link.String(), "javascript:") {
+func isValidLink(link *gourl.URL) bool {
+ if link.Scheme != "" && link.Scheme != "http" && link.Scheme != "https" {
return false
}
diff --git a/utils.go b/utils.go
index 8b52e76..73efa4a 100644
--- a/utils.go
+++ b/utils.go
@@ -27,6 +27,6 @@ func Print(v any, prefix string) string {
return prefix + strings.TrimSuffix(buf.String(), "\n")
}
-func ParseOptions(opts Options, v any) {
- json.Unmarshal(opts, v)
+func ParseConfig(cfg Config, v any) {
+ json.Unmarshal(cfg, v)
}
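
For orientation, here is a minimal sketch of the renamed API in use after this commit, modeled on the cmd/flyscrape/run.go hunk above: `Compile` now returns a `Config` (raw JSON bytes, formerly `Options`), and `LoadModules` unmarshals it into every registered module. The blank module imports are an assumption about how the `init()`-based registration shown in followlinks.go is wired up, and the inline script body is purely illustrative.

```go
package main

import (
	"log"

	"github.com/philippta/flyscrape"

	// Assumption: modules self-register in their init() functions via
	// flyscrape.RegisterModule, so blank imports are enough to load them.
	_ "github.com/philippta/flyscrape/modules/followlinks"
	_ "github.com/philippta/flyscrape/modules/jsonprinter"
	_ "github.com/philippta/flyscrape/modules/starturl"
)

// An illustrative scraping script. Note `config`, formerly `options`.
const src = `
import { parse } from 'flyscrape';

export const config = {
    url: 'https://news.ycombinator.com/',
};

export default function ({ html, url }) {
    return { url };
}
`

func main() {
	// Compile now returns a Config (raw JSON bytes), not Options.
	cfg, scrape, err := flyscrape.Compile(src)
	if err != nil {
		log.Fatalf("failed to compile script: %v", err)
	}

	scraper := flyscrape.NewScraper()
	scraper.ScrapeFunc = scrape

	// LoadModules unmarshals cfg into each registered module and
	// attaches the modules to the scraper; jsonprinter then prints
	// each Response's URL, Data, Error and timestamp as it arrives.
	flyscrape.LoadModules(scraper, cfg)
	scraper.Run()
}
```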