| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-09-24 23:36:00 +0200 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-09-24 23:36:00 +0200 |
| commit | bd9e7f7acfd855d4685aa4544169c0e29cdbf205 (patch) | |
| tree | c5218c65359c0c2dee5a8db2670f30db677f068a | |
| parent | 08df9258a532b653c243e077e82491dbe62ad854 (diff) | |
clean up modules
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | README.md | 2 |
| -rw-r--r-- | cmd/flyscrape/dev.go | 6 |
| -rw-r--r-- | cmd/flyscrape/run.go | 4 |
| -rw-r--r-- | js.go | 14 |
| -rw-r--r-- | js/template.js | 2 |
| -rw-r--r-- | js_test.go | 26 |
| -rw-r--r-- | module.go | 33 |
| -rw-r--r-- | modules/depth/depth.go | 9 |
| -rw-r--r-- | modules/depth/depth_test.go | 14 |
| -rw-r--r-- | modules/domainfilter/domainfilter.go | 5 |
| -rw-r--r-- | modules/domainfilter/domainfilter_test.go | 8 |
| -rw-r--r-- | modules/followlinks/followlinks.go | 11 |
| -rw-r--r-- | modules/followlinks/followlinks_test.go | 2 |
| -rw-r--r-- | modules/jsonprinter/jsonprinter.go | 22 |
| -rw-r--r-- | modules/jsonprinter/jsonprinter_test.go | 47 |
| -rw-r--r-- | modules/ratelimit/ratelimit.go | 5 |
| -rw-r--r-- | modules/ratelimit/ratelimit_test.go | 2 |
| -rw-r--r-- | modules/starturl/starturl.go | 9 |
| -rw-r--r-- | modules/starturl/starturl_test.go | 2 |
| -rw-r--r-- | modules/urlfilter/urlfilter.go | 5 |
| -rw-r--r-- | scrape.go | 143 |
| -rw-r--r-- | utils.go | 4 |
22 files changed, 144 insertions, 231 deletions
````diff
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ Below is an example scraping script that showcases the capabilities of **flyscrape
 ```javascript
 import { parse } from 'flyscrape';
 
-export const options = {
+export const config = {
     url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from.
     depth: 1,                             // Specify how deep links should be followed. (default = 0, no follow)
     allowedDomains: [],                   // Specify the allowed domains. ['*'] for all. (default = domain from url)
diff --git a/cmd/flyscrape/dev.go b/cmd/flyscrape/dev.go
index 169e6d3..95c627e 100644
--- a/cmd/flyscrape/dev.go
+++ b/cmd/flyscrape/dev.go
@@ -39,7 +39,7 @@ func (c *DevCommand) Run(args []string) error {
     script := fs.Arg(0)
 
     err := flyscrape.Watch(script, func(s string) error {
-        opts, scrape, err := flyscrape.Compile(s)
+        cfg, scrape, err := flyscrape.Compile(s)
         if err != nil {
             screen.Clear()
             screen.MoveTopLeft()
@@ -58,7 +58,7 @@ func (c *DevCommand) Run(args []string) error {
 
         scraper := flyscrape.NewScraper()
         scraper.ScrapeFunc = scrape
-        flyscrape.LoadModules(scraper, opts)
+        flyscrape.LoadModules(scraper, cfg)
 
         scraper.Run()
 
@@ -69,7 +69,7 @@ func (c *DevCommand) Run(args []string) error {
             log.Println(resp.Error)
             return
         }
-        fmt.Println(flyscrape.PrettyPrint(resp.ScrapeResult, ""))
+        fmt.Println(flyscrape.PrettyPrint(resp.Data, ""))
     })
 
     return nil
diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go
index 22f41fd..4580e6d 100644
--- a/cmd/flyscrape/run.go
+++ b/cmd/flyscrape/run.go
@@ -34,14 +34,14 @@ func (c *RunCommand) Run(args []string) error {
         return fmt.Errorf("failed to read script %q: %w", script, err)
     }
 
-    opts, scrape, err := flyscrape.Compile(string(src))
+    cfg, scrape, err := flyscrape.Compile(string(src))
     if err != nil {
         return fmt.Errorf("failed to compile script: %w", err)
     }
 
     scraper := flyscrape.NewScraper()
     scraper.ScrapeFunc = scrape
-    flyscrape.LoadModules(scraper, opts)
+    flyscrape.LoadModules(scraper, cfg)
 
     count := 0
     start := time.Now()
diff --git a/js.go b/js.go
--- a/js.go
+++ b/js.go
@@ -16,7 +16,7 @@ import (
     v8 "rogchap.com/v8go"
 )
 
-type Options []byte
+type Config []byte
 
 type TransformError struct {
     Line   int
@@ -28,7 +28,7 @@ func (err TransformError) Error() string {
     return fmt.Sprintf("%d:%d: %s", err.Line, err.Column, err.Text)
 }
 
-func Compile(src string) (Options, ScrapeFunc, error) {
+func Compile(src string) (Config, ScrapeFunc, error) {
     src, err := build(src)
     if err != nil {
         return nil, nil, err
@@ -58,7 +58,7 @@ func build(src string) (string, error) {
     return string(res.Code), nil
 }
 
-func vm(src string) (Options, ScrapeFunc, error) {
+func vm(src string) (Config, ScrapeFunc, error) {
     ctx := v8.NewContext()
     ctx.RunScript("var module = {}", "main.js")
 
@@ -72,12 +72,12 @@ func vm(src string) (Options, ScrapeFunc, error) {
         return nil, nil, fmt.Errorf("running user script: %w", err)
     }
 
-    cfg, err := ctx.RunScript("JSON.stringify(options)", "main.js")
+    cfg, err := ctx.RunScript("JSON.stringify(config)", "main.js")
     if err != nil {
-        return nil, nil, fmt.Errorf("reading options: %w", err)
+        return nil, nil, fmt.Errorf("reading config: %w", err)
     }
     if !cfg.IsString() {
-        return nil, nil, fmt.Errorf("options is not a string")
+        return nil, nil, fmt.Errorf("config is not a string")
     }
 
     scrape := func(params ScrapeParams) (any, error) {
@@ -97,7 +97,7 @@ func vm(src string) (Options, ScrapeFunc, error) {
         return obj, nil
     }
 
-    return Options(cfg.String()), scrape, nil
+    return Config(cfg.String()), scrape, nil
 }
 
 func randSeq(n int) string {
diff --git a/js/template.js b/js/template.js
index 82196f0..1a030e5 100644
--- a/js/template.js
+++ b/js/template.js
@@ -1,6 +1,6 @@
 import { parse } from 'flyscrape';
 
-export const options = {
+export const config = {
     url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from.
     depth: 0,                             // Specify how deep links should be followed. (default = 0, no follow)
     allowedDomains: [],                   // Specify the allowed domains. ['*'] for all. (default = domain from url)
diff --git a/js_test.go b/js_test.go
--- a/js_test.go
+++ b/js_test.go
@@ -25,7 +25,7 @@ var html = `
 var script = `
 import { parse } from "flyscrape";
 
-export const options = {
+export const config = {
     url: "https://localhost/",
 }
 
@@ -41,9 +41,9 @@ export default function({ html, url }) {
 `
 
 func TestJSScrape(t *testing.T) {
-    opts, run, err := flyscrape.Compile(script)
+    cfg, run, err := flyscrape.Compile(script)
     require.NoError(t, err)
-    require.NotNil(t, opts)
+    require.NotNil(t, cfg)
     require.NotNil(t, run)
 
     result, err := run(flyscrape.ScrapeParams{
@@ -61,9 +61,9 @@ func TestJSScrape(t *testing.T) {
 }
 
 func TestJSCompileError(t *testing.T) {
-    opts, run, err := flyscrape.Compile("import foo;")
+    cfg, run, err := flyscrape.Compile("import foo;")
     require.Error(t, err)
-    require.Empty(t, opts)
+    require.Empty(t, cfg)
     require.Nil(t, run)
 
     var terr flyscrape.TransformError
@@ -76,31 +76,31 @@ func TestJSCompileError(t *testing.T) {
     })
 }
 
-func TestJSOptions(t *testing.T) {
+func TestJSConfig(t *testing.T) {
     js := `
-    export const options = {
+    export const config = {
         url: 'http://localhost/',
         depth: 5,
         allowedDomains: ['example.com'],
     }
     export default function() {}
     `
-    rawOpts, _, err := flyscrape.Compile(js)
+    rawCfg, _, err := flyscrape.Compile(js)
     require.NoError(t, err)
 
-    type options struct {
+    type config struct {
         URL            string   `json:"url"`
         Depth          int      `json:"depth"`
         AllowedDomains []string `json:"allowedDomains"`
     }
 
-    var opts options
-    err = json.Unmarshal(rawOpts, &opts)
+    var cfg config
+    err = json.Unmarshal(rawCfg, &cfg)
     require.NoError(t, err)
 
-    require.Equal(t, options{
+    require.Equal(t, config{
         URL:            "http://localhost/",
         Depth:          5,
         AllowedDomains: []string{"example.com"},
-    }, opts)
+    }, cfg)
 }
diff --git a/module.go b/module.go
--- a/module.go
+++ b/module.go
@@ -2,14 +2,10 @@ package flyscrape
 
 import (
     "encoding/json"
-    "fmt"
     "net/http"
-    "sync"
 )
 
-type Module interface {
-    ID() string
-}
+type Module any
 
 type Transport interface {
     Transport(*http.Request) (*http.Response, error)
@@ -34,32 +30,15 @@ type OnComplete interface {
     OnComplete()
 }
 
-func RegisterModule(m Module) {
-    id := m.ID()
-    if id == "" {
-        panic("module id is missing")
-    }
-
-    globalModulesMu.Lock()
-    defer globalModulesMu.Unlock()
-
-    if _, ok := globalModules[id]; ok {
-        panic(fmt.Sprintf("module %s already registered", id))
-    }
-    globalModules[id] = m
+func RegisterModule(mod Module) {
+    globalModules = append(globalModules, mod)
 }
 
-func LoadModules(s *Scraper, opts Options) {
-    globalModulesMu.RLock()
-    defer globalModulesMu.RUnlock()
-
+func LoadModules(s *Scraper, cfg Config) {
     for _, mod := range globalModules {
-        json.Unmarshal(opts, mod)
+        json.Unmarshal(cfg, mod)
         s.LoadModule(mod)
     }
 }
 
-var (
-    globalModules   = map[string]Module{}
-    globalModulesMu sync.RWMutex
-)
+var globalModules = []Module{}
diff --git a/modules/depth/depth.go b/modules/depth/depth.go
index 5efedc8..0cfbc71 100644
--- a/modules/depth/depth.go
+++ b/modules/depth/depth.go
@@ -16,15 +16,8 @@ type Module struct {
     Depth int `json:"depth"`
 }
 
-func (m *Module) ID() string {
-    return "depth"
-}
-
 func (m *Module) CanRequest(url string, depth int) bool {
     return depth <= m.Depth
 }
 
-var (
-    _ flyscrape.Module     = (*Module)(nil)
-    _ flyscrape.CanRequest = (*Module)(nil)
-)
+var _ flyscrape.CanRequest = (*Module)(nil)
diff --git a/modules/depth/depth_test.go b/modules/depth/depth_test.go
index 309e628..c9afd6f 100644
--- a/modules/depth/depth_test.go
+++ b/modules/depth/depth_test.go
@@ -17,17 +17,17 @@ import (
 
 func TestDepth(t *testing.T) {
     scraper := flyscrape.NewScraper()
-    scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
+    scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
     scraper.LoadModule(&followlinks.Module{})
     scraper.LoadModule(&depth.Module{Depth: 2})
 
     scraper.SetTransport(func(r *http.Request) (*http.Response, error) {
         switch r.URL.String() {
-        case "http://www.example.com/":
+        case "http://www.example.com":
             return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`)
-        case "http://www.google.com/":
+        case "http://www.google.com":
             return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
-        case "http://www.duckduckgo.com/":
+        case "http://www.duckduckgo.com":
             return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`)
         }
         return flyscrape.MockResponse(200, "")
@@ -41,7 +41,7 @@ func TestDepth(t *testing.T) {
     scraper.Run()
 
     require.Len(t, urls, 3)
-    require.Contains(t, urls, "http://www.example.com/")
-    require.Contains(t, urls, "http://www.google.com/")
-    require.Contains(t, urls, "http://www.duckduckgo.com/")
+    require.Contains(t, urls, "http://www.example.com")
+    require.Contains(t, urls, "http://www.google.com")
+    require.Contains(t, urls, "http://www.duckduckgo.com")
 }
diff --git a/modules/domainfilter/domainfilter.go b/modules/domainfilter/domainfilter.go
index b892882..ba9ebe6 100644
--- a/modules/domainfilter/domainfilter.go
+++ b/modules/domainfilter/domainfilter.go
@@ -19,10 +19,6 @@ type Module struct {
     BlockedDomains []string `json:"blockedDomains"`
 }
 
-func (m *Module) ID() string {
-    return "domainfilter"
-}
-
 func (m *Module) OnLoad(v flyscrape.Visitor) {
     if u, err := url.Parse(m.URL); err == nil {
         m.AllowedDomains = append(m.AllowedDomains, u.Host())
@@ -56,7 +52,6 @@ func (m *Module) CanRequest(rawurl string, depth int) bool {
 }
 
 var (
-    _ flyscrape.Module     = (*Module)(nil)
     _ flyscrape.CanRequest = (*Module)(nil)
     _ flyscrape.OnLoad     = (*Module)(nil)
 )
diff --git a/modules/domainfilter/domainfilter_test.go b/modules/domainfilter/domainfilter_test.go
index 97bdc9c..884a89f 100644
--- a/modules/domainfilter/domainfilter_test.go
+++ b/modules/domainfilter/domainfilter_test.go
@@ -36,7 +36,7 @@ func TestDomainfilterAllowed(t *testing.T) {
 
     require.Len(t, urls, 2)
     require.Contains(t, urls, "http://www.example.com")
-    require.Contains(t, urls, "http://www.google.com/")
+    require.Contains(t, urls, "http://www.google.com")
 }
 
 func TestDomainfilterAllowedAll(t *testing.T) {
@@ -61,8 +61,8 @@ func TestDomainfilterAllowedAll(t *testing.T) {
 
     require.Len(t, urls, 3)
     require.Contains(t, urls, "http://www.example.com")
-    require.Contains(t, urls, "http://www.duckduckgo.com/")
-    require.Contains(t, urls, "http://www.google.com/")
+    require.Contains(t, urls, "http://www.duckduckgo.com")
+    require.Contains(t, urls, "http://www.google.com")
 }
 
 func TestDomainfilterBlocked(t *testing.T) {
@@ -88,5 +88,5 @@ func TestDomainfilterBlocked(t *testing.T) {
 
     require.Len(t, urls, 2)
     require.Contains(t, urls, "http://www.example.com")
-    require.Contains(t, urls, "http://www.duckduckgo.com/")
+    require.Contains(t, urls, "http://www.duckduckgo.com")
 }
diff --git a/modules/followlinks/followlinks.go b/modules/followlinks/followlinks.go
index dde0e90..99d6cee 100644
--- a/modules/followlinks/followlinks.go
+++ b/modules/followlinks/followlinks.go
@@ -14,17 +14,10 @@ func init() {
 
 type Module struct{}
 
-func (m *Module) ID() string {
-    return "followlinks"
-}
-
 func (m *Module) OnResponse(resp *flyscrape.Response) {
-    for _, link := range flyscrape.ParseLinks(resp.HTML, resp.URL) {
+    for _, link := range flyscrape.ParseLinks(string(resp.Body), resp.Request.URL) {
         resp.Visit(link)
     }
 }
 
-var (
-    _ flyscrape.Module     = (*Module)(nil)
-    _ flyscrape.OnResponse = (*Module)(nil)
-)
+var _ flyscrape.OnResponse = (*Module)(nil)
diff --git a/modules/followlinks/followlinks_test.go b/modules/followlinks/followlinks_test.go
index 03c3a6b..18c8ceb 100644
--- a/modules/followlinks/followlinks_test.go
+++ b/modules/followlinks/followlinks_test.go
@@ -34,6 +34,6 @@ func TestFollowLinks(t *testing.T) {
     require.Contains(t, urls, "http://www.example.com/baz")
     require.Contains(t, urls, "http://www.example.com/foo/bar")
     require.Contains(t, urls, "http://www.example.com/foo/baz")
-    require.Contains(t, urls, "http://www.google.com/")
+    require.Contains(t, urls, "http://www.google.com")
     require.Contains(t, urls, "http://www.google.com/baz")
 }
diff --git a/modules/jsonprinter/jsonprinter.go b/modules/jsonprinter/jsonprinter.go
index 3936277..3026f29 100644
--- a/modules/jsonprinter/jsonprinter.go
+++ b/modules/jsonprinter/jsonprinter.go
@@ -6,6 +6,7 @@ package jsonprinter
 
 import (
     "fmt"
+    "time"
 
     "github.com/philippta/flyscrape"
 )
@@ -18,10 +19,6 @@ type Module struct {
     first bool
 }
 
-func (m *Module) ID() string {
-    return "jsonprinter"
-}
-
 func (m *Module) OnResponse(resp *flyscrape.Response) {
     if resp.Error == nil && resp.Data == nil {
         return
@@ -33,15 +30,28 @@ func (m *Module) OnResponse(resp *flyscrape.Response) {
         fmt.Println(",")
     }
 
-    fmt.Print(flyscrape.PrettyPrint(resp.ScrapeResult, "  "))
+    o := output{
+        URL:       resp.Request.URL,
+        Data:      resp.Data,
+        Error:     resp.Error,
+        Timestamp: time.Now(),
+    }
+
+    fmt.Print(flyscrape.PrettyPrint(o, "  "))
 }
 
 func (m *Module) OnComplete() {
     fmt.Println("\n]")
 }
 
+type output struct {
+    URL       string    `json:"url,omitempty"`
+    Data      any       `json:"data,omitempty"`
+    Error     error     `json:"error,omitempty"`
+    Timestamp time.Time `json:"timestamp,omitempty"`
+}
+
 var (
-    _ flyscrape.Module     = (*Module)(nil)
     _ flyscrape.OnResponse = (*Module)(nil)
     _ flyscrape.OnComplete = (*Module)(nil)
 )
diff --git a/modules/jsonprinter/jsonprinter_test.go b/modules/jsonprinter/jsonprinter_test.go
deleted file mode 100644
index 29cc438..0000000
--- a/modules/jsonprinter/jsonprinter_test.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License, v. 2.0. If a copy of the MPL was not distributed with this
-// file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-package jsonprinter_test
-
-import (
-    "net/http"
-    "testing"
-
-    "github.com/philippta/flyscrape"
-    "github.com/philippta/flyscrape/modules/depth"
-    "github.com/philippta/flyscrape/modules/followlinks"
-    "github.com/philippta/flyscrape/modules/starturl"
-    "github.com/stretchr/testify/require"
-)
-
-func TestDepth(t *testing.T) {
-    scraper := flyscrape.NewScraper()
-    scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
-    scraper.LoadModule(&followlinks.Module{})
-    scraper.LoadModule(&depth.Module{Depth: 2})
-
-    scraper.SetTransport(func(r *http.Request) (*http.Response, error) {
-        switch r.URL.String() {
-        case "http://www.example.com/":
-            return flyscrape.MockResponse(200, `<a href="http://www.google.com">Google</a>`)
-        case "http://www.google.com/":
-            return flyscrape.MockResponse(200, `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`)
-        case "http://www.duckduckgo.com/":
-            return flyscrape.MockResponse(200, `<a href="http://www.example.com">Example</a>`)
-        }
-        return flyscrape.MockResponse(200, "")
-    })
-
-    var urls []string
-    scraper.OnRequest(func(req *flyscrape.Request) {
-        urls = append(urls, req.URL)
-    })
-
-    scraper.Run()
-
-    require.Len(t, urls, 3)
-    require.Contains(t, urls, "http://www.example.com/")
-    require.Contains(t, urls, "http://www.google.com/")
-    require.Contains(t, urls, "http://www.duckduckgo.com/")
-}
diff --git a/modules/ratelimit/ratelimit.go b/modules/ratelimit/ratelimit.go
index b02f5d5..be622f6 100644
--- a/modules/ratelimit/ratelimit.go
+++ b/modules/ratelimit/ratelimit.go
@@ -21,10 +21,6 @@ type Module struct {
     semaphore chan struct{}
 }
 
-func (m *Module) ID() string {
-    return "ratelimit"
-}
-
 func (m *Module) OnLoad(v flyscrape.Visitor) {
     rate := time.Duration(float64(time.Second) / m.Rate)
 
@@ -47,7 +43,6 @@ func (m *Module) OnComplete() {
 }
 
 var (
-    _ flyscrape.Module     = (*Module)(nil)
     _ flyscrape.OnRequest  = (*Module)(nil)
     _ flyscrape.OnLoad     = (*Module)(nil)
     _ flyscrape.OnComplete = (*Module)(nil)
diff --git a/modules/ratelimit/ratelimit_test.go b/modules/ratelimit/ratelimit_test.go
index c166371..5e91f8f 100644
--- a/modules/ratelimit/ratelimit_test.go
+++ b/modules/ratelimit/ratelimit_test.go
@@ -17,7 +17,7 @@ import (
 
 func TestRatelimit(t *testing.T) {
     scraper := flyscrape.NewScraper()
-    scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/"})
+    scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
     scraper.LoadModule(&followlinks.Module{})
     scraper.LoadModule(&ratelimit.Module{
         Rate: 100,
diff --git a/modules/starturl/starturl.go b/modules/starturl/starturl.go
index b2e6c47..109d28f 100644
--- a/modules/starturl/starturl.go
+++ b/modules/starturl/starturl.go
@@ -16,15 +16,8 @@ type Module struct {
     URL string `json:"url"`
 }
 
-func (m *Module) ID() string {
-    return "starturl"
-}
-
 func (m *Module) OnLoad(v flyscrape.Visitor) {
     v.Visit(m.URL)
 }
 
-var (
-    _ flyscrape.Module = (*Module)(nil)
-    _ flyscrape.OnLoad = (*Module)(nil)
-)
+var _ flyscrape.OnLoad = (*Module)(nil)
diff --git a/modules/starturl/starturl_test.go b/modules/starturl/starturl_test.go
index 647e197..6fab776 100644
--- a/modules/starturl/starturl_test.go
+++ b/modules/starturl/starturl_test.go
@@ -12,7 +12,7 @@ import (
     "github.com/stretchr/testify/require"
 )
 
-func TestFollowLinks(t *testing.T) {
+func TestStartURL(t *testing.T) {
     scraper := flyscrape.NewScraper()
     scraper.LoadModule(&starturl.Module{URL: "http://www.example.com/foo/bar"})
     scraper.SetTransport(flyscrape.MockTransport(200, ""))
diff --git a/modules/urlfilter/urlfilter.go b/modules/urlfilter/urlfilter.go
index 14576f0..00a4bd2 100644
--- a/modules/urlfilter/urlfilter.go
+++ b/modules/urlfilter/urlfilter.go
@@ -23,10 +23,6 @@ type Module struct {
     blockedURLsRE []*regexp.Regexp
 }
 
-func (m *Module) ID() string {
-    return "urlfilter"
-}
-
 func (m *Module) OnLoad(v flyscrape.Visitor) {
     for _, pat := range m.AllowedURLs {
         re, err := regexp.Compile(pat)
@@ -79,7 +75,6 @@ func (m *Module) CanRequest(rawurl string, depth int) bool {
 }
 
 var (
-    _ flyscrape.Module     = (*Module)(nil)
     _ flyscrape.CanRequest = (*Module)(nil)
     _ flyscrape.OnLoad     = (*Module)(nil)
 )
diff --git a/scrape.go b/scrape.go
--- a/scrape.go
+++ b/scrape.go
@@ -8,13 +8,15 @@ import (
     "io"
     "log"
     "net/http"
+    "net/http/cookiejar"
+    "net/url"
     "strings"
     "sync"
-    "time"
+
+    gourl "net/url"
 
     "github.com/PuerkitoBio/goquery"
     "github.com/cornelk/hashmap"
-    "github.com/nlnwa/whatwg-url/url"
 )
@@ -22,18 +24,6 @@ type ScrapeParams struct {
     URL  string
 }
 
-type ScrapeResult struct {
-    URL       string    `json:"url"`
-    Data      any       `json:"data,omitempty"`
-    Links     []string  `json:"-"`
-    Error     error     `json:"error,omitempty"`
-    Timestamp time.Time `json:"timestamp"`
-}
-
-func (s *ScrapeResult) omit() bool {
-    return s.Error == nil && s.Data == nil
-}
-
 type ScrapeFunc func(ScrapeParams) (any, error)
 
 type FetchFunc func(url string) (string, error)
@@ -43,32 +33,39 @@ type Visitor interface {
     MarkVisited(url string)
 }
 
-type (
-    Request struct {
-        URL   string
-        Depth int
-    }
+type Request struct {
+    Method  string
+    URL     string
+    Headers http.Header
+    Cookies http.CookieJar
+    Depth   int
+}
 
-    Response struct {
-        ScrapeResult
-        HTML  string
-        Visit func(url string)
-    }
+type Response struct {
+    StatusCode int
+    Headers    http.Header
+    Body       []byte
+    Data       any
+    Error      error
+    Request    *Request
 
-    target struct {
-        url   string
-        depth int
-    }
-)
+    Visit func(url string)
+}
+
+type target struct {
+    url   string
+    depth int
+}
 
 type Scraper struct {
     ScrapeFunc ScrapeFunc
 
-    opts    Options
-    wg      sync.WaitGroup
-    jobs    chan target
-    visited *hashmap.Map[string, struct{}]
-    modules *hashmap.Map[string, Module]
+    cfg       Config
+    wg        sync.WaitGroup
+    jobs      chan target
+    visited   *hashmap.Map[string, struct{}]
+    modules   *hashmap.Map[string, Module]
+    cookieJar *cookiejar.Jar
 
     canRequestHandlers []func(url string, depth int) bool
     onRequestHandlers  []func(*Request)
@@ -78,6 +75,7 @@ type Scraper struct {
 }
 
 func NewScraper() *Scraper {
+    jar, _ := cookiejar.New(nil)
     s := &Scraper{
         jobs:    make(chan target, 1024),
         visited: hashmap.New[string, struct{}](),
@@ -86,6 +84,7 @@ func NewScraper() *Scraper {
             r.Header.Set("User-Agent", "flyscrape/0.1")
             return http.DefaultClient.Do(r)
         },
+        cookieJar: jar,
     }
     return s
 }
@@ -165,58 +164,66 @@ func (s *Scraper) worker() {
                 }
             }
 
-            res, html := s.process(job)
-            for _, handler := range s.onResponseHandlers {
-                handler(&Response{
-                    ScrapeResult: res,
-                    HTML:         html,
-                    Visit: func(url string) {
-                        s.enqueueJob(url, job.depth+1)
-                    },
-                })
-            }
+            s.process(job.url, job.depth)
         }(job)
     }
 }
 
-func (s *Scraper) process(job target) (res ScrapeResult, html string) {
-    res.URL = job.url
-    res.Timestamp = time.Now()
+func (s *Scraper) process(url string, depth int) {
+    request := &Request{
+        Method:  http.MethodGet,
+        URL:     url,
+        Headers: http.Header{},
+        Cookies: s.cookieJar,
+    }
+
+    response := &Response{
+        Request: request,
+        Visit: func(url string) {
+            s.enqueueJob(url, depth+1)
+        },
+    }
+
+    defer func() {
+        for _, handler := range s.onResponseHandlers {
+            handler(response)
+        }
+    }()
 
-    req, err := http.NewRequest(http.MethodGet, job.url, nil)
+    req, err := http.NewRequest(request.Method, request.URL, nil)
     if err != nil {
-        res.Error = err
+        response.Error = err
         return
     }
+    req.Header = request.Headers
 
     for _, handler := range s.onRequestHandlers {
-        handler(&Request{URL: job.url, Depth: job.depth})
+        handler(request)
     }
 
     resp, err := s.transport(req)
     if err != nil {
-        res.Error = err
+        response.Error = err
        return
     }
     defer resp.Body.Close()
 
-    body, err := io.ReadAll(resp.Body)
+    response.StatusCode = resp.StatusCode
+    response.Headers = resp.Header
+
+    response.Body, err = io.ReadAll(resp.Body)
     if err != nil {
-        res.Error = err
+        response.Error = err
         return
     }
-    html = string(body)
 
     if s.ScrapeFunc != nil {
-        res.Data, err = s.ScrapeFunc(ScrapeParams{HTML: html, URL: job.url})
+        response.Data, err = s.ScrapeFunc(ScrapeParams{HTML: string(response.Body), URL: request.URL})
         if err != nil {
-            res.Error = err
+            response.Error = err
             return
         }
     }
-
-    return
 }
 
 func (s *Scraper) enqueueJob(url string, depth int) {
@@ -241,18 +248,22 @@ func ParseLinks(html string, origin string) []string {
         return nil
     }
 
-    urlParser := url.NewParser(url.WithPercentEncodeSinglePercentSign())
+    originurl, err := url.Parse(origin)
+    if err != nil {
+        return nil
+    }
 
     uniqueLinks := make(map[string]bool)
 
    doc.Find("a").Each(func(i int, s *goquery.Selection) {
         link, _ := s.Attr("href")
-        parsedLink, err := urlParser.ParseRef(origin, link)
+        parsedLink, err := originurl.Parse(link)
+
         if err != nil || !isValidLink(parsedLink) {
             return
         }
 
-        absLink := parsedLink.Href(true)
+        absLink := parsedLink.String()
 
         if !uniqueLinks[absLink] {
             links = append(links, absLink)
@@ -263,12 +274,8 @@ func ParseLinks(html string, origin string) []string {
     return links
 }
 
-func isValidLink(link *url.Url) bool {
-    if link.Scheme() != "" && link.Scheme() != "http" && link.Scheme() != "https" {
-        return false
-    }
-
-    if strings.HasPrefix(link.String(), "javascript:") {
+func isValidLink(link *gourl.URL) bool {
+    if link.Scheme != "" && link.Scheme != "http" && link.Scheme != "https" {
         return false
     }
diff --git a/utils.go b/utils.go
--- a/utils.go
+++ b/utils.go
@@ -27,6 +27,6 @@ func Print(v any, prefix string) string {
     return prefix + strings.TrimSuffix(buf.String(), "\n")
 }
 
-func ParseOptions(opts Options, v any) {
-    json.Unmarshal(opts, v)
+func ParseConfig(cfg Config, v any) {
+    json.Unmarshal(cfg, v)
 }
````
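The core of this commit is the module API simplification: `Module` is now just `type Module any`, the `ID()` method is gone, `RegisterModule` appends to a plain slice, and `LoadModules` unmarshals the script's `config` JSON into every registered module. A minimal sketch of a module under the new scheme — the `logger` package name and `verbose` config key are hypothetical, and the `OnRequest` method signature is inferred from the `ratelimit` interface assertions above:

```go
// Package logger is a hypothetical module written against the simplified
// API in this commit: no ID() method, just capability interfaces.
package logger

import (
	"log"

	"github.com/philippta/flyscrape"
)

// Registration now simply appends to the package-level module slice.
func init() {
	flyscrape.RegisterModule(&Module{})
}

// Exported fields are filled from the script's `config` object, because
// LoadModules calls json.Unmarshal(cfg, mod) on every registered module.
type Module struct {
	Verbose bool `json:"verbose"` // hypothetical config key
}

// OnRequest is one of the capability interfaces (OnLoad, OnRequest,
// OnResponse, CanRequest, OnComplete) that replace the old Module ID.
func (m *Module) OnRequest(req *flyscrape.Request) {
	if m.Verbose {
		log.Println(req.Method, req.URL)
	}
}

var _ flyscrape.OnRequest = (*Module)(nil)
```

Note the trade-off visible in the diff: the old `RegisterModule` panicked on missing or duplicate IDs, while the new one is append-only, so registering a module twice simply runs it twice.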
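`Response` changed shape as well: the embedded `ScrapeResult` and the `HTML` string are replaced by raw `Body` bytes plus `StatusCode`, `Headers`, `Data`, `Error`, and a back-reference to the originating `*Request` — which is why `followlinks` now reads `string(resp.Body)` and `resp.Request.URL`. A sketch of an `OnResponse` module against the new fields (the `statuslog` package name is again hypothetical):

```go
// Package statuslog is a hypothetical module illustrating the new
// Response shape introduced by this commit.
package statuslog

import (
	"fmt"

	"github.com/philippta/flyscrape"
)

func init() {
	flyscrape.RegisterModule(&Module{})
}

type Module struct{}

// OnResponse now sees the raw response: status code, headers and body
// bytes, with the request reachable via resp.Request instead of resp.URL.
func (m *Module) OnResponse(resp *flyscrape.Response) {
	if resp.Error != nil {
		fmt.Printf("%s: error: %v\n", resp.Request.URL, resp.Error)
		return
	}
	fmt.Printf("%s: %d (%d bytes)\n", resp.Request.URL, resp.StatusCode, len(resp.Body))
}

var _ flyscrape.OnResponse = (*Module)(nil)
```

Because `process` now publishes the response via a deferred loop over `onResponseHandlers`, a module like this fires even when the request errors out early.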
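The move from `github.com/nlnwa/whatwg-url` to the standard library's `net/url` also explains the many test URL edits: the WHATWG parser serializes `http://www.google.com` with a trailing slash, while `net/url` round-trips it unchanged. A self-contained sketch of the resolution pattern `ParseLinks` now relies on:

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	// url.URL.Parse resolves a reference against the receiver, the same
	// way ParseLinks now resolves each href against the page's origin.
	base, err := url.Parse("http://www.example.com/foo/bar")
	if err != nil {
		panic(err)
	}

	for _, href := range []string{"baz", "/baz", "http://www.google.com", "javascript:void(0)"} {
		abs, err := base.Parse(href)
		if err != nil {
			continue
		}
		// Mirrors isValidLink: only http(s) survives, which also filters
		// javascript: links without the old string-prefix check.
		if abs.Scheme != "" && abs.Scheme != "http" && abs.Scheme != "https" {
			continue
		}
		fmt.Println(abs) // e.g. "baz" resolves to http://www.example.com/foo/baz
	}
}
```

This is why the dropped `strings.HasPrefix(link.String(), "javascript:")` branch was safe to remove: scheme filtering already rejects such links.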