author     Philipp Tanlak <philipp.tanlak@gmail.com>  2025-01-10 12:49:32 +0100
committer  GitHub <noreply@github.com>                2025-01-10 12:49:32 +0100
commit     bf99c233a18c3165e0d4d251b41224e5bc6eb93d (patch)
tree       d32f0fd0770a049552cdd0d51e9402d594e9a35e
parent     924184f37ef0d3e244f8e8991c259affb45d0ae2 (diff)
Implement nested scraping (#81)
-rw-r--r--  README.md                             19
-rw-r--r--  examples/hackernews_with_comments.js  23
-rw-r--r--  js.go                                 49
-rw-r--r--  js_lib_test.go                         7
-rw-r--r--  js_test.go                            92
-rw-r--r--  module.go                              3
-rw-r--r--  scrape.go                             54
7 files changed, 229 insertions(+), 18 deletions(-)
diff --git a/README.md b/README.md
index 233e2ca..6a3290c 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,7 @@ Flyscrape is a command-line web scraping tool designed for those without <br />a
- **Scriptable:** Use JavaScript to write your data extraction logic.
- **System Cookies:** Give Flyscrape access to your browser's cookie store.
- **Browser Mode:** Render JavaScript-heavy pages using a headless browser.
+- **Nested Scraping:** Extract data from linked pages within a single scrape.
## Overview
@@ -259,10 +260,20 @@ export const config = {
},
};
-export default function ({ doc, url, absoluteURL }) {
- // doc - Contains the parsed HTML document
- // url - Contains the scraped URL
- // absoluteURL(...) - Transforms relative URLs into absolute URLs
+export default function ({ doc, url, absoluteURL, scrape }) {
+ // doc
+ // Contains the parsed HTML document.
+
+ // url
+ // Contains the scraped URL.
+
+ // absoluteURL("/foo")
+ // Transforms a relative URL into an absolute URL.
+
+ // scrape(url, function({ doc, url, absoluteURL, scrape }) {
+ // return { ... };
+ // })
+ // Scrapes a linked page and returns the scrape result.
}
```
diff --git a/examples/hackernews_with_comments.js b/examples/hackernews_with_comments.js
new file mode 100644
index 0000000..8d9cfb5
--- /dev/null
+++ b/examples/hackernews_with_comments.js
@@ -0,0 +1,23 @@
+export const config = {
+ url: "https://news.ycombinator.com/",
+};
+
+export default function({ doc, scrape }) {
+ const post = doc.find(".athing.submission").first();
+ const title = post.find(".titleline > a").text();
+ const commentsLink = post.next().find("a").last().attr("href");
+
+ const comments = scrape(commentsLink, function({ doc }) {
+ return doc.find(".comtr").map(comment => {
+ return {
+ author: comment.find(".hnuser").text(),
+ text: comment.find(".commtext").text(),
+ };
+ });
+ });
+
+ return {
+ title,
+ comments,
+ };
+}
diff --git a/js.go b/js.go
index 50c89ca..c56ebdb 100644
--- a/js.go
+++ b/js.go
@@ -27,8 +27,9 @@ var ScriptTemplate []byte
type Config []byte
type ScrapeParams struct {
- HTML string
- URL string
+ HTML string
+ URL string
+ Process func(url string) ([]byte, error)
}
type ScrapeFunc func(ScrapeParams) (any, error)
@@ -167,26 +168,21 @@ func scrape(vm *goja.Runtime) (ScrapeFunc, error) {
return nil, errors.New("failed to export scrape function")
}
- return func(p ScrapeParams) (any, error) {
- lock.Lock()
- defer lock.Unlock()
-
+ var newArg func(p ScrapeParams) (*goja.Object, error)
+ newArg = func(p ScrapeParams) (*goja.Object, error) {
doc, err := DocumentFromString(p.HTML)
if err != nil {
- log.Println(err)
return nil, err
}
baseurl, err := url.Parse(p.URL)
if err != nil {
- log.Println(err)
return nil, err
}
absoluteURL := func(ref string) string {
abs, err := baseurl.Parse(ref)
if err != nil {
- log.Println(err)
return ref
}
return abs.String()
@@ -196,8 +192,41 @@ func scrape(vm *goja.Runtime) (ScrapeFunc, error) {
o.Set("url", p.URL)
o.Set("doc", doc)
o.Set("absoluteURL", absoluteURL)
+ o.Set("scrape", func(url string, f func(goja.FunctionCall) goja.Value) goja.Value {
+ url = absoluteURL(url)
+
+ html, err := p.Process(url)
+ if err != nil {
+ return vm.ToValue(map[string]any{"error": err.Error()})
+ }
+
+ newp := ScrapeParams{
+ HTML: string(html),
+ URL: url,
+ Process: p.Process,
+ }
+
+ arg, err := newArg(newp)
+ if err != nil {
+ return vm.ToValue(map[string]any{"error": err.Error()})
+ }
+
+ return f(goja.FunctionCall{Arguments: []goja.Value{arg}})
+ })
+
+ return o, nil
+ }
+
+ return func(p ScrapeParams) (any, error) {
+ lock.Lock()
+ defer lock.Unlock()
+
+ arg, err := newArg(p)
+ if err != nil {
+ return nil, err
+ }
- ret := scrapefn(goja.FunctionCall{Arguments: []goja.Value{o}})
+ ret := scrapefn(goja.FunctionCall{Arguments: []goja.Value{arg}})
if goja.IsUndefined(ret) {
return nil, nil
}
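
The new `Process` hook on `ScrapeParams` is what backs the `scrape()` parameter exposed to scripts: each nested call fetches the linked page through it and re-enters the script with a fresh argument object. Below is a minimal sketch of wiring that hook up when embedding the package directly, mirroring the pattern used in js_test.go further down; the plain `http.Get` fetcher is an illustrative stand-in for the scraper's own module-aware `processImmediate` (see scrape.go at the end of this diff).

```go
package main

import (
	"fmt"
	"io"
	"net/http"

	"github.com/philippta/flyscrape"
)

func main() {
	script := `
		export default function({ scrape }) {
			return scrape("/comments", function({ url }) {
				return url;
			});
		}
	`

	exports, err := flyscrape.Compile(script, nil)
	if err != nil {
		panic(err)
	}

	result, err := exports.Scrape(flyscrape.ScrapeParams{
		HTML: "<html></html>",
		URL:  "http://localhost/",
		// Process fetches the HTML of the linked page for nested scrape() calls.
		Process: func(url string) ([]byte, error) {
			resp, err := http.Get(url)
			if err != nil {
				return nil, err
			}
			defer resp.Body.Close()
			return io.ReadAll(resp.Body)
		},
	})
	fmt.Println(result, err)
}
```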
diff --git a/js_lib_test.go b/js_lib_test.go
index aca8ce9..ad19380 100644
--- a/js_lib_test.go
+++ b/js_lib_test.go
@@ -8,6 +8,7 @@ import (
"encoding/json"
"net/http"
"os"
+ "sync/atomic"
"testing"
"github.com/philippta/flyscrape"
@@ -203,10 +204,10 @@ func TestJSLibHTTPDownload(t *testing.T) {
http.download("https://example.com/404.txt");
`
- nreqs := 0
+ var nreqs atomic.Int32
client := &http.Client{
Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
- nreqs++
+ nreqs.Add(1)
if r.URL.Path == "/content-disposition" {
resp, err := flyscrape.MockResponse(200, "hello world")
@@ -233,7 +234,7 @@ func TestJSLibHTTPDownload(t *testing.T) {
wait()
- require.Equal(t, nreqs, 8)
+ require.Equal(t, nreqs.Load(), int32(8))
require.FileExists(t, "foo.txt")
require.FileExists(t, "dir/my-foo.txt")
require.FileExists(t, "dir/bar.txt")
diff --git a/js_test.go b/js_test.go
index 0aeb9cd..4b08720 100644
--- a/js_test.go
+++ b/js_test.go
@@ -168,6 +168,98 @@ func TestJSScrapeNaN(t *testing.T) {
require.Nil(t, result)
}
+func TestJSScrapeParamURL(t *testing.T) {
+ js := `
+ export default function({ url }) {
+ return url;
+ }
+ `
+ exports, err := flyscrape.Compile(js, nil)
+ require.NoError(t, err)
+
+ result, err := exports.Scrape(flyscrape.ScrapeParams{
+ HTML: html,
+ URL: "http://localhost/",
+ })
+ require.NoError(t, err)
+ require.Equal(t, "http://localhost/", result)
+}
+
+func TestJSScrapeParamAbsoluteURL(t *testing.T) {
+ js := `
+ export default function({ absoluteURL }) {
+ return absoluteURL("/foo");
+ }
+ `
+ exports, err := flyscrape.Compile(js, nil)
+ require.NoError(t, err)
+
+ result, err := exports.Scrape(flyscrape.ScrapeParams{
+ HTML: html,
+ URL: "http://localhost/",
+ })
+ require.NoError(t, err)
+ require.Equal(t, "http://localhost/foo", result)
+}
+
+func TestJSScrapeParamScrape(t *testing.T) {
+ js := `
+ export default function({ scrape }) {
+ return scrape("/foo", function({ url }) {
+ return {
+ url: url,
+ foo: "bar",
+ };
+ });
+ }
+ `
+ exports, err := flyscrape.Compile(js, nil)
+ require.NoError(t, err)
+
+ result, err := exports.Scrape(flyscrape.ScrapeParams{
+ HTML: html,
+ URL: "http://localhost/",
+ Process: func(url string) ([]byte, error) {
+ return nil, nil
+ },
+ })
+ require.NoError(t, err)
+ require.Equal(t, map[string]any{
+ "url": "http://localhost/foo",
+ "foo": "bar",
+ }, result)
+}
+
+func TestJSScrapeParamScrapeDeep(t *testing.T) {
+ js := `
+ export default function({ scrape }) {
+ return scrape("/foo/", function({ url, scrape }) {
+ return {
+ url: url,
+ deep: scrape("bar", function({ url }) {
+ return url;
+ }),
+ };
+ });
+ }
+ `
+ exports, err := flyscrape.Compile(js, nil)
+ require.NoError(t, err)
+
+ result, err := exports.Scrape(flyscrape.ScrapeParams{
+ HTML: html,
+ URL: "http://localhost/",
+ Process: func(url string) ([]byte, error) {
+ return nil, nil
+ },
+ })
+ require.NoError(t, err)
+ require.Equal(t, map[string]any{
+ "url": "http://localhost/foo/",
+ "deep": "http://localhost/foo/bar",
+ }, result)
+}
+
func TestJSCompileError(t *testing.T) {
exports, err := flyscrape.Compile("import foo;", nil)
require.Error(t, err)
diff --git a/module.go b/module.go
index 47ccd31..3d3b659 100644
--- a/module.go
+++ b/module.go
@@ -63,6 +63,9 @@ func LoadModules(cfg Config) []Module {
// load standard modules in order
for _, id := range moduleOrder {
+ if _, ok := loaded[id]; ok {
+ continue
+ }
mod := modules[id].ModuleInfo().New()
if err := json.Unmarshal(cfg, mod); err != nil {
panic("failed to decode config: " + err.Error())
diff --git a/scrape.go b/scrape.go
index 1667f42..a183681 100644
--- a/scrape.go
+++ b/scrape.go
@@ -203,7 +203,13 @@ func (s *Scraper) process(url string, depth int) {
}
}()
- response.Data, err = s.ScrapeFunc(ScrapeParams{HTML: string(response.Body), URL: request.URL})
+ p := ScrapeParams{
+ HTML: string(response.Body),
+ URL: request.URL,
+ Process: s.processImmediate,
+ }
+
+ response.Data, err = s.ScrapeFunc(p)
if err != nil {
response.Error = err
return
@@ -212,6 +218,52 @@ func (s *Scraper) process(url string, depth int) {
}
}
+func (s *Scraper) processImmediate(url string) ([]byte, error) {
+ request := &Request{
+ Method: http.MethodGet,
+ URL: url,
+ Headers: http.Header{},
+ Cookies: s.Client.Jar,
+ }
+
+ for _, mod := range s.Modules {
+ if v, ok := mod.(RequestBuilder); ok {
+ v.BuildRequest(request)
+ }
+ }
+
+ req, err := http.NewRequest(request.Method, request.URL, nil)
+ if err != nil {
+ return nil, err
+ }
+ req.Header = request.Headers
+
+ for _, mod := range s.Modules {
+ if v, ok := mod.(RequestValidator); ok {
+ if !v.ValidateRequest(request) {
+ return nil, nil
+ }
+ }
+ }
+
+ resp, err := s.Client.Do(req)
+ if err != nil {
+ return nil, err
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+ return nil, fmt.Errorf("%d %s", resp.StatusCode, http.StatusText(resp.StatusCode))
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, err
+ }
+
+ return body, nil
+}
+
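
Because `processImmediate` runs nested fetches through the same `RequestBuilder` and `RequestValidator` hooks as regular requests, existing modules can shape or veto them. A minimal sketch of such a hook under that assumption; the type, its registration, and the header value are hypothetical, and only the two method shapes come from the code above:

```go
package example

import (
	"net/url"

	"github.com/philippta/flyscrape"
)

// onlyDomain is a hypothetical module hook: it stamps every outgoing request
// (RequestBuilder) and rejects nested fetches outside an allowed host
// (RequestValidator).
type onlyDomain struct{ host string }

func (m onlyDomain) BuildRequest(r *flyscrape.Request) {
	r.Headers.Set("User-Agent", "flyscrape-example")
}

func (m onlyDomain) ValidateRequest(r *flyscrape.Request) bool {
	u, err := url.Parse(r.URL)
	if err != nil {
		return false
	}
	return u.Host == m.host
}
```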
func (s *Scraper) enqueueJob(url string, depth int) {
url = strings.TrimSpace(url)
if url == "" {