diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2025-01-10 12:49:32 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-01-10 12:49:32 +0100 |
| commit | bf99c233a18c3165e0d4d251b41224e5bc6eb93d (patch) | |
| tree | d32f0fd0770a049552cdd0d51e9402d594e9a35e /js.go | |
| parent | 924184f37ef0d3e244f8e8991c259affb45d0ae2 (diff) | |
Implement nested scraping (#81)
Diffstat (limited to 'js.go')
| -rw-r--r-- | js.go | 49 |
1 file changed, 39 insertions, 10 deletions
```diff
@@ -27,8 +27,9 @@ var ScriptTemplate []byte
 type Config []byte
 
 type ScrapeParams struct {
-	HTML string
-	URL  string
+	HTML    string
+	URL     string
+	Process func(url string) ([]byte, error)
 }
 
 type ScrapeFunc func(ScrapeParams) (any, error)
@@ -167,26 +168,21 @@ func scrape(vm *goja.Runtime) (ScrapeFunc, error) {
 		return nil, errors.New("failed to export scrape function")
 	}
 
-	return func(p ScrapeParams) (any, error) {
-		lock.Lock()
-		defer lock.Unlock()
-
+	var newArg func(p ScrapeParams) (*goja.Object, error)
+	newArg = func(p ScrapeParams) (*goja.Object, error) {
 		doc, err := DocumentFromString(p.HTML)
 		if err != nil {
-			log.Println(err)
 			return nil, err
 		}
 
 		baseurl, err := url.Parse(p.URL)
 		if err != nil {
-			log.Println(err)
 			return nil, err
 		}
 
 		absoluteURL := func(ref string) string {
 			abs, err := baseurl.Parse(ref)
 			if err != nil {
-				log.Println(err)
 				return ref
 			}
 			return abs.String()
@@ -196,8 +192,41 @@ func scrape(vm *goja.Runtime) (ScrapeFunc, error) {
 		o.Set("url", p.URL)
 		o.Set("doc", doc)
 		o.Set("absoluteURL", absoluteURL)
+		o.Set("scrape", func(url string, f func(goja.FunctionCall) goja.Value) goja.Value {
+			url = absoluteURL(url)
+
+			html, err := p.Process(url)
+			if err != nil {
+				return vm.ToValue(map[string]any{"error": err.Error()})
+			}
+
+			newp := ScrapeParams{
+				HTML:    string(html),
+				URL:     url,
+				Process: p.Process,
+			}
+
+			arg, err := newArg(newp)
+			if err != nil {
+				return vm.ToValue(map[string]any{"error": err.Error()})
+			}
+
+			return f(goja.FunctionCall{Arguments: []goja.Value{arg}})
+		})
+
+		return o, nil
+	}
+
+	return func(p ScrapeParams) (any, error) {
+		lock.Lock()
+		defer lock.Unlock()
+
+		arg, err := newArg(p)
+		if err != nil {
+			return nil, err
+		}
 
-		ret := scrapefn(goja.FunctionCall{Arguments: []goja.Value{o}})
+		ret := scrapefn(goja.FunctionCall{Arguments: []goja.Value{arg}})
 		if goja.IsUndefined(ret) {
 			return nil, nil
 		}
```