-rw-r--r--   README.md                              19
-rw-r--r--   examples/hackernews_with_comments.js   23
-rw-r--r--   js.go                                  49
-rw-r--r--   js_lib_test.go                          7
-rw-r--r--   js_test.go                             92
-rw-r--r--   module.go                               3
-rw-r--r--   scrape.go                              54
7 files changed, 229 insertions, 18 deletions
diff --git a/README.md b/README.md
@@ -37,6 +37,7 @@ Flyscrape is a command-line web scraping tool designed for those without a
 - **Scriptable:** Use JavaScript to write your data extraction logic.
 - **System Cookies:** Give Flyscrape access to your browsers cookie store.
 - **Browser Mode:** Render JavaScript heavy pages using a headless Browser.
+- **Nested Scraping:** Extract data from linked pages within a single scrape.

 ## Overview

@@ -259,10 +260,20 @@ export const config = {
   },
 };

-export default function ({ doc, url, absoluteURL }) {
-  // doc - Contains the parsed HTML document
-  // url - Contains the scraped URL
-  // absoluteURL(...) - Transforms relative URLs into absolute URLs
+export default function ({ doc, url, absoluteURL, scrape }) {
+  // doc
+  // Contains the parsed HTML document.
+
+  // url
+  // Contains the scraped URL.
+
+  // absoluteURL("/foo")
+  // Transforms a relative URL into absolute URL.
+
+  // scrape(url, function({ doc, url, absoluteURL, scrape }) {
+  //   return { ... };
+  // })
+  // Scrapes a linked page and returns the scrape result.
 }
 ```
diff --git a/examples/hackernews_with_comments.js b/examples/hackernews_with_comments.js
new file mode 100644
index 0000000..8d9cfb5
--- /dev/null
+++ b/examples/hackernews_with_comments.js
@@ -0,0 +1,23 @@
+export const config = {
+  url: "https://news.ycombinator.com/",
+};
+
+export default function({ doc, scrape }) {
+  const post = doc.find(".athing.submission").first();
+  const title = post.find(".titleline > a").text();
+  const commentsLink = post.next().find("a").last().attr("href");
+
+  const comments = scrape(commentsLink, function({ doc }) {
+    return doc.find(".comtr").map(comment => {
+      return {
+        author: comment.find(".hnuser").text(),
+        text: comment.find(".commtext").text(),
+      };
+    });
+  });
+
+  return {
+    title,
+    comments,
+  };
+}
diff --git a/js.go b/js.go
@@ -27,8 +27,9 @@ var ScriptTemplate []byte
 type Config []byte

 type ScrapeParams struct {
-	HTML string
-	URL  string
+	HTML    string
+	URL     string
+	Process func(url string) ([]byte, error)
 }

 type ScrapeFunc func(ScrapeParams) (any, error)
@@ -167,26 +168,21 @@ func scrape(vm *goja.Runtime) (ScrapeFunc, error) {
 		return nil, errors.New("failed to export scrape function")
 	}

-	return func(p ScrapeParams) (any, error) {
-		lock.Lock()
-		defer lock.Unlock()
-
+	var newArg func(p ScrapeParams) (*goja.Object, error)
+	newArg = func(p ScrapeParams) (*goja.Object, error) {
 		doc, err := DocumentFromString(p.HTML)
 		if err != nil {
-			log.Println(err)
 			return nil, err
 		}

 		baseurl, err := url.Parse(p.URL)
 		if err != nil {
-			log.Println(err)
 			return nil, err
 		}

 		absoluteURL := func(ref string) string {
 			abs, err := baseurl.Parse(ref)
 			if err != nil {
-				log.Println(err)
 				return ref
 			}
 			return abs.String()
@@ -196,8 +192,41 @@ func scrape(vm *goja.Runtime) (ScrapeFunc, error) {
 		o.Set("url", p.URL)
 		o.Set("doc", doc)
 		o.Set("absoluteURL", absoluteURL)
+		o.Set("scrape", func(url string, f func(goja.FunctionCall) goja.Value) goja.Value {
+			url = absoluteURL(url)
+
+			html, err := p.Process(url)
+			if err != nil {
+				return vm.ToValue(map[string]any{"error": err.Error()})
+			}
+
+			newp := ScrapeParams{
+				HTML:    string(html),
+				URL:     url,
+				Process: p.Process,
+			}
+
+			arg, err := newArg(newp)
+			if err != nil {
+				return vm.ToValue(map[string]any{"error": err.Error()})
+			}
+
+			return f(goja.FunctionCall{Arguments: []goja.Value{arg}})
+		})
+
+		return o, nil
+	}
+
+	return func(p ScrapeParams) (any, error) {
+		lock.Lock()
+		defer lock.Unlock()
+
+		arg, err := newArg(p)
+		if err != nil {
+			return nil, err
+		}

-		ret := scrapefn(goja.FunctionCall{Arguments: []goja.Value{o}})
+		ret := scrapefn(goja.FunctionCall{Arguments: []goja.Value{arg}})
 		if goja.IsUndefined(ret) {
 			return nil, nil
 		}
diff --git a/js_lib_test.go b/js_lib_test.go
index aca8ce9..ad19380 100644
--- a/js_lib_test.go
+++ b/js_lib_test.go
@@ -8,6 +8,7 @@ import (
 	"encoding/json"
 	"net/http"
 	"os"
+	"sync/atomic"
 	"testing"

 	"github.com/philippta/flyscrape"
@@ -203,10 +204,10 @@ func TestJSLibHTTPDownload(t *testing.T) {
 	http.download("https://example.com/404.txt");
 	`

-	nreqs := 0
+	var nreqs atomic.Int32
 	client := &http.Client{
 		Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
-			nreqs++
+			nreqs.Add(1)

 			if r.URL.Path == "/content-disposition" {
 				resp, err := flyscrape.MockResponse(200, "hello world")
@@ -233,7 +234,7 @@ func TestJSLibHTTPDownload(t *testing.T) {

 	wait()

-	require.Equal(t, nreqs, 8)
+	require.Equal(t, nreqs.Load(), int32(8))
 	require.FileExists(t, "foo.txt")
 	require.FileExists(t, "dir/my-foo.txt")
 	require.FileExists(t, "dir/bar.txt")
diff --git a/js_test.go b/js_test.go
@@ -168,6 +168,98 @@ func TestJSScrapeNaN(t *testing.T) {
 	require.Nil(t, result)
 }

+func TestJSScrapeParamURL(t *testing.T) {
+	js := `
+	export default function({ url }) {
+		return url;
+	}
+	`
+	exports, err := flyscrape.Compile(js, nil)
+	require.NoError(t, err)
+
+	result, err := exports.Scrape(flyscrape.ScrapeParams{
+		HTML: html,
+		URL:  "http://localhost/",
+	})
+	require.NoError(t, err)
+	require.Equal(t, "http://localhost/", result)
+}
+
+func TestJSScrapeParamAbsoluteURL(t *testing.T) {
+	js := `
+	export default function({ absoluteURL }) {
+		return absoluteURL("/foo");
+	}
+	`
+	exports, err := flyscrape.Compile(js, nil)
+	require.NoError(t, err)
+
+	result, err := exports.Scrape(flyscrape.ScrapeParams{
+		HTML: html,
+		URL:  "http://localhost/",
+	})
+	require.NoError(t, err)
+	require.Equal(t, "http://localhost/foo", result)
+}
+
+func TestJSScrapeParamScrape(t *testing.T) {
+	js := `
+	export default function({ scrape }) {
+		return scrape("/foo", function({ url }) {
+			return {
+				url: url,
+				foo: "bar",
+			};
+		});
+	}
+	`
+	exports, err := flyscrape.Compile(js, nil)
+	require.NoError(t, err)
+
+	result, err := exports.Scrape(flyscrape.ScrapeParams{
+		HTML: html,
+		URL:  "http://localhost/",
+		Process: func(url string) ([]byte, error) {
+			return nil, nil
+		},
+	})
+	require.NoError(t, err)
+	require.Equal(t, map[string]any{
+		"url": "http://localhost/foo",
+		"foo": "bar",
+	}, result)
+}
+
+func TestJSScrapeParamScrapeDeep(t *testing.T) {
+	js := `
+	export default function({ scrape }) {
+		return scrape("/foo/", function({ url, scrape }) {
+			return {
+				url: url,
+				deep: scrape("bar", function({ url }) {
+					return url;
+				}),
+			};
+		});
+	}
+	`
+	exports, err := flyscrape.Compile(js, nil)
+	require.NoError(t, err)
+
+	result, err := exports.Scrape(flyscrape.ScrapeParams{
+		HTML: html,
+		URL:  "http://localhost/",
+		Process: func(url string) ([]byte, error) {
+			return nil, nil
+		},
+	})
+	require.NoError(t, err)
+	require.Equal(t, map[string]any{
+		"url":  "http://localhost/foo/",
+		"deep": "http://localhost/foo/bar",
+	}, result)
+}
+
 func TestJSCompileError(t *testing.T) {
 	exports, err := flyscrape.Compile("import foo;", nil)
 	require.Error(t, err)
diff --git a/module.go b/module.go
@@ -63,6 +63,9 @@ func LoadModules(cfg Config) []Module {

 	// load standard modules in order
 	for _, id := range moduleOrder {
+		if _, ok := loaded[id]; ok {
+			continue
+		}
 		mod := modules[id].ModuleInfo().New()
 		if err := json.Unmarshal(cfg, mod); err != nil {
 			panic("failed to decode config: " + err.Error())
diff --git a/scrape.go b/scrape.go
@@ -203,7 +203,13 @@ func (s *Scraper) process(url string, depth int) {
 		}
 	}()

-	response.Data, err = s.ScrapeFunc(ScrapeParams{HTML: string(response.Body), URL: request.URL})
+	p := ScrapeParams{
+		HTML:    string(response.Body),
+		URL:     request.URL,
+		Process: s.processImmediate,
+	}
+
+	response.Data, err = s.ScrapeFunc(p)
 	if err != nil {
 		response.Error = err
 		return
@@ -212,6 +218,52 @@ func (s *Scraper) process(url string, depth int) {
 	}
 }

+func (s *Scraper) processImmediate(url string) ([]byte, error) {
+	request := &Request{
+		Method:  http.MethodGet,
+		URL:     url,
+		Headers: http.Header{},
+		Cookies: s.Client.Jar,
+	}
+
+	for _, mod := range s.Modules {
+		if v, ok := mod.(RequestBuilder); ok {
+			v.BuildRequest(request)
+		}
+	}
+
+	req, err := http.NewRequest(request.Method, request.URL, nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header = request.Headers
+
+	for _, mod := range s.Modules {
+		if v, ok := mod.(RequestValidator); ok {
+			if !v.ValidateRequest(request) {
+				return nil, nil
+			}
+		}
+	}
+
+	resp, err := s.Client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return nil, fmt.Errorf("%d %s", resp.StatusCode, http.StatusText(resp.StatusCode))
+	}
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, err
+	}
+
+	return body, nil
+}
+
 func (s *Scraper) enqueueJob(url string, depth int) {
 	url = strings.TrimSpace(url)
 	if url == "" {