From 40f59fa7b19059b441ea766f0de859c6dd52f77e Mon Sep 17 00:00:00 2001 From: Philipp Tanlak Date: Thu, 12 Oct 2023 19:21:38 +0200 Subject: Add filter function and update readme --- README.md | 179 +++++++++++++++++++++++++++++++++++++++++------------- docs/logo-alt.png | Bin 0 -> 3245 bytes docs/logo.png | Bin 0 -> 3493 bytes js.go | 24 ++++++-- 4 files changed, 156 insertions(+), 47 deletions(-) create mode 100644 docs/logo-alt.png create mode 100644 docs/logo.png diff --git a/README.md b/README.md index ef8182a..303371b 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,76 @@ -# flyscrape +
-flyscrape is an elegant scraping tool for efficiently extracting data from websites. Whether you're a developer, data analyst, or researcher, flyscrape empowers you to effortlessly gather information from web pages and transform it into structured data. +

+ + + + + + + +

+ +
+ +

+flyscrape is an expressive and elegant web scraper, combining the speed of Go with the
flexibility of JavaScript. — Focus on data extraction rather than request juggling. +

+ +
## Features -- **Simple and Intuitive**: **flyscrape** offers an easy-to-use command-line interface that allows you to interact with scraping scripts effortlessly. +- Domains and URL filtering +- Depth control +- Request caching +- Rate limiting +- Development mode +- Single binary executable -- **Create New Scripts**: The `new` command enables you to generate sample scraping scripts quickly, providing you with a solid starting point for your scraping endeavors. -- **Run Scripts**: Execute your scraping script using the `run` command, and watch as **flyscrape** retrieves and processes data from the specified website. +## Example script + +```javascript +export const config = { + url: "https://news.ycombinator.com/", +} + +export default function ({ doc, absoluteURL }) { + const title = doc.find("title"); + const posts = doc.find(".athing"); + + return { + title: title.text(), + posts: posts.map((post) => { + const link = post.find(".titleline > a"); + + return { + title: link.text(), + url: link.attr("href"), + }; + }), + } +} +``` -- **Watch for Development**: The `dev` command allows you to watch your scraping script for changes and quickly iterate during development, helping you find the right data extraction queries. +```bash +$ flyscrape run hackernews.js +[ + { + "title": "Hacker News", + "url": "https://news.ycombinator.com/", + "data": { + "posts": [ + { + "title": "Show HN: flyscrape - An expressive and elegant web scraper", + "url": "https://flyscrape.com" + }, + ... + ], + } + } +] +``` ## Installation @@ -26,66 +86,101 @@ To install **flyscrape**, follow these simple steps: ## Usage -**flyscrape** offers several commands to assist you in your scraping journey: +``` +$ flyscrape +flyscrape is an elegant scraping tool for efficiently extracting data from websites. 
-### Creating a New Script +Usage: -Use the `new` command to create a new scraping script: + flyscrape [arguments] + +Commands: + + new creates a sample scraping script + run runs a scraping script + dev watches and re-runs a scraping script -```bash -flyscrape new example.js ``` -### Running a Script +### Create a new sample scraping script -Execute your scraping script using the `run` command: +The `new` command allows you to create a new boilerplate sample script which helps you getting started. -```bash -flyscrape run example.js +``` +flyscrape new example.js ``` -### Watching for Development +### Watch the script for changes during development -The `dev` command allows you to watch your scraping script for changes and quickly iterate during development: +The `dev` command allows you to watch your scraping script for changes and quickly iterate during development. In development mode, flyscrape will not follow any links and request caching is enabled. -```bash +``` flyscrape dev example.js ``` -## Example Script +### Run the scraping script + +The `dev` command allows you to run your script to its fullest extend. + +``` +flyscrape run example.js +``` + +## Configuration Below is an example scraping script that showcases the capabilities of **flyscrape**: ```javascript export const config = { - url: "https://news.ycombinator.com/", // Specify the URL to start scraping from. - // depth: 0, // Specify how deep links should be followed. (default = 0, no follow) - // allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url) - // blockedDomains: [], // Specify the blocked domains. (default = none) - // allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed) - // blockedURLs: [], // Specify the blocked URLs as regex. (default = non blocked) - // rate: 100, // Specify the rate in requests per second. (default = 100) - // cache: "file", // Enable file-based request caching. 
(default = no cache) + url: "https://example.com/", // Specify the URL to start scraping from. + depth: 0, // Specify how deep links should be followed. (default = 0, no follow) + allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url) + blockedDomains: [], // Specify the blocked domains. (default = none) + allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed) + blockedURLs: [], // Specify the blocked URLs as regex. (default = none) + rate: 100, // Specify the rate in requests per second. (default = no rate limit) + cache: "file", // Enable file-based request caching. (default = no cache) }; -export default function({ doc, absoluteURL }) { - const title = doc.find("title"); - const posts = doc.find(".athing"); - - return { - title: title.text(), - posts: posts.map((post) => { - const link = post.find(".titleline > a"); - - return { - title: link.text(), - url: absoluteURL(link.attr("href")), - }; - }), - }; +export default function ({ doc, url, absoluteURL }) { + // doc - Contains the parsed HTML document + // url - Contains the scraped URL + // absoluteURL(...) - Transforms relative URLs into absolute URLs } ``` +## Query API + +```javascript +//
<div class="element" foo="bar">Hey</div>
+const el = doc.find(".element")
+el.text() // "Hey"
+el.html() // `<div class="element" foo="bar">Hey</div>
`
+el.attr("foo") // "bar"
+el.hasAttr("foo") // true
+el.hasClass("element") // true
+
+// <ul><li class="a">Item 1</li><li>Item 2</li><li>Item 3</li></ul>
+const list = doc.find("ul")
+list.children() // [<li class="a">Item 1</li>, <li>Item 2</li>, <li>Item 3</li>]
+
+const items = list.find("li")
+items.length() // 3
+items.first() // <li class="a">Item 1</li>
+items.last() // <li>Item 3</li>
+items.get(1) // <li>Item 2</li>
+items.get(1).prev() // <li class="a">Item 1</li>
+items.get(1).next() // <li>Item 3</li>
+items.get(1).parent() // <ul>...</ul>
+items.get(1).siblings() // [<li class="a">Item 1</li>, <li>Item 2</li>, <li>Item 3</li>]
+items.map(item => item.text()) // ["Item 1", "Item 2", "Item 3"]
+items.filter(item => item.hasClass("a")) // [<li class="a">Item 1</li>
  • ] +``` + ## Contributing We welcome contributions from the community! If you encounter any issues or have suggestions for improvement, please [submit an issue](https://github.com/philippta/flyscrape/issues). diff --git a/docs/logo-alt.png b/docs/logo-alt.png new file mode 100644 index 0000000..f136932 Binary files /dev/null and b/docs/logo-alt.png differ diff --git a/docs/logo.png b/docs/logo.png new file mode 100644 index 0000000..99b9aa1 Binary files /dev/null and b/docs/logo.png differ diff --git a/js.go b/js.go index 649fb29..d36f98a 100644 --- a/js.go +++ b/js.go @@ -18,7 +18,7 @@ import ( "github.com/PuerkitoBio/goquery" "github.com/dop251/goja" - gojaconsole "github.com/dop251/goja_nodejs/console" + "github.com/dop251/goja_nodejs/console" "github.com/dop251/goja_nodejs/require" "github.com/evanw/esbuild/pkg/api" ) @@ -50,6 +50,7 @@ func Compile(src string) (Config, ScrapeFunc, error) { if err != nil { return nil, nil, err } + return vm(src) } @@ -81,7 +82,7 @@ func vm(src string) (Config, ScrapeFunc, error) { registry := &require.Registry{} registry.Enable(vm) - gojaconsole.Enable(vm) + console.Enable(vm) if _, err := vm.RunString(removeIIFE(src)); err != nil { return nil, nil, fmt.Errorf("running user script: %w", err) @@ -101,6 +102,7 @@ func vm(src string) (Config, ScrapeFunc, error) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(p.HTML)) if err != nil { + log.Println(err) return nil, err } @@ -111,7 +113,6 @@ func vm(src string) (Config, ScrapeFunc, error) { } suffix := strconv.FormatUint(c.Add(1), 10) - vm.Set("html_"+suffix, p.HTML) vm.Set("url_"+suffix, p.URL) vm.Set("doc_"+suffix, wrap(vm, doc.Selection)) vm.Set("absurl_"+suffix, func(ref string) string { @@ -123,13 +124,15 @@ func vm(src string) (Config, ScrapeFunc, error) { return abs.String() }) - data, err := vm.RunString(fmt.Sprintf(`JSON.stringify(stdin_default({html: html_%s, doc: doc_%s, url: url_%s, absoluteURL: absurl_%s}))`, suffix, suffix, suffix, suffix)) + data, err 
:= vm.RunString(fmt.Sprintf(`JSON.stringify(stdin_default({doc: doc_%s, url: url_%s, absoluteURL: absurl_%s}))`, suffix, suffix, suffix)) if err != nil { + log.Println(err) return nil, err } var obj any if err := json.Unmarshal([]byte(data.String()), &obj); err != nil { + log.Println(err) return nil, err } @@ -141,6 +144,7 @@ func vm(src string) (Config, ScrapeFunc, error) { func wrap(vm *goja.Runtime, sel *goquery.Selection) map[string]any { o := map[string]any{} + o["WARNING"] = "Forgot to call text(), html() or attr()?" o["text"] = sel.Text o["html"] = func() string { h, _ := goquery.OuterHtml(sel); return h } o["attr"] = func(name string) string { v, _ := sel.Attr(name); return v } @@ -150,7 +154,6 @@ func wrap(vm *goja.Runtime, sel *goquery.Selection) map[string]any { o["first"] = func() map[string]any { return wrap(vm, sel.First()) } o["last"] = func() map[string]any { return wrap(vm, sel.Last()) } o["get"] = func(index int) map[string]any { return wrap(vm, sel.Eq(index)) } - o["index"] = sel.Index() o["find"] = func(s string) map[string]any { return wrap(vm, sel.Find(s)) } o["next"] = func() map[string]any { return wrap(vm, sel.Next()) } o["prev"] = func() map[string]any { return wrap(vm, sel.Prev()) } @@ -165,6 +168,17 @@ func wrap(vm *goja.Runtime, sel *goquery.Selection) map[string]any { }) return vals } + o["filter"] = func(callback func(map[string]any, int) bool) []any { + var vals []any + sel.Each(func(i int, s *goquery.Selection) { + el := wrap(vm, s) + ok := callback(el, i) + if ok { + vals = append(vals, el) + } + }) + return vals + } return o } -- cgit v1.2.3