From 40f59fa7b19059b441ea766f0de859c6dd52f77e Mon Sep 17 00:00:00 2001 From: Philipp Tanlak Date: Thu, 12 Oct 2023 19:21:38 +0200 Subject: Add filter function and update readme --- README.md | 179 +++++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 137 insertions(+), 42 deletions(-) (limited to 'README.md') diff --git a/README.md b/README.md index ef8182a..303371b 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,76 @@ -# flyscrape +
-flyscrape is an elegant scraping tool for efficiently extracting data from websites. Whether you're a developer, data analyst, or researcher, flyscrape empowers you to effortlessly gather information from web pages and transform it into structured data. +

+ + + + + + + +

+ +
+ +

+flyscrape is an expressive and elegant web scraper, combining the speed of Go with the
flexibility of JavaScript. — Focus on data extraction rather than request juggling. +

+ +
## Features -- **Simple and Intuitive**: **flyscrape** offers an easy-to-use command-line interface that allows you to interact with scraping scripts effortlessly. +- Domains and URL filtering +- Depth control +- Request caching +- Rate limiting +- Development mode +- Single binary executable -- **Create New Scripts**: The `new` command enables you to generate sample scraping scripts quickly, providing you with a solid starting point for your scraping endeavors. -- **Run Scripts**: Execute your scraping script using the `run` command, and watch as **flyscrape** retrieves and processes data from the specified website. +## Example script + +```javascript +export const config = { + url: "https://news.ycombinator.com/", +} + +export default function ({ doc, absoluteURL }) { + const title = doc.find("title"); + const posts = doc.find(".athing"); + + return { + title: title.text(), + posts: posts.map((post) => { + const link = post.find(".titleline > a"); + + return { + title: link.text(), + url: link.attr("href"), + }; + }), + } +} +``` -- **Watch for Development**: The `dev` command allows you to watch your scraping script for changes and quickly iterate during development, helping you find the right data extraction queries. +```bash +$ flyscrape run hackernews.js +[ + { + "title": "Hacker News", + "url": "https://news.ycombinator.com/", + "data": { + "posts": [ + { + "title": "Show HN: flyscrape - An expressive and elegant web scraper", + "url": "https://flyscrape.com" + }, + ... + ], + } + } +] +``` ## Installation @@ -26,66 +86,101 @@ To install **flyscrape**, follow these simple steps: ## Usage -**flyscrape** offers several commands to assist you in your scraping journey: +``` +$ flyscrape +flyscrape is an elegant scraping tool for efficiently extracting data from websites. 
-### Creating a New Script
+Usage:
 
-Use the `new` command to create a new scraping script:
+    flyscrape <script> [arguments]
+
+Commands:
+
+    new    creates a sample scraping script
+    run    runs a scraping script
+    dev    watches and re-runs a scraping script
 
-```bash
-flyscrape new example.js
 ```
 
-### Running a Script
+### Create a new sample scraping script
 
-Execute your scraping script using the `run` command:
+The `new` command allows you to create a new boilerplate sample script which helps you get started.
 
-```bash
-flyscrape run example.js
+```
+flyscrape new example.js
 ```
 
-### Watching for Development
+### Watch the script for changes during development
 
-The `dev` command allows you to watch your scraping script for changes and quickly iterate during development:
+The `dev` command allows you to watch your scraping script for changes and quickly iterate during development. In development mode, flyscrape will not follow any links and request caching is enabled.
 
-```bash
+```
 flyscrape dev example.js
 ```
 
-## Example Script
+### Run the scraping script
+
+The `run` command allows you to run your script to its fullest extent.
+
+```
+flyscrape run example.js
+```
+
+## Configuration
 
 Below is an example scraping script that showcases the capabilities of **flyscrape**:
 
 ```javascript
 export const config = {
-    url: "https://news.ycombinator.com/", // Specify the URL to start scraping from.
-    // depth: 0,           // Specify how deep links should be followed.  (default = 0, no follow)
-    // allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
-    // blockedDomains: [], // Specify the blocked domains.                (default = none)
-    // allowedURLs: [],    // Specify the allowed URLs as regex.          (default = all allowed)
-    // blockedURLs: [],    // Specify the blocked URLs as regex.          (default = non blocked)
-    // rate: 100,          // Specify the rate in requests per second.    (default = 100)
-    // cache: "file",      // Enable file-based request caching. 
(default = no cache) + url: "https://example.com/", // Specify the URL to start scraping from. + depth: 0, // Specify how deep links should be followed. (default = 0, no follow) + allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url) + blockedDomains: [], // Specify the blocked domains. (default = none) + allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed) + blockedURLs: [], // Specify the blocked URLs as regex. (default = none) + rate: 100, // Specify the rate in requests per second. (default = no rate limit) + cache: "file", // Enable file-based request caching. (default = no cache) }; -export default function({ doc, absoluteURL }) { - const title = doc.find("title"); - const posts = doc.find(".athing"); - - return { - title: title.text(), - posts: posts.map((post) => { - const link = post.find(".titleline > a"); - - return { - title: link.text(), - url: absoluteURL(link.attr("href")), - }; - }), - }; +export default function ({ doc, url, absoluteURL }) { + // doc - Contains the parsed HTML document + // url - Contains the scraped URL + // absoluteURL(...) - Transforms relative URLs into absolute URLs } ``` +## Query API + +```javascript +//
Hey
+const el = doc.find(".element") +el.text() // "Hey" +el.html() // `
Hey
` +el.attr("foo") // "bar" +el.hasAttr("foo") // true +el.hasClass("element") // true + +// +const list = doc.find("ul") +list.children() // [
  • Item 1
  • ,
  • Item 2
  • ,
  • Item 3
  • ] + +const items = list.find("li") +items.length() // 3 +items.first() //
  • Item 1
  • +items.last() //
  • Item 3
  • +items.get(1) //
  • Item 2
  • +items.get(1).prev() //
  • Item 1
  • +items.get(1).next() //
  • Item 3
  • +items.get(1).parent() // +items.get(1).siblings() // [
  • Item 1
  • ,
  • Item 2
  • ,
  • Item 3
  • ] +items.map(item => item.text()) // ["Item 1", "Item 2", "Item 3"] +items.filter(item => item.hasClass("a")) // [
  • Item 1
  • ] +``` + ## Contributing We welcome contributions from the community! If you encounter any issues or have suggestions for improvement, please [submit an issue](https://github.com/philippta/flyscrape/issues). -- cgit v1.2.3