From a1196ef273a94567bba73fc4a06ce6901105627c Mon Sep 17 00:00:00 2001 From: Philipp Tanlak Date: Sat, 24 Feb 2024 22:41:27 +0100 Subject: docs: Add demo --- .github/assets/flyscrape-demo.jpg | Bin 0 -> 197366 bytes README.md | 14 +++- template.js | 130 +++++++++++++++++++------------------- 3 files changed, 76 insertions(+), 68 deletions(-) create mode 100644 .github/assets/flyscrape-demo.jpg diff --git a/.github/assets/flyscrape-demo.jpg b/.github/assets/flyscrape-demo.jpg new file mode 100644 index 0000000..c17bdef Binary files /dev/null and b/.github/assets/flyscrape-demo.jpg differ diff --git a/README.md b/README.md index 9f3c8bb..759e8a2 100644 --- a/README.md +++ b/README.md @@ -13,21 +13,29 @@

-flyscrape is a standalone and scriptable web scraper, combining the speed of Go with the flexibility of JavaScript. — Focus on data extraction rather than request juggling. +Flyscrape is a command-line web scraping tool designed for those without
advanced programming skills, enabling precise extraction of website data.


-Installation · Documentation · Releases +Installation · Documentation · Releases

+ +## Demo + + + + + + ## Features - **Standalone:** Flyscrape comes as a single binary executable. - **jQuery-like:** Extract data from HTML pages with a familiar API. - **Scriptable:** Use JavaScript to write your data extraction logic. -- **Tons of features:** 20 features to configure the scraping behavior. +- **System Cookies:** Give Flyscrape access to your browser's cookie store. - **Browser Mode:** Render JavaScript heavy pages using a headless Browser. ## Overview diff --git a/template.js b/template.js index b466a4e..36bb909 100644 --- a/template.js +++ b/template.js @@ -1,76 +1,76 @@ export const config = { - // Specify the URL to start scraping from. - url: "https://example.com/", + // Specify the URL to start scraping from. + url: "https://example.com/", - // Enable rendering with headless browser. (default = false) - // browser: true, + // Enable rendering with headless browser. (default = false) + // browser: true, - // Specify if browser should be headless or not. (default = true) - // headless: false, + // Specify if browser should be headless or not. (default = true) + // headless: false, - // Specify the multiple URLs to start scraping from. (default = []) - // urls: [ - // "https://anothersite.com/", - // "https://yetanother.com/", - // ], + // Specify the multiple URLs to start scraping from. (default = []) + // urls: [ + // "https://anothersite.com/", + // "https://yetanother.com/", + // ], - // Specify how deep links should be followed. (default = 0, no follow) - // depth: 5, + // Specify how deep links should be followed. (default = 0, no follow) + // depth: 5, - // Speficy the css selectors to follow. (default = ["a[href]"]) - // follow: [".next > a", ".related a"], - - // Specify the allowed domains. ['*'] for all. (default = domain from url) - // allowedDomains: ["example.com", "anothersite.com"], - - // Specify the blocked domains. (default = none) - // blockedDomains: ["somesite.com"], + // Specify the css selectors to follow. 
(default = ["a[href]"]) + // follow: [".next > a", ".related a"], + + // Specify the allowed domains. ['*'] for all. (default = domain from url) + // allowedDomains: ["example.com", "anothersite.com"], - // Specify the allowed URLs as regex. (default = all allowed) - // allowedURLs: ["/posts", "/articles/\d+"], + // Specify the blocked domains. (default = none) + // blockedDomains: ["somesite.com"], + + // Specify the allowed URLs as regex. (default = all allowed) + // allowedURLs: ["/posts", "/articles/\\d+"], + + // Specify the blocked URLs as regex. (default = none) + // blockedURLs: ["/admin"], - // Specify the blocked URLs as regex. (default = none) - // blockedURLs: ["/admin"], - - // Specify the rate in requests per minute. (default = no rate limit) - // rate: 60, - - // Specify the number of concurrent requests. (default = no limit) - // concurrency: 1, - - // Specify a single HTTP(S) proxy URL. (default = no proxy) - // Note: Not compatible with browser mode. - // proxy: "http://someproxy.com:8043", - - // Specify multiple HTTP(S) proxy URLs. (default = no proxy) - // Note: Not compatible with browser mode. - // proxies: [ - // "http://someproxy.com:8043", - // "http://someotherproxy.com:8043", - // ], - - // Enable file-based request caching. (default = no cache) - // cache: "file", - - // Specify the HTTP request header. (default = none) - // headers: { - // "Authorization": "Bearer ...", - // "User-Agent": "Mozilla ...", - // }, - - // Use the cookie store of your local browser. (default = off) - // Options: "chrome" | "edge" | "firefox" - // cookies: "chrome", - - // Specify the output options. - // output: { - // // Specify the output file. (default = stdout) - // file: "results.json", - // - // // Specify the output format. (default = json) - // // Options: "json" | "ndjson" - // format: "json", - // }, + // Specify the rate in requests per minute. (default = no rate limit) + // rate: 60, + + // Specify the number of concurrent requests. 
(default = no limit) + // concurrency: 1, + + // Specify a single HTTP(S) proxy URL. (default = no proxy) + // Note: Not compatible with browser mode. + // proxy: "http://someproxy.com:8043", + + // Specify multiple HTTP(S) proxy URLs. (default = no proxy) + // Note: Not compatible with browser mode. + // proxies: [ + // "http://someproxy.com:8043", + // "http://someotherproxy.com:8043", + // ], + + // Enable file-based request caching. (default = no cache) + // cache: "file", + + // Specify the HTTP request header. (default = none) + // headers: { + // "Authorization": "Bearer ...", + // "User-Agent": "Mozilla ...", + // }, + + // Use the cookie store of your local browser. (default = off) + // Options: "chrome" | "edge" | "firefox" + // cookies: "chrome", + + // Specify the output options. + // output: { + // // Specify the output file. (default = stdout) + // file: "results.json", + // + // // Specify the output format. (default = json) + // // Options: "json" | "ndjson" + // format: "json", + // }, }; export default function({ doc, absoluteURL }) { -- cgit v1.2.3