diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2024-02-24 22:41:27 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-02-24 22:41:27 +0100 |
| commit | a1196ef273a94567bba73fc4a06ce6901105627c (patch) | |
| tree | 634acbfb79d29e4b28f794964f634fea12406754 | |
| parent | aff32d2072b691e4308953e28640d544c377d69d (diff) | |
docs: Add demo
| -rw-r--r-- | .github/assets/flyscrape-demo.jpg | bin | 0 -> 197366 bytes | |||
| -rw-r--r-- | README.md | 14 | ||||
| -rw-r--r-- | template.js | 130 |
3 files changed, 76 insertions, 68 deletions
diff --git a/.github/assets/flyscrape-demo.jpg b/.github/assets/flyscrape-demo.jpg Binary files differ new file mode 100644 index 0000000..c17bdef --- /dev/null +++ b/.github/assets/flyscrape-demo.jpg @@ -13,21 +13,29 @@ <br /> <p align="center"> -<b>flyscrape</b> is a standalone and scriptable web scraper, combining the speed of Go with the flexibility of JavaScript. — Focus on data extraction rather than request juggling. +Flyscrape is a command-line web scraping tool designed for those without <br />advanced programming skills, enabling precise extraction of website data. </p> <br /> <p align="center"> -<a href="#installation">Installation</a> · <a href="https://flyscrape.com/docs/">Documentation</a> · <a href="https://github.com/philippta/flyscrape/releases">Releases</a> +<a href="#installation">Installation</a> · <a href="https://flyscrape.com/docs/getting-started">Documentation</a> · <a href="https://github.com/philippta/flyscrape/releases">Releases</a> </p> + +## Demo + + +<a href="https://www.youtube.com/watch?v=Yj11v4n9JHA"> + <img src=".github/assets/flyscrape-demo.jpg" style="border-radius: 6px"> +</a> + ## Features - **Standalone:** Flyscrape comes as a single binary executable. - **jQuery-like:** Extract data from HTML pages with a familiar API. - **Scriptable:** Use JavaScript to write your data extraction logic. -- **Tons of features:** 20 features to configure the scraping behavior. +- **System Cookies:** Give Flyscrape access to your browsers cookie store. - **Browser Mode:** Render JavaScript heavy pages using a headless Browser. ## Overview diff --git a/template.js b/template.js index b466a4e..36bb909 100644 --- a/template.js +++ b/template.js @@ -1,76 +1,76 @@ export const config = { - // Specify the URL to start scraping from. - url: "https://example.com/", + // Specify the URL to start scraping from. + url: "https://example.com/", - // Enable rendering with headless browser. 
(default = false) - // browser: true, + // Enable rendering with headless browser. (default = false) + // browser: true, - // Specify if browser should be headless or not. (default = true) - // headless: false, + // Specify if browser should be headless or not. (default = true) + // headless: false, - // Specify the multiple URLs to start scraping from. (default = []) - // urls: [ - // "https://anothersite.com/", - // "https://yetanother.com/", - // ], + // Specify the multiple URLs to start scraping from. (default = []) + // urls: [ + // "https://anothersite.com/", + // "https://yetanother.com/", + // ], - // Specify how deep links should be followed. (default = 0, no follow) - // depth: 5, + // Specify how deep links should be followed. (default = 0, no follow) + // depth: 5, - // Speficy the css selectors to follow. (default = ["a[href]"]) - // follow: [".next > a", ".related a"], - - // Specify the allowed domains. ['*'] for all. (default = domain from url) - // allowedDomains: ["example.com", "anothersite.com"], - - // Specify the blocked domains. (default = none) - // blockedDomains: ["somesite.com"], + // Speficy the css selectors to follow. (default = ["a[href]"]) + // follow: [".next > a", ".related a"], + + // Specify the allowed domains. ['*'] for all. (default = domain from url) + // allowedDomains: ["example.com", "anothersite.com"], - // Specify the allowed URLs as regex. (default = all allowed) - // allowedURLs: ["/posts", "/articles/\d+"], + // Specify the blocked domains. (default = none) + // blockedDomains: ["somesite.com"], + + // Specify the allowed URLs as regex. (default = all allowed) + // allowedURLs: ["/posts", "/articles/\d+"], + + // Specify the blocked URLs as regex. (default = none) + // blockedURLs: ["/admin"], - // Specify the blocked URLs as regex. (default = none) - // blockedURLs: ["/admin"], - - // Specify the rate in requests per minute. (default = no rate limit) - // rate: 60, - - // Specify the number of concurrent requests. 
(default = no limit) - // concurrency: 1, - - // Specify a single HTTP(S) proxy URL. (default = no proxy) - // Note: Not compatible with browser mode. - // proxy: "http://someproxy.com:8043", - - // Specify multiple HTTP(S) proxy URLs. (default = no proxy) - // Note: Not compatible with browser mode. - // proxies: [ - // "http://someproxy.com:8043", - // "http://someotherproxy.com:8043", - // ], - - // Enable file-based request caching. (default = no cache) - // cache: "file", - - // Specify the HTTP request header. (default = none) - // headers: { - // "Authorization": "Bearer ...", - // "User-Agent": "Mozilla ...", - // }, - - // Use the cookie store of your local browser. (default = off) - // Options: "chrome" | "edge" | "firefox" - // cookies: "chrome", - - // Specify the output options. - // output: { - // // Specify the output file. (default = stdout) - // file: "results.json", - // - // // Specify the output format. (default = json) - // // Options: "json" | "ndjson" - // format: "json", - // }, + // Specify the rate in requests per minute. (default = no rate limit) + // rate: 60, + + // Specify the number of concurrent requests. (default = no limit) + // concurrency: 1, + + // Specify a single HTTP(S) proxy URL. (default = no proxy) + // Note: Not compatible with browser mode. + // proxy: "http://someproxy.com:8043", + + // Specify multiple HTTP(S) proxy URLs. (default = no proxy) + // Note: Not compatible with browser mode. + // proxies: [ + // "http://someproxy.com:8043", + // "http://someotherproxy.com:8043", + // ], + + // Enable file-based request caching. (default = no cache) + // cache: "file", + + // Specify the HTTP request header. (default = none) + // headers: { + // "Authorization": "Bearer ...", + // "User-Agent": "Mozilla ...", + // }, + + // Use the cookie store of your local browser. (default = off) + // Options: "chrome" | "edge" | "firefox" + // cookies: "chrome", + + // Specify the output options. 
+ // output: { + // // Specify the output file. (default = stdout) + // file: "results.json", + // + // // Specify the output format. (default = json) + // // Options: "json" | "ndjson" + // format: "json", + // }, }; export default function({ doc, absoluteURL }) { |