summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.github/assets/flyscrape-demo.jpgbin0 -> 197366 bytes
-rw-r--r--README.md14
-rw-r--r--template.js130
3 files changed, 76 insertions, 68 deletions
diff --git a/.github/assets/flyscrape-demo.jpg b/.github/assets/flyscrape-demo.jpg
new file mode 100644
index 0000000..c17bdef
--- /dev/null
+++ b/.github/assets/flyscrape-demo.jpg
Binary files differ
diff --git a/README.md b/README.md
index 9f3c8bb..759e8a2 100644
--- a/README.md
+++ b/README.md
@@ -13,21 +13,29 @@
<br />
<p align="center">
-<b>flyscrape</b> is a standalone and scriptable web scraper, combining the speed of Go with the flexibility of JavaScript. — Focus on data extraction rather than request juggling.
+Flyscrape is a command-line web scraping tool designed for those without <br />advanced programming skills, enabling precise extraction of website data.
</p>
<br />
<p align="center">
-<a href="#installation">Installation</a> · <a href="https://flyscrape.com/docs/">Documentation</a> · <a href="https://github.com/philippta/flyscrape/releases">Releases</a>
+<a href="#installation">Installation</a> · <a href="https://flyscrape.com/docs/getting-started">Documentation</a> · <a href="https://github.com/philippta/flyscrape/releases">Releases</a>
</p>
+
+## Demo
+
+
+<a href="https://www.youtube.com/watch?v=Yj11v4n9JHA">
+ <img src=".github/assets/flyscrape-demo.jpg" style="border-radius: 6px">
+</a>
+
## Features
- **Standalone:** Flyscrape comes as a single binary executable.
- **jQuery-like:** Extract data from HTML pages with a familiar API.
- **Scriptable:** Use JavaScript to write your data extraction logic.
-- **Tons of features:** 20 features to configure the scraping behavior.
+- **System Cookies:** Give Flyscrape access to your browser's cookie store.
- **Browser Mode:** Render JavaScript-heavy pages using a headless browser.
## Overview
diff --git a/template.js b/template.js
index b466a4e..36bb909 100644
--- a/template.js
+++ b/template.js
@@ -1,76 +1,76 @@
export const config = {
- // Specify the URL to start scraping from.
- url: "https://example.com/",
+ // Specify the URL to start scraping from.
+ url: "https://example.com/",
- // Enable rendering with headless browser. (default = false)
- // browser: true,
+ // Enable rendering with headless browser. (default = false)
+ // browser: true,
- // Specify if browser should be headless or not. (default = true)
- // headless: false,
+ // Specify if browser should be headless or not. (default = true)
+ // headless: false,
- // Specify the multiple URLs to start scraping from. (default = [])
- // urls: [
- // "https://anothersite.com/",
- // "https://yetanother.com/",
- // ],
+ // Specify the multiple URLs to start scraping from. (default = [])
+ // urls: [
+ // "https://anothersite.com/",
+ // "https://yetanother.com/",
+ // ],
- // Specify how deep links should be followed. (default = 0, no follow)
- // depth: 5,
+ // Specify how deep links should be followed. (default = 0, no follow)
+ // depth: 5,
- // Speficy the css selectors to follow. (default = ["a[href]"])
- // follow: [".next > a", ".related a"],
-
- // Specify the allowed domains. ['*'] for all. (default = domain from url)
- // allowedDomains: ["example.com", "anothersite.com"],
-
- // Specify the blocked domains. (default = none)
- // blockedDomains: ["somesite.com"],
+ // Specify the CSS selectors to follow. (default = ["a[href]"])
+ // follow: [".next > a", ".related a"],
+
+ // Specify the allowed domains. ['*'] for all. (default = domain from url)
+ // allowedDomains: ["example.com", "anothersite.com"],
- // Specify the allowed URLs as regex. (default = all allowed)
- // allowedURLs: ["/posts", "/articles/\d+"],
+ // Specify the blocked domains. (default = none)
+ // blockedDomains: ["somesite.com"],
+
+ // Specify the allowed URLs as regex. (default = all allowed)
+ // allowedURLs: ["/posts", "/articles/\\d+"],
+
+ // Specify the blocked URLs as regex. (default = none)
+ // blockedURLs: ["/admin"],
- // Specify the blocked URLs as regex. (default = none)
- // blockedURLs: ["/admin"],
-
- // Specify the rate in requests per minute. (default = no rate limit)
- // rate: 60,
-
- // Specify the number of concurrent requests. (default = no limit)
- // concurrency: 1,
-
- // Specify a single HTTP(S) proxy URL. (default = no proxy)
- // Note: Not compatible with browser mode.
- // proxy: "http://someproxy.com:8043",
-
- // Specify multiple HTTP(S) proxy URLs. (default = no proxy)
- // Note: Not compatible with browser mode.
- // proxies: [
- // "http://someproxy.com:8043",
- // "http://someotherproxy.com:8043",
- // ],
-
- // Enable file-based request caching. (default = no cache)
- // cache: "file",
-
- // Specify the HTTP request header. (default = none)
- // headers: {
- // "Authorization": "Bearer ...",
- // "User-Agent": "Mozilla ...",
- // },
-
- // Use the cookie store of your local browser. (default = off)
- // Options: "chrome" | "edge" | "firefox"
- // cookies: "chrome",
-
- // Specify the output options.
- // output: {
- // // Specify the output file. (default = stdout)
- // file: "results.json",
- //
- // // Specify the output format. (default = json)
- // // Options: "json" | "ndjson"
- // format: "json",
- // },
+ // Specify the rate in requests per minute. (default = no rate limit)
+ // rate: 60,
+
+ // Specify the number of concurrent requests. (default = no limit)
+ // concurrency: 1,
+
+ // Specify a single HTTP(S) proxy URL. (default = no proxy)
+ // Note: Not compatible with browser mode.
+ // proxy: "http://someproxy.com:8043",
+
+ // Specify multiple HTTP(S) proxy URLs. (default = no proxy)
+ // Note: Not compatible with browser mode.
+ // proxies: [
+ // "http://someproxy.com:8043",
+ // "http://someotherproxy.com:8043",
+ // ],
+
+ // Enable file-based request caching. (default = no cache)
+ // cache: "file",
+
+ // Specify the HTTP request header. (default = none)
+ // headers: {
+ // "Authorization": "Bearer ...",
+ // "User-Agent": "Mozilla ...",
+ // },
+
+ // Use the cookie store of your local browser. (default = off)
+ // Options: "chrome" | "edge" | "firefox"
+ // cookies: "chrome",
+
+ // Specify the output options.
+ // output: {
+ // // Specify the output file. (default = stdout)
+ // file: "results.json",
+ //
+ // // Specify the output format. (default = json)
+ // // Options: "json" | "ndjson"
+ // format: "json",
+ // },
};
export default function({ doc, absoluteURL }) {