From b1e2c8fd5cb5dfa46bc440a12eafaf56cd844b1c Mon Sep 17 00:00:00 2001
From: Philipp Tanlak
Date: Mon, 24 Nov 2025 20:54:57 +0100
Subject: Docs

---
 content/docs/configuration/_index.md         | 25 +++++++++++++++
 content/docs/configuration/browser-mode.md   | 40 +++++++++++++++++++++++
 content/docs/configuration/caching.md        | 36 +++++++++++++++++++++
 content/docs/configuration/concurrency.md    | 18 +++++++++++
 content/docs/configuration/cookies.md        | 36 +++++++++++++++++++++
 content/docs/configuration/depth.md          | 24 ++++++++++++++
 content/docs/configuration/domain-filter.md  | 43 +++++++++++++++++++++++++
 content/docs/configuration/headers.md        | 17 ++++++++++
 content/docs/configuration/link-following.md | 33 +++++++++++++++++++
 content/docs/configuration/output.md         | 47 ++++++++++++++++++++++++++++
 content/docs/configuration/proxies.md        | 33 +++++++++++++++++++
 content/docs/configuration/rate-limiting.md  | 15 +++++++++
 content/docs/configuration/retry.md          | 26 +++++++++++++++
 content/docs/configuration/starting-url.md   | 29 +++++++++++++++++
 content/docs/configuration/url-filter.md     | 42 +++++++++++++++++++++++++
 15 files changed, 464 insertions(+)
 create mode 100644 content/docs/configuration/_index.md
 create mode 100644 content/docs/configuration/browser-mode.md
 create mode 100644 content/docs/configuration/caching.md
 create mode 100644 content/docs/configuration/concurrency.md
 create mode 100644 content/docs/configuration/cookies.md
 create mode 100644 content/docs/configuration/depth.md
 create mode 100644 content/docs/configuration/domain-filter.md
 create mode 100644 content/docs/configuration/headers.md
 create mode 100644 content/docs/configuration/link-following.md
 create mode 100644 content/docs/configuration/output.md
 create mode 100644 content/docs/configuration/proxies.md
 create mode 100644 content/docs/configuration/rate-limiting.md
 create mode 100644 content/docs/configuration/retry.md
 create mode 100644 content/docs/configuration/starting-url.md
 create mode 100644 content/docs/configuration/url-filter.md
(limited to 'content/docs/configuration')

diff --git a/content/docs/configuration/_index.md b/content/docs/configuration/_index.md
new file mode 100644
index 0000000..ac27c8b
--- /dev/null
+++ b/content/docs/configuration/_index.md
@@ -0,0 +1,25 @@
+---
+title: 'Configuration'
+weight: 4
+sidebar:
+  open: true
+next: '/docs/configuration/starting-url'
+prev: '/docs/api-reference'
+---
+
+{{< cards >}}
+  {{< card link="starting-url" title="Starting URL" icon="play" >}}
+  {{< card link="depth" title="Depth" icon="arrow-down" >}}
+  {{< card link="domain-filter" title="Domain Filter" icon="cube-transparent" >}}
+  {{< card link="url-filter" title="URL Filter" icon="sparkles" >}}
+  {{< card link="link-following" title="Link Following" icon="link" >}}
+  {{< card link="concurrency" title="Concurrency" icon="paper-airplane" >}}
+  {{< card link="rate-limiting" title="Rate Limiting" icon="chart-square-bar" >}}
+  {{< card link="retry" title="Retry" icon="refresh" >}}
+  {{< card link="caching" title="Caching" icon="template" >}}
+  {{< card link="proxies" title="Proxies" icon="server" >}}
+  {{< card link="cookies" title="Cookies" icon="finger-print" >}}
+  {{< card link="headers" title="Headers" icon="sort-ascending" >}}
+  {{< card link="browser-mode" title="Browser Mode" icon="desktop-computer" >}}
+  {{< card link="output" title="Output File and Format" icon="presentation-chart-bar" >}}
+{{< /cards >}}
diff --git a/content/docs/configuration/browser-mode.md b/content/docs/configuration/browser-mode.md
new file mode 100644
index 0000000..bbb2c1e
--- /dev/null
+++ b/content/docs/configuration/browser-mode.md
@@ -0,0 +1,40 @@
+---
+title: 'Browser Mode'
+weight: 10
+---
+
+Browser Mode controls the interaction with a headless Chromium browser. Enabling it allows `flyscrape` to download a Chromium browser once and use it to render JavaScript-heavy pages.
+
+## Browser Mode
+
+To enable Browser Mode, set the `browser` option to `true` in your configuration. This allows `flyscrape` to use a headless Chromium browser for rendering JavaScript during the scraping process.
+
+```javascript {filename="Configuration"}
+export const config = {
+  browser: true,
+};
+```
+
+In the above example, Browser Mode is enabled, allowing `flyscrape` to render pages that rely on JavaScript execution.
+
+## Headless Option
+
+The `headless` option, used together with Browser Mode, controls whether the Chromium browser runs in headless mode. Headless mode means the browser operates without a graphical user interface, which is useful for background processes.
+
+```javascript {filename="Configuration"}
+export const config = {
+  browser: true,
+  headless: false,
+};
+```
+
+In this example, the Chromium browser will run in non-headless mode. If you set `headless` to `true`, the browser will run without a visible GUI.
+
+```javascript {filename="Configuration"}
+export const config = {
+  browser: true,
+  headless: true,
+};
+```
+
+In this example, the Chromium browser will run in headless mode, suitable for scenarios where graphical rendering is unnecessary.
diff --git a/content/docs/configuration/caching.md b/content/docs/configuration/caching.md
new file mode 100644
index 0000000..2c6766a
--- /dev/null
+++ b/content/docs/configuration/caching.md
@@ -0,0 +1,36 @@
+---
+title: 'Caching'
+weight: 7
+---
+
+The `cache` config option allows you to enable file-based request caching. When enabled, every request is cached together with its raw response. When the cache is populated and you re-run the scraper, requests are served directly from the cache.
+
+This also allows you to modify your scraping script afterwards and collect new results immediately.
+
+```javascript {filename="Configuration"}
+export const config = {
+  url: "http://example.com/",
+  cache: "file",
+  // ...
+};
+```
+
+### Cache File
+
+When caching is enabled using the `cache: "file"` option, a `.cache` file is created with the name of your scraping script.
+
+```bash {filename="Terminal"}
+$ flyscrape run hackernews.js # Will populate: hackernews.cache
+```
+
+### Shared Cache
+
+In case you want to share a cache between different scraping scripts, you can specify where to store the cache file.
+
+```javascript {filename="Configuration"}
+export const config = {
+  url: "http://example.com/",
+  cache: "file:/some/path/shared.cache",
+  // ...
+};
+```
diff --git a/content/docs/configuration/concurrency.md b/content/docs/configuration/concurrency.md
new file mode 100644
index 0000000..0e5e181
--- /dev/null
+++ b/content/docs/configuration/concurrency.md
@@ -0,0 +1,18 @@
+---
+title: 'Concurrency'
+weight: 6
+---
+
+The concurrency setting controls the number of simultaneous requests that the scraper can make. It is specified in the configuration object of your scraping script.
+
+```javascript
+export const config = {
+  // Specify the number of concurrent requests.
+  concurrency: 5,
+};
+```
+
+In the above example, the scraper will make up to 5 requests at the same time.
+
+If the concurrency setting is not specified, there is no limit to the number of concurrent requests.
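+
+If you also need to cap the overall request volume, the `concurrency` option can presumably be combined with the `rate` option described under Rate Limiting. The snippet below is only a sketch and assumes the two options can be set together in one config.
+
+```javascript {filename="Configuration"}
+export const config = {
+  // At most 3 requests in flight at any given time.
+  concurrency: 3,
+  // Assumed to work alongside concurrency: at most 60 requests per minute overall.
+  rate: 60,
+};
+```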
+
diff --git a/content/docs/configuration/cookies.md b/content/docs/configuration/cookies.md
new file mode 100644
index 0000000..f73d495
--- /dev/null
+++ b/content/docs/configuration/cookies.md
@@ -0,0 +1,36 @@
+---
+title: 'Cookies'
+weight: 9
+---
+
+The Cookies configuration in the `flyscrape` script's configuration object allows you to specify the behavior of the cookie store during the scraping process. Cookies are often used for authentication and session management on websites.
+
+## Cookies Configuration
+
+To configure the cookie store behavior, set the `cookies` field in your configuration. The `cookies` option supports three values: `"chrome"`, `"edge"`, and `"firefox"`. Each value corresponds to the cookie store of the respective locally installed browser.
+
+When the `cookies` option is set to `"chrome"`, `"edge"`, or `"firefox"`, `flyscrape` uses the cookie store of the user's installed browser.
+
+```javascript {filename="Configuration"}
+export const config = {
+  cookies: "chrome",
+};
+```
+
+In the above example, the `cookies` option is set to `"chrome"`, indicating that `flyscrape` should use the cookie store of the local Chrome browser.
+
+```javascript {filename="Configuration"}
+export const config = {
+  cookies: "firefox",
+};
+```
+
+In this example, the `cookies` option is set to `"firefox"`, instructing `flyscrape` to use the cookie store of the local Firefox browser.
+
+```javascript {filename="Configuration"}
+export const config = {
+  cookies: "edge",
+};
+```
+
+In this example, the `cookies` option is set to `"edge"`, indicating that `flyscrape` should use the cookie store of the local Edge browser.
diff --git a/content/docs/configuration/depth.md b/content/docs/configuration/depth.md
new file mode 100644
index 0000000..d100470
--- /dev/null
+++ b/content/docs/configuration/depth.md
@@ -0,0 +1,24 @@
+---
+title: 'Depth'
+weight: 2
+---
+
+The `depth` config option allows you to specify how deep the scraping process should follow links from the initial URL.
+
+When no value is provided or `depth` is set to `0`, link following is disabled and only the initial URL is scraped.
+
+```javascript {filename="Configuration"}
+export const config = {
+  url: "http://example.com/",
+  depth: 2,
+  // ...
+};
+```
+
+With the config provided in the example, the scraper would follow links like this:
+
+```
+http://example.com/ (depth = 0, initial URL)
+↳ http://example.com/deeply (depth = 1)
+  ↳ http://example.com/deeply/nested (depth = 2)
+```
diff --git a/content/docs/configuration/domain-filter.md b/content/docs/configuration/domain-filter.md
new file mode 100644
index 0000000..184ee2f
--- /dev/null
+++ b/content/docs/configuration/domain-filter.md
@@ -0,0 +1,43 @@
+---
+title: 'Domain Filter'
+weight: 3
+---
+
+The `allowedDomains` and `blockedDomains` config options allow you to specify a list of domains that are accessible or blocked during scraping.
+
+```javascript {filename="Configuration"}
+export const config = {
+  url: "http://example.com/",
+  allowedDomains: ["subdomain.example.com"],
+  // ...
+};
+```
+
+## Allowed Domains
+
+This config option controls which additional domains are allowed to be visited during scraping. The domain of the initial URL is always allowed.
+
+You can also allow all domains to be accessible by setting `allowedDomains` to `["*"]`. To then further restrict access, you can specify `blockedDomains`.
+
+```javascript {filename="Configuration"}
+export const config = {
+  url: "http://example.com/",
+  allowedDomains: ["*"],
+  // ...
+};
+```
+
+## Blocked Domains
+
+This config option controls which additional domains are blocked from being accessed. By default, all domains other than the domain of the initial URL or those specified in `allowedDomains` are blocked.
+
+`blockedDomains` is best used in conjunction with `allowedDomains: ["*"]`, allowing the scraping process to access all domains except those specified in `blockedDomains`.
+
+```javascript {filename="Configuration"}
+export const config = {
+  url: "http://example.com/",
+  allowedDomains: ["*"],
+  blockedDomains: ["google.com", "bing.com"],
+  // ...
+};
+```
diff --git a/content/docs/configuration/headers.md b/content/docs/configuration/headers.md
new file mode 100644
index 0000000..2b8f82c
--- /dev/null
+++ b/content/docs/configuration/headers.md
@@ -0,0 +1,17 @@
+---
+title: 'Headers'
+weight: 9
+---
+
+The `headers` config option allows you to specify custom HTTP headers that are sent with each request.
+
+```javascript {filename="Configuration"}
+export const config = {
+  headers: {
+    "Authorization": "Bearer ey....",
+    "User-Agent": "Mozilla/5.0 (Macintosh ...",
+  },
+  // ...
+};
+```
+
diff --git a/content/docs/configuration/link-following.md b/content/docs/configuration/link-following.md
new file mode 100644
index 0000000..b9755f7
--- /dev/null
+++ b/content/docs/configuration/link-following.md
@@ -0,0 +1,33 @@
+---
+title: 'Link Following'
+weight: 5
+---
+
+The `follow` config option allows you to specify a list of CSS selectors that determine which links the scraper should follow.
+
+When no value is provided, the scraper will follow all links found with the `a[href]` selector.
+
+```javascript {filename="Configuration"}
+export const config = {
+  url: "http://example.com/",
+  follow: [
+    ".pagination > a[href]",
+    ".nav a[href]",
+  ],
+  // ...
+};
+```
+
+## Following non-`href` attributes
+
+For special cases where the link is not found in the `href` attribute, you can specify a selector that ends in a different attribute.
+
+```javascript {filename="Configuration"}
+export const config = {
+  url: "http://example.com/",
+  follow: [
+    ".articles > div[data-url]",
+  ],
+  // ...
+};
+```
diff --git a/content/docs/configuration/output.md b/content/docs/configuration/output.md
new file mode 100644
index 0000000..2470865
--- /dev/null
+++ b/content/docs/configuration/output.md
@@ -0,0 +1,47 @@
+---
+title: 'Output File and Format'
+weight: 10
+---
+
+The output file and format are specified in the configuration object of your scraping script. They determine where the scraped data will be saved and in what format.
+
+## Output File
+
+The output file is the file where the scraped data will be saved. If not specified, the data will be printed to the standard output (stdout).
+
+```javascript {filename="Configuration"}
+export const config = {
+  output: {
+    // Specify the output file.
+    file: "results.json",
+  },
+};
+```
+
+In the above example, the scraped data will be saved in a file named `results.json`.
+
+## Output Format
+
+The output format is the format in which the scraped data will be saved. The options are `json` and `ndjson`.
+
+```javascript {filename="Configuration"}
+export const config = {
+  output: {
+    // Specify the output format.
+    format: "json",
+  },
+};
+```
+
+In the above example, the scraped data will be saved in JSON format.
+
+```javascript {filename="Configuration"}
+export const config = {
+  output: {
+    // Specify the output format.
+    format: "ndjson",
+  },
+};
+```
+
+In this example, the scraped data will be saved in newline-delimited JSON (NDJSON) format. Each line in the output file will be a separate JSON object.
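+
+The `file` and `format` options can presumably be combined in a single `output` block, for example to write NDJSON records to a specific file. The snippet below is a sketch that assumes the combination works as shown; the file name `results.ndjson` is only an example.
+
+```javascript {filename="Configuration"}
+export const config = {
+  output: {
+    // Assumed combination: write newline-delimited JSON to results.ndjson.
+    file: "results.ndjson",
+    format: "ndjson",
+  },
+};
+```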
diff --git a/content/docs/configuration/proxies.md b/content/docs/configuration/proxies.md
new file mode 100644
index 0000000..913630d
--- /dev/null
+++ b/content/docs/configuration/proxies.md
@@ -0,0 +1,33 @@
+---
+title: 'Proxies'
+weight: 8
+---
+
+The proxy feature allows you to route your scraping requests through a specified HTTP(S) proxy. This can be useful for bypassing IP-based rate limits or accessing region-restricted content.
+
+```javascript
+export const config = {
+  // Specify a single HTTP(S) proxy URL.
+  proxy: "http://someproxy.com:8043",
+};
+```
+
+In the above example, all scraping requests will be routed through the proxy at `http://someproxy.com:8043`.
+
+## Multiple Proxies
+
+You can also specify multiple proxy URLs. The scraper will rotate between these proxies for each request.
+
+```javascript
+export const config = {
+  // Specify multiple HTTP(S) proxy URLs.
+  proxies: [
+    "http://someproxy.com:8043",
+    "http://someotherproxy.com:8043",
+  ],
+};
+```
+
+In this example, the scraper will randomly pick between the proxies at `http://someproxy.com:8043` and `http://someotherproxy.com:8043`.
+
+Note: If both `proxy` and `proxies` are specified, all of the specified proxies will be used.
diff --git a/content/docs/configuration/rate-limiting.md b/content/docs/configuration/rate-limiting.md
new file mode 100644
index 0000000..4b5bf9c
--- /dev/null
+++ b/content/docs/configuration/rate-limiting.md
@@ -0,0 +1,15 @@
+---
+title: 'Rate Limiting'
+weight: 6
+---
+
+The `rate` config option allows you to specify the rate at which the scraper sends out requests. The rate is measured in _Requests per Minute_ (RPM).
+
+When no `rate` is specified, rate limiting is disabled and the scraper will send out requests as fast as it can.
+
+```javascript {filename="Configuration"}
+export const config = {
+  url: "http://example.com/",
+  rate: 100,
+};
+```
diff --git a/content/docs/configuration/retry.md b/content/docs/configuration/retry.md
new file mode 100644
index 0000000..cf00698
--- /dev/null
+++ b/content/docs/configuration/retry.md
@@ -0,0 +1,26 @@
+---
+title: 'Retry'
+weight: 6
+---
+
+The retry feature allows the scraper to automatically retry failed requests. This is particularly useful when dealing with unstable networks or servers that occasionally return error status codes.
+
+The retry feature is automatically enabled and will retry requests that return the following HTTP status codes:
+
+- 403 Forbidden
+- 408 Request Timeout
+- 425 Too Early
+- 429 Too Many Requests
+- 500 Internal Server Error
+- 502 Bad Gateway
+- 503 Service Unavailable
+- 504 Gateway Timeout
+
+### Retry Delays
+
+After a failed request, the scraper will wait for a certain amount of time before retrying the request. The delay increases with each consecutive failed attempt, according to the following schedule:
+
+- 1st retry: 1 second delay
+- 2nd retry: 2 seconds delay
+- 3rd retry: 5 seconds delay
+- 4th retry: 10 seconds delay
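+
+As a rough worked example: a request that fails on every attempt would accumulate about 1 + 2 + 5 + 10 = 18 seconds of delay across these retries (assuming the schedule above is exhaustive and the fourth retry is the final attempt).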
diff --git a/content/docs/configuration/starting-url.md b/content/docs/configuration/starting-url.md
new file mode 100644
index 0000000..6b60d7e
--- /dev/null
+++ b/content/docs/configuration/starting-url.md
@@ -0,0 +1,29 @@
+---
+title: 'Starting URL'
+weight: 1
+prev: '/docs/configuration'
+---
+
+The `url` config option allows you to specify the initial URL at which the scraper should start its scraping process.
+
+```javascript {filename="Configuration"}
+export const config = {
+  url: "http://example.com/",
+  // ...
+};
+```
+
+## Multiple starting URLs
+
+In case you have more than one URL you want to scrape (or start from), you can specify them with the `urls` config option.
+
+```javascript {filename="Configuration"}
+export const config = {
+  urls: [
+    "http://example.com/",
+    "http://anothersite.com/",
+    "http://yetanothersite.com/",
+  ],
+  // ...
+};
+```
diff --git a/content/docs/configuration/url-filter.md b/content/docs/configuration/url-filter.md
new file mode 100644
index 0000000..80d3544
--- /dev/null
+++ b/content/docs/configuration/url-filter.md
@@ -0,0 +1,42 @@
+---
+title: 'URL Filter'
+weight: 4
+prev: /docs/getting-started
+---
+
+The `allowedURLs` and `blockedURLs` config options allow you to specify a list of URL patterns (in the form of regular expressions) that are accessible or blocked during scraping.
+
+```javascript {filename="Configuration"}
+export const config = {
+  url: "http://example.com/",
+  allowedURLs: ["/articles/.*", "/authors/.*"],
+  blockedURLs: ["/authors/admin"],
+  // ...
+};
+```
+
+## Allowed URLs
+
+This config option controls which URLs are allowed to be visited during scraping. When no value is provided, all URLs are allowed to be visited unless otherwise blocked.
+
+When a list of URL patterns is provided, only URLs matching one or more of these patterns are allowed to be visited.
+
+```javascript {filename="Configuration"}
+export const config = {
+  url: "http://example.com/",
+  allowedURLs: ["/products/"],
+};
+```
+
+## Blocked URLs
+
+This config option controls which URLs are blocked from being visited during scraping.
+
+When a list of URL patterns is provided, URLs matching one or more of these patterns are blocked from being visited.
+
+```javascript {filename="Configuration"}
+export const config = {
+  url: "http://example.com/",
+  blockedURLs: ["/restricted"],
+};
+```
--
cgit v1.2.3