From 297835a9b2c1673ec62932271160bc036d95832c Mon Sep 17 00:00:00 2001
From: Philipp Tanlak
Date: Mon, 24 Nov 2025 20:58:57 +0100
Subject: public -> docs
---
 public/docs/reference-script/index.html | 645 --------------------------------
 1 file changed, 645 deletions(-)
 delete mode 100644 public/docs/reference-script/index.html

diff --git a/public/docs/reference-script/index.html b/public/docs/reference-script/index.html
deleted file mode 100644
index 797dd0c..0000000
--- a/public/docs/reference-script/index.html
+++ /dev/null
Reference Script

This script serves as a reference that shows all the features of Flyscrape and how to use them. Feel free to copy and paste it as a starter script.

import { parse } from "flyscrape";
import { download } from "flyscrape/http";
import http from "flyscrape/http";

export const config = {
    // Specify the URL to start scraping from.
    url: "https://example.com/",

    // Specify multiple URLs to start scraping from.       (default = [])
    urls: [
        "https://anothersite.com/",
        "https://yetanother.com/",
    ],

    // Specify how deep links should be followed.          (default = 0, no follow)
    depth: 5,

    // Specify the CSS selectors to follow.                (default = ["a[href]"])
    follow: [".next > a", ".related a"],

    // Specify the allowed domains. ['*'] for all.         (default = domain from url)
    allowedDomains: ["example.com", "anothersite.com"],

    // Specify the blocked domains.                        (default = none)
    blockedDomains: ["somesite.com"],

    // Specify the allowed URLs as regex.                  (default = all allowed)
    allowedURLs: ["/posts", "/articles/\\d+"],

    // Specify the blocked URLs as regex.                  (default = none)
    blockedURLs: ["/admin"],

    // Specify the rate in requests per minute.            (default = no rate limit)
    rate: 60,

    // Specify the number of concurrent requests.          (default = no limit)
    concurrency: 1,

    // Specify a single HTTP(S) proxy URL.                 (default = no proxy)
    proxy: "http://someproxy.com:8043",

    // Specify multiple HTTP(S) proxy URLs.                (default = no proxy)
    proxies: [
        "http://someproxy.com:8043",
        "http://someotherproxy.com:8043",
    ],

    // Enable file-based request caching.                  (default = no cache)
    cache: "file",

    // Specify the HTTP request headers.                   (default = none)
    headers: {
        "Authorization": "Bearer ...",
        "User-Agent": "Mozilla ...",
    },

    // Specify the output options.
    output: {
        // Specify the output file.                        (default = stdout)
        file: "results.json",

        // Specify the output format.                      (default = json)
        // Options: "json" | "ndjson"
        format: "json",
    },
};
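
// Note: only `url` (or `urls`) is needed to get started; every other option
// above is optional. As a sketch (not part of the original page), a minimal
// config can be as small as:
//
//     export const config = { url: "https://example.com/" };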

// Optional setup function, called before scraping starts.
export function setup() {
    // Fetch login form.
    const { body } = http.get("http://example.com/login");

    // Extract csrf token from form.
    const csrf = parse(body).find("input[name=csrf]").attr("value");

    // Submit login form.
    http.postForm("http://example.com/login", {
        "username": "jondoe",
        "password": "supersecret",
        "csrf": csrf,
    });
}
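
// setup() is the place for one-time work such as the login above. For
// JSON-based logins, the http module also offers a postJSON helper; the
// endpoint below is made up for illustration (a sketch, not from the
// original page):
//
//     http.postJSON("http://example.com/api/login", { username: "jondoe" });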

export default function ({ doc, url, absoluteURL }) {
    // doc              - Contains the parsed HTML document
    // url              - Contains the scraped URL
    // absoluteURL(...) - Transforms relative URLs into absolute URLs

    // Find all users.
    const userlist = doc.find(".user");

    // Download the profile picture of each user.
    userlist.each(user => {
        const name = user.find(".name").text();
        const pictureURL = absoluteURL(user.find("img").attr("src"));

        download(pictureURL, `profile-pictures/${name}.jpg`);
    });

    // Return each user's name, address and age.
    return {
        users: userlist.map(user => {
            const name = user.find(".name").text();
            const address = user.find(".address").text();
            const age = user.find(".age").text();

            return { name, address, age };
        }),
    };
}
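
Running the script

Save the script to a file and point the Flyscrape CLI at it. The commands
below assume the usual subcommands (new, dev, run); the file name is just an
example:

    $ flyscrape new reference.js   # scaffold a fresh script to edit
    $ flyscrape dev reference.js   # preview the extracted data while editing
    $ flyscrape run reference.js   # run the full scrape

With the output options above, the results are written to results.json. Each
scraped page yields one record with the page URL and the data returned from
the scrape function, so the output shape here is roughly (assumed from the
return value above):

    [{ "url": "https://example.com/", "data": { "users": [...] } }]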