From b1e2c8fd5cb5dfa46bc440a12eafaf56cd844b1c Mon Sep 17 00:00:00 2001
From: Philipp Tanlak
Date: Mon, 24 Nov 2025 20:54:57 +0100
Subject: Docs

---
 public/docs/reference-script/index.html | 645 ++++++++++++++++++++++++++++++++
 1 file changed, 645 insertions(+)
 create mode 100644 public/docs/reference-script/index.html

diff --git a/public/docs/reference-script/index.html b/public/docs/reference-script/index.html
new file mode 100644
index 0000000..797dd0c
--- /dev/null
+++ b/public/docs/reference-script/index.html
@@ -0,0 +1,645 @@
+Reference Script – Flyscrape
+
+Reference Script
+
+This script serves as a reference that shows all the features of Flyscrape and how to use them. Feel free to copy and paste it as a starter script; a minimal version is sketched just below.
+
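+For orientation, here is a minimal sketch that uses only the pieces a script strictly needs: a start URL and a default export. The "title" selector is merely a placeholder; everything else is taken from the full reference below.
+
+export const config = {
+    url: "https://example.com/",
+};
+
+export default function ({ doc }) {
+    // Extract the page title as a simple starting point.
+    return { title: doc.find("title").text() };
+}
+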
+Reference
+
+import { parse } from "flyscrape";
+import { download } from "flyscrape/http";
+import http from "flyscrape/http";
+
+export const config = {
+    // Specify the URL to start scraping from.
+    url: "https://example.com/",
+
+    // Specify multiple URLs to start scraping from.       (default = [])
+    urls: [                          
+        "https://anothersite.com/",
+        "https://yetanother.com/",
+    ],
+
+    // Specify how deep links should be followed.          (default = 0, no follow)
+    depth: 5,                        
+
+    // Specify the CSS selectors to follow.                (default = ["a[href]"])
+    follow: [".next > a", ".related a"],                      
+ 
+    // Specify the allowed domains. ['*'] for all.         (default = domain from url)
+    allowedDomains: ["example.com", "anothersite.com"],              
+ 
+    // Specify the blocked domains.                        (default = none)
+    blockedDomains: ["somesite.com"],              
+
+    // Specify the allowed URLs as regex.                  (default = all allowed)
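+    // Note: backslashes must be escaped in string literals, hence "\\d".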
+    allowedURLs: ["/posts", "/articles/\\d+"],
+ 
+    // Specify the blocked URLs as regex.                  (default = none)
+    blockedURLs: ["/admin"],                 
+   
+    // Specify the rate in requests per minute.            (default = no rate limit)
+    rate: 60,                       
+
+    // Specify the number of concurrent requests.          (default = no limit)
+    concurrency: 1,                       
+
+    // Specify a single HTTP(S) proxy URL.                 (default = no proxy)
+    proxy: "http://someproxy.com:8043",
+
+    // Specify multiple HTTP(S) proxy URLs.                (default = no proxy)
+    proxies: [
+      "http://someproxy.com:8043",
+      "http://someotherproxy.com:8043",
+    ],                     
+
+    // Enable file-based request caching.                  (default = no cache)
+    cache: "file",                   
+
+    // Specify the HTTP request headers.                   (default = none)
+    headers: {                       
+        "Authorization": "Bearer ...",
+        "User-Agent": "Mozilla ...",
+    },
+
+    // Specify the output options.
+    output: {
+        // Specify the output file.                        (default = stdout)
+        file: "results.json",
+        
+        // Specify the output format.                      (default = json)
+        // Options: "json" | "ndjson"
+        format: "json",
+    },
+};
+
+// Optional setup function, called before scraping starts.
+export function setup() {
+    // Fetch login form.
+    const { body } = http.get("http://example.com/login");
+
+    // Extract csrf token from form.
+    const csrf = parse(body).find("input[name=csrf]").attr("value");
+
+    // Submit login form.
+    http.postForm("http://example.com/login", {
+      "username": "jondoe",
+      "password": "supersecret",
+      "csrf": csrf,
+    });
+}
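+
+// Note: this login is only useful if the session established here (e.g.
+// via cookies) carries over into the subsequent scraping requests, which
+// is what this example relies on.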
+
+export default function ({ doc, url, absoluteURL }) {
+  // doc              - Contains the parsed HTML document
+  // url              - Contains the scraped URL
+  // absoluteURL(...) - Transforms relative URLs into absolute URLs
+
+  // Find all users.
+  const userlist = doc.find(".user");
+
+  // Download the profile picture of each user.
+  userlist.each(user => {
+    const name = user.find(".name").text();
+    const pictureURL = absoluteURL(user.find("img").attr("src"));
+
+    download(pictureURL, `profile-pictures/${name}.jpg`);
+  });
+
+  // Return each user's name, address, and age.
+  return {
+    users: userlist.map(user => {
+      const name = user.find(".name").text();
+      const address = user.find(".address").text();
+      const age = user.find(".age").text();
+
+      return { name, address, age };
+    })
+  };
+}
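+
+Once saved (for example as reference.js; the file name is only an example), the script can be executed with the Flyscrape CLI, e.g. flyscrape run reference.js. Per the defaults noted above, results are written to stdout as JSON unless output.file is set.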