summaryrefslogtreecommitdiff
path: root/content/docs/full-example-script.md
diff options
context:
space:
mode:
Diffstat (limited to 'content/docs/full-example-script.md')
-rw-r--r--content/docs/full-example-script.md115
1 files changed, 115 insertions, 0 deletions
diff --git a/content/docs/full-example-script.md b/content/docs/full-example-script.md
new file mode 100644
index 0000000..41cccf5
--- /dev/null
+++ b/content/docs/full-example-script.md
@@ -0,0 +1,115 @@
+---
+title: 'Full Example Script'
+weight: 3
+---
+
+This script serves as a reference that shows all features of Flyscrape and how to use them. Feel free to copy and paste this as a starter script.
+
+```javascript{filename="Reference"}
+import { parse } from "flyscrape";
+import { download } from "flyscrape/http";
+import http from "flyscrape/http";
+
+export const config = {
+ // Specify the URL to start scraping from.
+ url: "https://example.com/",
+
+ // Specify the multiple URLs to start scraping from. (default = [])
+ urls: [
+ "https://anothersite.com/",
+ "https://yetanother.com/",
+ ],
+
+ // Enable rendering with headless browser. (default = false)
+ browser: true,
+
+ // Specify if browser should be headless or not. (default = true)
+ headless: false,
+
+ // Specify how deep links should be followed. (default = 0, no follow)
+ depth: 5,
+
+ // Speficy the css selectors to follow. (default = ["a[href]"])
+ follow: [".next > a", ".related a"],
+
+ // Specify the allowed domains. ['*'] for all. (default = domain from url)
+ allowedDomains: ["example.com", "anothersite.com"],
+
+ // Specify the blocked domains. (default = none)
+ blockedDomains: ["somesite.com"],
+
+ // Specify the allowed URLs as regex. (default = all allowed)
+ allowedURLs: ["/posts", "/articles/\d+"],
+
+ // Specify the blocked URLs as regex. (default = none)
+ blockedURLs: ["/admin"],
+
+ // Specify the rate in requests per minute. (default = no rate limit)
+ rate: 60,
+
+ // Specify the number of concurrent requests. (default = no limit)
+ concurrency: 1,
+
+ // Specify a single HTTP(S) proxy URL. (default = no proxy)
+ // Note: Not compatible with browser mode.
+ proxy: "http://someproxy.com:8043",
+
+ // Specify multiple HTTP(S) proxy URLs. (default = no proxy)
+ // Note: Not compatible with browser mode.
+ proxies: [
+ "http://someproxy.com:8043",
+ "http://someotherproxy.com:8043",
+ ],
+
+ // Enable file-based request caching. (default = no cache)
+ cache: "file",
+
+ // Specify the HTTP request header. (default = none)
+ headers: {
+ "Authorization": "Bearer ...",
+ "User-Agent": "Mozilla ...",
+ },
+
+ // Use the cookie store of your local browser. (default = off)
+ // Options: "chrome" | "edge" | "firefox"
+ cookies: "chrome",
+
+ // Specify the output options.
+ output: {
+ // Specify the output file. (default = stdout)
+ file: "results.json",
+
+ // Specify the output format. (default = json)
+ // Options: "json" | "ndjson"
+ format: "json",
+ },
+};
+
+export default function ({ doc, url, absoluteURL }) {
+ // doc - Contains the parsed HTML document
+ // url - Contains the scraped URL
+ // absoluteURL(...) - Transforms relative URLs into absolute URLs
+
+ // Find all users.
+ const userlist = doc.find(".user")
+
+ // Download the profile picture of each user.
+ userlist.each(user => {
+ const name = user.find(".name").text()
+ const pictureURL = absoluteURL(user.find("img").attr("src"));
+
+ download(pictureURL, `profile-pictures/${name}.jpg`)
+ })
+
+ // Return users name, address and age.
+ return {
+ users: userlist.map(user => {
+ const name = user.find(".name").text()
+ const address = user.find(".address").text()
+ const age = user.find(".age").text()
+
+ return { name, address, age };
+ })
+ };
+}
+```