From 3e01902887bdc52e743ef6cec53a5c89cb5637f0 Mon Sep 17 00:00:00 2001 From: rafiramadhana Date: Thu, 16 Nov 2023 05:36:22 +0700 Subject: Update documentation --- README.md | 24 ++++++++++++++---------- examples/multiple_starting_urls.js | 25 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 10 deletions(-) create mode 100644 examples/multiple_starting_urls.js diff --git a/README.md b/README.md index 0f4f356..4d46bbd 100644 --- a/README.md +++ b/README.md @@ -118,16 +118,20 @@ Below is an example scraping script that showcases the capabilities of flyscrape ```javascript export const config = { - url: "https://example.com/", // Specify the URL to start scraping from. - depth: 0, // Specify how deep links should be followed. (default = 0, no follow) - follow: [], // Speficy the css selectors to follow (default = ["a[href]"]) - allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url) - blockedDomains: [], // Specify the blocked domains. (default = none) - allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed) - blockedURLs: [], // Specify the blocked URLs as regex. (default = none) - rate: 100, // Specify the rate in requests per second. (default = no rate limit) - proxies: [], // Specify the HTTP(S) proxy URLs. (default = no proxy) - cache: "file", // Enable file-based request caching. (default = no cache) + url: "https://example.com/", // Specify the URL to start scraping from. + urls: [ // Specify the URL(s) to start scraping from. If both .url and .urls + "https://example.com/foo", // are provided, all of the specified URLs will be scraped. + "https://example.com/bar", + ], + depth: 0, // Specify how deep links should be followed. (default = 0, no follow) + follow: [], // Specify the css selectors to follow (default = ["a[href]"]) + allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url) + blockedDomains: [], // Specify the blocked domains. 
(default = none) + allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed) + blockedURLs: [], // Specify the blocked URLs as regex. (default = none) + rate: 100, // Specify the rate in requests per second. (default = no rate limit) + proxies: [], // Specify the HTTP(S) proxy URLs. (default = no proxy) + cache: "file", // Enable file-based request caching. (default = no cache) }; export function setup() { diff --git a/examples/multiple_starting_urls.js b/examples/multiple_starting_urls.js new file mode 100644 index 0000000..5cb7ac9 --- /dev/null +++ b/examples/multiple_starting_urls.js @@ -0,0 +1,25 @@ +export const config = { + urls: [ + "https://news.ycombinator.com/show", + "https://news.ycombinator.com/ask", + ], +}; + +export default function({ doc, absoluteURL }) { + const posts = doc.find(".athing"); + + return { + posts: posts.map((post) => { + const link = post.find(".titleline > a"); + const meta = post.next(); + + return { + url: absoluteURL(link.attr("href")), + user: meta.find(".hnuser").text(), + title: link.text(), + points: meta.find(".score").text().replace(" points", ""), + created: meta.find(".age").attr("title"), + }; + }), + }; +} -- cgit v1.2.3