| author | rafiramadhana <rf.ramadhana@gmail.com> | 2023-11-16 05:36:22 +0700 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-11-15 23:48:30 +0100 |
| commit | 3e01902887bdc52e743ef6cec53a5c89cb5637f0 | |
| tree | 607f77cf8b9b00ab73e1003331eebefae7abc9ae | |
| parent | beadfd1db3d2398b9b1e66d60779a7b2649af044 | |
Update documentation
| -rw-r--r-- | README.md | 24 |
| -rw-r--r-- | examples/multiple_starting_urls.js | 25 |
2 files changed, 39 insertions, 10 deletions
````diff
@@ -118,16 +118,20 @@ Below is an example scraping script that showcases the capabilities of flyscrape
 
 ```javascript
 export const config = {
-    url: "https://example.com/",   // Specify the URL to start scraping from.
-    depth: 0,                      // Specify how deep links should be followed.  (default = 0, no follow)
-    follow: [],                    // Specify the CSS selectors to follow.        (default = ["a[href]"])
-    allowedDomains: [],            // Specify the allowed domains. ['*'] for all. (default = domain from url)
-    blockedDomains: [],            // Specify the blocked domains.                (default = none)
-    allowedURLs: [],               // Specify the allowed URLs as regex.          (default = all allowed)
-    blockedURLs: [],               // Specify the blocked URLs as regex.          (default = none)
-    rate: 100,                     // Specify the rate in requests per second.    (default = no rate limit)
-    proxies: [],                   // Specify the HTTP(S) proxy URLs.             (default = no proxy)
-    cache: "file",                 // Enable file-based request caching.          (default = no cache)
+    url: "https://example.com/",   // Specify the URL to start scraping from.
+    urls: [                        // Specify the URL(s) to start scraping from. If both .url and .urls
+        "https://example.com/foo", //   are provided, all of the specified URLs will be scraped.
+        "https://example.com/foo",
+    ],
+    depth: 0,                      // Specify how deep links should be followed.  (default = 0, no follow)
+    follow: [],                    // Specify the CSS selectors to follow.        (default = ["a[href]"])
+    allowedDomains: [],            // Specify the allowed domains. ['*'] for all. (default = domain from url)
+    blockedDomains: [],            // Specify the blocked domains.                (default = none)
+    allowedURLs: [],               // Specify the allowed URLs as regex.          (default = all allowed)
+    blockedURLs: [],               // Specify the blocked URLs as regex.          (default = none)
+    rate: 100,                     // Specify the rate in requests per second.    (default = no rate limit)
+    proxies: [],                   // Specify the HTTP(S) proxy URLs.             (default = no proxy)
+    cache: "file",                 // Enable file-based request caching.          (default = no cache)
 };
 
 export function setup() {
diff --git a/examples/multiple_starting_urls.js b/examples/multiple_starting_urls.js
new file mode 100644
index 0000000..5cb7ac9
--- /dev/null
+++ b/examples/multiple_starting_urls.js
@@ -0,0 +1,25 @@
+export const config = {
+    urls: [
+        "https://news.ycombinator.com/show",
+        "https://news.ycombinator.com/ask",
+    ],
+};
+
+export default function ({ doc, absoluteURL }) {
+    const posts = doc.find(".athing");
+
+    return {
+        posts: posts.map((post) => {
+            const link = post.find(".titleline > a");
+            const meta = post.next();
+
+            return {
+                url: absoluteURL(link.attr("href")),
+                user: meta.find(".hnuser").text(),
+                title: link.text(),
+                points: meta.find(".score").text().replace(" points", ""),
+                created: meta.find(".age").attr("title"),
+            };
+        }),
+    };
+}
````
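For quick reference outside the diff, here is a minimal standalone sketch of the new `urls` option as documented above. The file name and the second starting URL are hypothetical, and the script only uses APIs that appear in this commit's own example (`doc.find`, `.map`, `.attr`, and `absoluteURL`); treat it as an illustration of the documented config, not part of the commit.

```javascript
// multiple_urls_sketch.js (hypothetical file name)
// A minimal sketch of the `urls` option documented in this commit:
// every URL listed is used as a starting point, and the default
// export runs once for each scraped page.
export const config = {
    urls: [
        "https://example.com/foo",
        "https://example.com/bar", // hypothetical second starting URL
    ],
};

export default function ({ doc, absoluteURL }) {
    // Collect all links on the page, resolved to absolute URLs.
    return {
        links: doc.find("a[href]").map((a) => absoluteURL(a.attr("href"))),
    };
}
```

Assuming flyscrape's usual CLI, this would run as `flyscrape run multiple_urls_sketch.js`, emitting one JSON record per starting URL.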