 README.md                          | 24 ++++++++++++++----------
 examples/multiple_starting_urls.js | 25 +++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 10 deletions(-)
diff --git a/README.md b/README.md
index 0f4f356..4d46bbd 100644
--- a/README.md
+++ b/README.md
@@ -118,16 +118,20 @@ Below is an example scraping script that showcases the capabilities of flyscrape
```javascript
export const config = {
- url: "https://example.com/", // Specify the URL to start scraping from.
- depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
- follow: [], // Speficy the css selectors to follow (default = ["a[href]"])
- allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
- blockedDomains: [], // Specify the blocked domains. (default = none)
- allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
- blockedURLs: [], // Specify the blocked URLs as regex. (default = none)
- rate: 100, // Specify the rate in requests per second. (default = no rate limit)
- proxies: [], // Specify the HTTP(S) proxy URLs. (default = no proxy)
- cache: "file", // Enable file-based request caching. (default = no cache)
+ url: "https://example.com/", // Specify the URL to start scraping from.
+    urls: [                            // Specify the URLs to start scraping from. If both .url and .urls
+        "https://example.com/foo",     // are provided, all of the specified URLs will be scraped.
+        "https://example.com/bar",
+    ],
+ depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
+    follow: [],                        // Specify the CSS selectors to follow. (default = ["a[href]"])
+ allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
+ blockedDomains: [], // Specify the blocked domains. (default = none)
+ allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
+ blockedURLs: [], // Specify the blocked URLs as regex. (default = none)
+ rate: 100, // Specify the rate in requests per second. (default = no rate limit)
+ proxies: [], // Specify the HTTP(S) proxy URLs. (default = no proxy)
+ cache: "file", // Enable file-based request caching. (default = no cache)
};
export function setup() {
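
The comment above states that when both `.url` and `.urls` are set, all listed pages are scraped. Below is a minimal sketch of that combination (the paths are hypothetical, and it assumes, as in flyscrape's other examples, that the scrape function receives the current page's `url`):

```javascript
// Sketch only: combining .url and .urls. Per the README comment above,
// all three starting pages below should be scraped.
export const config = {
    url: "https://example.com/",    // single starting URL
    urls: [                         // additional starting URLs (hypothetical paths)
        "https://example.com/foo",
        "https://example.com/bar",
    ],
};

// The default export runs once per scraped page; `url` is assumed here
// to be the address of the page currently being processed.
export default function ({ url }) {
    return { page: url };
}
```
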
diff --git a/examples/multiple_starting_urls.js b/examples/multiple_starting_urls.js
new file mode 100644
index 0000000..5cb7ac9
--- /dev/null
+++ b/examples/multiple_starting_urls.js
@@ -0,0 +1,25 @@
+export const config = {
+ urls: [
+ "https://news.ycombinator.com/show",
+ "https://news.ycombinator.com/ask",
+ ],
+};
+
+export default function({ doc, absoluteURL }) {
+ const posts = doc.find(".athing");
+
+ return {
+ posts: posts.map((post) => {
+ const link = post.find(".titleline > a");
+ const meta = post.next();
+
+ return {
+ url: absoluteURL(link.attr("href")),
+ user: meta.find(".hnuser").text(),
+ title: link.text(),
+ points: meta.find(".score").text().replace(" points", ""),
+ created: meta.find(".age").attr("title"),
+ };
+ }),
+ };
+}
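
Assuming the standard flyscrape CLI, the new example should run with `flyscrape run examples/multiple_starting_urls.js`, emitting one JSON result per starting URL.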