summaryrefslogtreecommitdiff
path: root/template.js
diff options
context:
space:
mode:
authorPhilipp Tanlak <philipp.tanlak@gmail.com>2024-02-07 23:20:55 +0100
committerGitHub <noreply@github.com>2024-02-07 23:20:55 +0100
commit0d6494d164cc490d62473eae0fbd79d5573bb380 (patch)
tree7a4586e89920b6abd4f6c7724f42634d66cf5f68 /template.js
parent60139e7de275473332b560b4139a6a01c3da184c (diff)
Add retry module and change rate to requests per minute (#37) [tag: v0.7.0]
Diffstat (limited to 'template.js')
-rw-r--r--template.js74
1 files changed, 56 insertions, 18 deletions
diff --git a/template.js b/template.js
index a7a510f..17fe6ba 100644
--- a/template.js
+++ b/template.js
@@ -1,27 +1,65 @@
export const config = {
- url: "https://news.ycombinator.com/", // Specify the URL to start scraping from.
- // depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
- // allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
- // blockedDomains: [], // Specify the blocked domains. (default = none)
- // allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
- // blockedURLs: [], // Specify the blocked URLs as regex. (default = non blocked)
- // rate: 100, // Specify the rate in requests per second. (default = 100)
- // cache: "file", // Enable file-based request caching. (default = no cache)
+ // Specify the URL to start scraping from.
+ url: "https://example.com/",
+
+ // Specify multiple URLs to start scraping from. (default = [])
+ // urls: [
+ // "https://anothersite.com/",
+ // "https://yetanother.com/",
+ // ],
+
+ // Specify how deep links should be followed. (default = 0, no follow)
+ // depth: 5,
+
+ // Specify the CSS selectors to follow. (default = ["a[href]"])
+ // follow: [".next > a", ".related a"],
+
+ // Specify the allowed domains. ['*'] for all. (default = domain from url)
+ // allowedDomains: ["example.com", "anothersite.com"],
+
+ // Specify the blocked domains. (default = none)
+ // blockedDomains: ["somesite.com"],
+
+ // Specify the allowed URLs as regex. (default = all allowed)
+ // allowedURLs: ["/posts", "/articles/\\d+"],
+
+ // Specify the blocked URLs as regex. (default = none)
+ // blockedURLs: ["/admin"],
+
+ // Specify the rate in requests per minute. (default = no rate limit)
+ // rate: 60,
+
+ // Specify the number of concurrent requests. (default = no limit)
+ // concurrency: 1,
+
+ // Specify a single HTTP(S) proxy URL. (default = no proxy)
+ // proxy: "http://someproxy.com:8043",
+
+ // Specify multiple HTTP(S) proxy URLs. (default = no proxy)
+ // proxies: [
+ // "http://someproxy.com:8043",
+ // "http://someotherproxy.com:8043",
+ // ],
+
+ // Enable file-based request caching. (default = no cache)
+ // cache: "file",
+
+ // Specify the HTTP request headers. (default = none)
+ // headers: {
+ // "Authorization": "Bearer ...",
+ // "User-Agent": "Mozilla ...",
+ // },
};
export default function({ doc, absoluteURL }) {
- const title = doc.find("title");
- const posts = doc.find(".athing");
+ const title = doc.find("h1");
+ const link = doc.find("a");
return {
title: title.text(),
- posts: posts.map((post) => {
- const link = post.find(".titleline > a");
-
- return {
- title: link.text(),
- url: absoluteURL(link.attr("href")),
- };
- }),
+ link: {
+ text: link.text(),
+ url: absoluteURL(link.attr("href")),
+ },
};
}