From 0d6494d164cc490d62473eae0fbd79d5573bb380 Mon Sep 17 00:00:00 2001 From: Philipp Tanlak Date: Wed, 7 Feb 2024 23:20:55 +0100 Subject: Add retry module and change rate to requests per minute (#37) --- template.js | 74 ++++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 18 deletions(-) (limited to 'template.js') diff --git a/template.js b/template.js index a7a510f..17fe6ba 100644 --- a/template.js +++ b/template.js @@ -1,27 +1,65 @@ export const config = { - url: "https://news.ycombinator.com/", // Specify the URL to start scraping from. - // depth: 0, // Specify how deep links should be followed. (default = 0, no follow) - // allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url) - // blockedDomains: [], // Specify the blocked domains. (default = none) - // allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed) - // blockedURLs: [], // Specify the blocked URLs as regex. (default = non blocked) - // rate: 100, // Specify the rate in requests per second. (default = 100) - // cache: "file", // Enable file-based request caching. (default = no cache) + // Specify the URL to start scraping from. + url: "https://example.com/", + + // Specify multiple URLs to start scraping from. (default = []) + // urls: [ + // "https://anothersite.com/", + // "https://yetanother.com/", + // ], + + // Specify how deep links should be followed. (default = 0, no follow) + // depth: 5, + + // Specify the CSS selectors to follow. (default = ["a[href]"]) + // follow: [".next > a", ".related a"], + + // Specify the allowed domains. ['*'] for all. (default = domain from url) + // allowedDomains: ["example.com", "anothersite.com"], + + // Specify the blocked domains. (default = none) + // blockedDomains: ["somesite.com"], + + // Specify the allowed URLs as regex. (default = all allowed) + // allowedURLs: ["/posts", "/articles/\\d+"], + + // Specify the blocked URLs as regex. 
(default = none) + // blockedURLs: ["/admin"], + + // Specify the rate in requests per minute. (default = no rate limit) + // rate: 60, + + // Specify the number of concurrent requests. (default = no limit) + // concurrency: 1, + + // Specify a single HTTP(S) proxy URL. (default = no proxy) + // proxy: "http://someproxy.com:8043", + + // Specify multiple HTTP(S) proxy URLs. (default = no proxy) + // proxies: [ + // "http://someproxy.com:8043", + // "http://someotherproxy.com:8043", + // ], + + // Enable file-based request caching. (default = no cache) + // cache: "file", + + // Specify the HTTP request headers. (default = none) + // headers: { + // "Authorization": "Bearer ...", + // "User-Agent": "Mozilla ...", + // }, }; export default function({ doc, absoluteURL }) { - const title = doc.find("title"); - const posts = doc.find(".athing"); + const title = doc.find("h1"); + const link = doc.find("a"); return { title: title.text(), - posts: posts.map((post) => { - const link = post.find(".titleline > a"); - - return { - title: link.text(), - url: absoluteURL(link.attr("href")), - }; - }), + link: { + text: link.text(), + url: absoluteURL(link.attr("href")), + }, }; } -- cgit v1.2.3