diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2024-02-07 23:20:55 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-02-07 23:20:55 +0100 |
| commit | 0d6494d164cc490d62473eae0fbd79d5573bb380 (patch) | |
| tree | 7a4586e89920b6abd4f6c7724f42634d66cf5f68 /template.js | |
| parent | 60139e7de275473332b560b4139a6a01c3da184c (diff) | |
Add retry module and change rate to requests per minute (#37) v0.7.0
Diffstat (limited to 'template.js')
| -rw-r--r-- | template.js | 74 |
1 files changed, 56 insertions, 18 deletions
diff --git a/template.js b/template.js index a7a510f..17fe6ba 100644 --- a/template.js +++ b/template.js @@ -1,27 +1,65 @@ export const config = { - url: "https://news.ycombinator.com/", // Specify the URL to start scraping from. - // depth: 0, // Specify how deep links should be followed. (default = 0, no follow) - // allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url) - // blockedDomains: [], // Specify the blocked domains. (default = none) - // allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed) - // blockedURLs: [], // Specify the blocked URLs as regex. (default = non blocked) - // rate: 100, // Specify the rate in requests per second. (default = 100) - // cache: "file", // Enable file-based request caching. (default = no cache) + // Specify the URL to start scraping from. + url: "https://example.com/", + + // Specify the multiple URLs to start scraping from. (default = []) + // urls: [ + // "https://anothersite.com/", + // "https://yetanother.com/", + // ], + + // Specify how deep links should be followed. (default = 0, no follow) + // depth: 5, + + // Specify the css selectors to follow. (default = ["a[href]"]) + // follow: [".next > a", ".related a"], + + // Specify the allowed domains. ['*'] for all. (default = domain from url) + // allowedDomains: ["example.com", "anothersite.com"], + + // Specify the blocked domains. (default = none) + // blockedDomains: ["somesite.com"], + + // Specify the allowed URLs as regex. (default = all allowed) + // allowedURLs: ["/posts", "/articles/\d+"], + + // Specify the blocked URLs as regex. (default = none) + // blockedURLs: ["/admin"], + + // Specify the rate in requests per minute. (default = no rate limit) + // rate: 60, + + // Specify the number of concurrent requests. (default = no limit) + // concurrency: 1, + + // Specify a single HTTP(S) proxy URL. 
(default = no proxy) + // proxy: "http://someproxy.com:8043", + + // Specify multiple HTTP(S) proxy URLs. (default = no proxy) + // proxies: [ + // "http://someproxy.com:8043", + // "http://someotherproxy.com:8043", + // ], + + // Enable file-based request caching. (default = no cache) + // cache: "file", + + // Specify the HTTP request header. (default = none) + // headers: { + // "Authorization": "Bearer ...", + // "User-Agent": "Mozilla ...", + // }, }; export default function({ doc, absoluteURL }) { - const title = doc.find("title"); - const posts = doc.find(".athing"); + const title = doc.find("h1"); + const link = doc.find("a"); return { title: title.text(), - posts: posts.map((post) => { - const link = post.find(".titleline > a"); - - return { - title: link.text(), - url: absoluteURL(link.attr("href")), - }; - }), + link: { + text: link.text(), + url: absoluteURL(link.attr("href")), + }, }; } |