author    Philipp Tanlak <philipp.tanlak@gmail.com>  2023-10-12 18:57:21 +0200
committer Philipp Tanlak <philipp.tanlak@gmail.com>  2023-10-12 18:57:21 +0200
commit    dfbacde1fdb95452233308731c0670abf3ac94bf (patch)
tree      2c8270d94f2b70a1cf44b3398f8e33d93bf87cb3 /README.md
parent    fb84ca746e92e371161f1e1de3b01a048a9ae979 (diff)
Replace v8go with goja
Diffstat (limited to 'README.md')
-rw-r--r--  README.md  61
1 file changed, 25 insertions(+), 36 deletions(-)
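For context: v8go binds the V8 engine into Go through CGO, whereas goja is an ECMAScript interpreter written in pure Go, which removes the native dependency from the build. A minimal sketch of what evaluating a script with goja looks like (illustrative only, not code from this commit):

```go
package main

import (
	"fmt"

	"github.com/dop251/goja"
)

func main() {
	vm := goja.New() // a fresh, pure-Go JavaScript runtime

	// Evaluate a script; the value of the last expression is returned.
	v, err := vm.RunString(`
		const greet = (name) => "hello, " + name;
		greet("flyscrape");
	`)
	if err != nil {
		panic(err)
	}
	fmt.Println(v.Export()) // hello, flyscrape
}
```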
diff --git a/README.md b/README.md
index 3021c7f..ef8182a 100644
--- a/README.md
+++ b/README.md
@@ -57,43 +57,32 @@ flyscrape dev example.js
Below is an example scraping script that showcases the capabilities of **flyscrape**:
```javascript
-import { parse } from 'flyscrape';
-
export const config = {
- url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from.
- depth: 1, // Specify how deep links should be followed. (default = 0, no follow)
- allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
- blockedDomains: [], // Specify the blocked domains. (default = none)
- allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
- blockedURLs: [], // Specify the blocked URLs as regex. (default = non blocked)
- proxy: '', // Specify the HTTP(S) proxy to use. (default = no proxy)
- rate: 100, // Specify the rate in requests per second. (default = 100)
-}
-
-export default function({ html, url }) {
- const $ = parse(html);
- const title = $('title');
- const entries = $('.athing').toArray();
-
- if (!entries.length) {
- return null; // Omits scraped pages without entries.
- }
-
- return {
- title: title.text(), // Extract the page title.
- entries: entries.map(entry => { // Extract all news entries.
- const link = $(entry).find('.titleline > a');
- const rank = $(entry).find('.rank');
- const points = $(entry).next().find('.score');
-
- return {
- title: link.text(), // Extract the title text.
- url: link.attr('href'), // Extract the link href.
- rank: parseInt(rank.text().slice(0, -1)), // Extract and cleanup the rank.
- points: parseInt(points.text().replace(' points', '')), // Extract and cleanup the points.
- }
- }),
- };
+ url: "https://news.ycombinator.com/", // Specify the URL to start scraping from.
+ // depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
+ // allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
+ // blockedDomains: [], // Specify the blocked domains. (default = none)
+ // allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
+ // blockedURLs: [], // Specify the blocked URLs as regex. (default = none)
+ // rate: 100, // Specify the rate in requests per second. (default = 100)
+ // cache: "file", // Enable file-based request caching. (default = no cache)
+};
+
+export default function({ doc, absoluteURL }) {
+ const title = doc.find("title");
+ const posts = doc.find(".athing");
+
+ return {
+ title: title.text(),
+ posts: posts.map((post) => {
+ const link = post.find(".titleline > a");
+
+ return {
+ title: link.text(),
+ url: absoluteURL(link.attr("href")),
+ };
+ }),
+ };
}
```
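The `{ doc, absoluteURL }` parameters in the new example are helpers the scraper injects into the script's scope. With goja, Go functions can be handed to the runtime directly via `Runtime.Set`; below is a hypothetical sketch of wiring up an `absoluteURL`-style helper (names and behavior are assumptions for illustration, not flyscrape's actual source):

```go
package main

import (
	"fmt"
	"net/url"

	"github.com/dop251/goja"
)

func main() {
	// The page URL the script is currently scraping (assumed example).
	base, _ := url.Parse("https://news.ycombinator.com/")

	vm := goja.New()

	// Expose a Go function to the script. goja wraps it automatically,
	// so the script can call absoluteURL("item?id=1") like a JS function.
	vm.Set("absoluteURL", func(href string) string {
		ref, err := url.Parse(href)
		if err != nil {
			return href // leave unparsable hrefs untouched
		}
		return base.ResolveReference(ref).String()
	})

	v, err := vm.RunString(`absoluteURL("item?id=1")`)
	if err != nil {
		panic(err)
	}
	fmt.Println(v.Export()) // https://news.ycombinator.com/item?id=1
}
```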