author    Philipp Tanlak <philipp.tanlak@gmail.com>  2023-10-12 18:57:21 +0200
committer Philipp Tanlak <philipp.tanlak@gmail.com>  2023-10-12 18:57:21 +0200
commit    dfbacde1fdb95452233308731c0670abf3ac94bf (patch)
tree      2c8270d94f2b70a1cf44b3398f8e33d93bf87cb3 /README.md
parent    fb84ca746e92e371161f1e1de3b01a048a9ae979 (diff)
Replace v8go with goja
Diffstat (limited to 'README.md')
-rw-r--r--  README.md  61
1 file changed, 25 insertions(+), 36 deletions(-)
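For context: v8go binds the V8 engine into Go through CGO, whereas goja is an ECMAScript interpreter written in pure Go, which removes the native dependency from the build. A minimal sketch of what evaluating a script with goja looks like (illustrative only, not code from this commit):

```go
package main

import (
	"fmt"

	"github.com/dop251/goja"
)

func main() {
	vm := goja.New() // a fresh, pure-Go JavaScript runtime

	// Evaluate a script; the value of the last expression is returned.
	v, err := vm.RunString(`
		const greet = (name) => "hello, " + name;
		greet("flyscrape");
	`)
	if err != nil {
		panic(err)
	}
	fmt.Println(v.Export()) // hello, flyscrape
}
```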
diff --git a/README.md b/README.md
index 3021c7f..ef8182a 100644
--- a/README.md
+++ b/README.md
@@ -57,43 +57,32 @@ flyscrape dev example.js
Below is an example scraping script that showcases the capabilities of **flyscrape**:
```javascript
-import { parse } from 'flyscrape';
-
export const config = {
- url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from.
- depth: 1, // Specify how deep links should be followed. (default = 0, no follow)
- allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
- blockedDomains: [], // Specify the blocked domains. (default = none)
- allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
- blockedURLs: [], // Specify the blocked URLs as regex. (default = non blocked)
- proxy: '', // Specify the HTTP(S) proxy to use. (default = no proxy)
- rate: 100, // Specify the rate in requests per second. (default = 100)
-}
-
-export default function({ html, url }) {
- const $ = parse(html);
- const title = $('title');
- const entries = $('.athing').toArray();
-
- if (!entries.length) {
- return null; // Omits scraped pages without entries.
- }
-
- return {
- title: title.text(), // Extract the page title.
- entries: entries.map(entry => { // Extract all news entries.
- const link = $(entry).find('.titleline > a');
- const rank = $(entry).find('.rank');
- const points = $(entry).next().find('.score');
-
- return {
- title: link.text(), // Extract the title text.
- url: link.attr('href'), // Extract the link href.
- rank: parseInt(rank.text().slice(0, -1)), // Extract and cleanup the rank.
- points: parseInt(points.text().replace(' points', '')), // Extract and cleanup the points.
- }
- }),
- };
+ url: "https://news.ycombinator.com/", // Specify the URL to start scraping from.
+ // depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
+ // allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
+ // blockedDomains: [], // Specify the blocked domains. (default = none)
+ // allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
+ // blockedURLs: [], // Specify the blocked URLs as regex. (default = none)
+ // rate: 100, // Specify the rate in requests per second. (default = 100)
+ // cache: "file", // Enable file-based request caching. (default = no cache)
+};
+
+export default function({ doc, absoluteURL }) {
+ const title = doc.find("title");
+ const posts = doc.find(".athing");
+
+ return {
+ title: title.text(),
+ posts: posts.map((post) => {
+ const link = post.find(".titleline > a");
+
+ return {
+ title: link.text(),
+ url: absoluteURL(link.attr("href")),
+ };
+ }),
+ };
}
```
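The `{ doc, absoluteURL }` parameters in the new example are helpers the scraper injects into the script's scope. With goja, Go functions can be handed to the runtime directly via `Runtime.Set`; below is a hypothetical sketch of wiring up an `absoluteURL`-style helper (names and behavior are assumptions for illustration, not flyscrape's actual source):

```go
package main

import (
	"fmt"
	"net/url"

	"github.com/dop251/goja"
)

func main() {
	// The page URL the script is currently scraping (assumed example).
	base, _ := url.Parse("https://news.ycombinator.com/")

	vm := goja.New()

	// Expose a Go function to the script. goja wraps it automatically,
	// so the script can call absoluteURL("item?id=1") like a JS function.
	vm.Set("absoluteURL", func(href string) string {
		ref, err := url.Parse(href)
		if err != nil {
			return href // leave unparsable hrefs untouched
		}
		return base.ResolveReference(ref).String()
	})

	v, err := vm.RunString(`absoluteURL("item?id=1")`)
	if err != nil {
		panic(err)
	}
	fmt.Println(v.Export()) // https://news.ycombinator.com/item?id=1
}
```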