diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-28 18:41:52 +0200 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-28 18:41:52 +0200 |
| commit | e8feea22a4803dbd19224f48e8beaab458cf387f (patch) | |
| tree | 91d302f76787f3dfdbb4df42d9310c74c2b2a1c9 /README.md | |
| parent | 48ff300980369d6c06729a3e15575b77365ff59e (diff) | |
commit bundle
Diffstat (limited to 'README.md')
| -rw-r--r-- | README.md | 39 |
1 file changed, 20 insertions, 19 deletions
@@ -1,6 +1,6 @@ -# flyscrape - Elegant Website Scraping Tool +# flyscrape -flyscrape is a powerful command-line tool designed to streamline the process of efficiently extracting data from websites. Whether you're a developer, data analyst, or researcher, flyscrape empowers you to effortlessly gather information from web pages and transform it into structured data. With its intuitive command-line interface and versatile capabilities, flyscrape simplifies the scraping process while delivering accurate and customizable results. +flyscrape is a powerful command-line tool designed to streamline the process of efficiently extracting data from websites. Whether you're a developer, data analyst, or researcher, flyscrape empowers you to effortlessly gather information from web pages and transform it into structured data. ## Features @@ -21,7 +21,7 @@ To install **flyscrape**, follow these simple steps: 2. Install **flyscrape**: Open a terminal and run the following command: ```bash - go install github.com/philippta/flyscrape@latest + go install github.com/philippta/flyscrape/cmd/flyscrape@latest ``` ## Usage @@ -40,7 +40,6 @@ flyscrape new example.js Execute your scraping script using the `run` command: - ```bash flyscrape run example.js ``` @@ -61,12 +60,15 @@ Below is an example scraping script that showcases the capabilities of **flyscra import { parse } from 'flyscrape'; export const options = { - url: 'https://news.ycombinator.com/', - depth: 1, - allowedDomains: ['news.ycombinator.com'], - blockedDomains: [], - rate: 100, -}; + url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from. + depth: 1, // Specify how deep links should be followed. (default = 0, no follow) + allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url) + blockedDomains: [], // Specify the blocked domains. (default = none) + allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed) + blockedURLs: [], // Specify the blocked URLs as regex. (default = none blocked) + proxy: '', // Specify the HTTP(S) proxy to use. (default = no proxy) + rate: 100, // Specify the rate in requests per second. (default = 100) +} export default function({ html, url }) { const $ = parse(html); @@ -74,22 +76,22 @@ export default function({ html, url }) { const entries = $('.athing').toArray(); if (!entries.length) { - return null; + return null; // Omits scraped pages without entries. } return { - title: title.text(), - entries: entries.map(entry => { + title: title.text(), // Extract the page title. + entries: entries.map(entry => { // Extract all news entries. const link = $(entry).find('.titleline > a'); const rank = $(entry).find('.rank'); const points = $(entry).next().find('.score'); return { - title: link.text(), - url: link.attr('href'), - rank: parseInt(rank.text().slice(0, -1)), - points: parseInt(points.text().replace(' points', '')), - }; + title: link.text(), // Extract the title text. + url: link.attr('href'), // Extract the link href. + rank: parseInt(rank.text().slice(0, -1)), // Extract and cleanup the rank. + points: parseInt(points.text().replace(' points', '')), // Extract and cleanup the points. + } }), }; } @@ -98,4 +100,3 @@ export default function({ html, url }) { ## Contributing We welcome contributions from the community! If you encounter any issues or have suggestions for improvement, please [submit an issue](https://github.com/philippta/flyscrape/issues). - |