author     Philipp Tanlak <philipp.tanlak@gmail.com>  2023-08-28 18:41:52 +0200
committer  Philipp Tanlak <philipp.tanlak@gmail.com>  2023-08-28 18:41:52 +0200
commit     e8feea22a4803dbd19224f48e8beaab458cf387f (patch)
tree       91d302f76787f3dfdbb4df42d9310c74c2b2a1c9 /README.md
parent     48ff300980369d6c06729a3e15575b77365ff59e (diff)
commit bundle
Diffstat (limited to 'README.md')
-rw-r--r--  README.md  39
1 file changed, 20 insertions(+), 19 deletions(-)
diff --git a/README.md b/README.md
index e1c416a..666ac5e 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-# flyscrape - Elegant Website Scraping Tool
+# flyscrape
-flyscrape is a powerful command-line tool designed to streamline the process of efficiently extracting data from websites. Whether you're a developer, data analyst, or researcher, flyscrape empowers you to effortlessly gather information from web pages and transform it into structured data. With its intuitive command-line interface and versatile capabilities, flyscrape simplifies the scraping process while delivering accurate and customizable results.
+flyscrape is a powerful command-line tool designed to streamline the process of efficiently extracting data from websites. Whether you're a developer, data analyst, or researcher, flyscrape empowers you to effortlessly gather information from web pages and transform it into structured data.
## Features
@@ -21,7 +21,7 @@ To install **flyscrape**, follow these simple steps:
2. Install **flyscrape**: Open a terminal and run the following command:
```bash
- go install github.com/philippta/flyscrape@latest
+ go install github.com/philippta/flyscrape/cmd/flyscrape@latest
```
## Usage
@@ -40,7 +40,6 @@ flyscrape new example.js
Execute your scraping script using the `run` command:
-
```bash
flyscrape run example.js
```
@@ -61,12 +60,15 @@ Below is an example scraping script that showcases the capabilities of **flyscra
import { parse } from 'flyscrape';
export const options = {
- url: 'https://news.ycombinator.com/',
- depth: 1,
- allowedDomains: ['news.ycombinator.com'],
- blockedDomains: [],
- rate: 100,
-};
+ url: 'https://news.ycombinator.com/', // Specify the URL to start scraping from.
+ depth: 1, // Specify how deep links should be followed. (default = 0, no follow)
+ allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
+ blockedDomains: [], // Specify the blocked domains. (default = none)
+ allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
+    blockedURLs: [],                                 // Specify the blocked URLs as regex. (default = none blocked)
+ proxy: '', // Specify the HTTP(S) proxy to use. (default = no proxy)
+ rate: 100, // Specify the rate in requests per second. (default = 100)
+}
export default function({ html, url }) {
const $ = parse(html);
@@ -74,22 +76,22 @@ export default function({ html, url }) {
const entries = $('.athing').toArray();
if (!entries.length) {
- return null;
+ return null; // Omits scraped pages without entries.
}
return {
- title: title.text(),
- entries: entries.map(entry => {
+ title: title.text(), // Extract the page title.
+ entries: entries.map(entry => { // Extract all news entries.
const link = $(entry).find('.titleline > a');
const rank = $(entry).find('.rank');
const points = $(entry).next().find('.score');
return {
- title: link.text(),
- url: link.attr('href'),
- rank: parseInt(rank.text().slice(0, -1)),
- points: parseInt(points.text().replace(' points', '')),
- };
+ title: link.text(), // Extract the title text.
+ url: link.attr('href'), // Extract the link href.
+ rank: parseInt(rank.text().slice(0, -1)), // Extract and cleanup the rank.
+ points: parseInt(points.text().replace(' points', '')), // Extract and cleanup the points.
+ }
}),
};
}
@@ -98,4 +100,3 @@ export default function({ html, url }) {
## Contributing
We welcome contributions from the community! If you encounter any issues or have suggestions for improvement, please [submit an issue](https://github.com/philippta/flyscrape/issues).
-
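For reference, the workflow documented by the updated README boils down to the following commands (reproduced from the new install and usage sections in the diff above; `example.js` is simply the sample script name the README uses):

```bash
# Install the flyscrape CLI; this commit changes the install path to the cmd/flyscrape package.
go install github.com/philippta/flyscrape/cmd/flyscrape@latest

# Scaffold a sample scraping script.
flyscrape new example.js

# Execute the scraping script.
flyscrape run example.js
```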