-rw-r--r--   README.md                     109
-rw-r--r--   modules/proxy/proxy.go          4
-rw-r--r--   modules/proxy/proxy_test.go    11
3 files changed, 86 insertions, 38 deletions
diff --git a/README.md b/README.md
index cb7aacc..fcbbafa 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@
## Features
-- **Highly Configurable:** 10 options to fine-tune your scraper.
+- **Highly Configurable:** 13 options to fine-tune your scraper.
- **Standalone:** flyscrape comes as a single binary executable.
- **Scriptable:** Use JavaScript to write your data extraction logic.
- **Simple API:** Extract data from HTML pages with a familiar API.
@@ -32,21 +32,35 @@
- **Request Caching:** Re-run scripts on websites you already scraped.
- **Zero Dependencies:** No need to fill up your disk with npm packages.
-## Example script
+## Overview
+
+- [Example](#example)
+- [Installation](#installation)
+ - [Pre-compiled binary](#pre-compiled-binary)
+ - [Compile from source](#compile-from-source)
+- [Usage](#usage)
+- [Configuration](#configuration)
+- [Query API](#query-api)
+- [Flyscrape API](#flyscrape-api)
+ - [Document Parsing](#document-parsing)
+ - [Basic HTTP Requests](#basic-http-requests)
+ - [File Downloads](#file-downloads)
+- [Issues and suggestions](#issues-and-suggestions)
+
+## Example
+
+This example scrapes the first few pages from Hacker News, specifically the New, Show and Ask sections.
```javascript
export const config = {
- url: "https://news.ycombinator.com/",
- // urls: [] // Specify additional URLs to start from. (default = none)
- // depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
- // follow: [], // Specify the CSS selectors to follow (default = ["a[href]"])
- // allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
- // blockedDomains: [], // Specify the blocked domains. (default = none)
- // allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
- // blockedURLs: [], // Specify the blocked URLs as regex. (default = none)
- // rate: 100, // Specify the rate in requests per second. (default = no rate limit)
- // proxies: [], // Specify the HTTP(S) proxy URLs. (default = no proxy)
- // cache: "file", // Enable file-based request caching. (default = no cache)
+ urls: [
+ "https://news.ycombinator.com/new",
+ "https://news.ycombinator.com/show",
+ "https://news.ycombinator.com/ask",
+ ],
+ depth: 5,
+ cache: "file",
+ follow: ["a.morelink[href]"],
}
export default function ({ doc, absoluteURL }) {
@@ -71,9 +85,9 @@ export default function ({ doc, absoluteURL }) {
$ flyscrape run hackernews.js
[
{
- "url": "https://news.ycombinator.com/",
+ "url": "https://news.ycombinator.com/new",
"data": {
- "title": "Hacker News",
+ "title": "New Links | Hacker News",
"posts": [
{
"title": "Show HN: flyscrape - An standalone and scriptable web scraper",
@@ -98,7 +112,7 @@ Check out the [examples folder](examples) for more detailed examples.
To compile flyscrape from source, follow these steps:
-1. Install Go: Make sure you have Go installed on your system. If not, you can download it from [https://golang.org/](https://golang.org/).
+1. Install Go: Make sure you have Go installed on your system. If not, you can download it from [https://go.dev/](https://go.dev/).
2. Install flyscrape: Open a terminal and run the following command:
@@ -122,7 +136,7 @@ Examples:
$ flyscrape run example.js --url "http://other.com"
# Enable proxy support.
- $ flyscrape run example.js --proxies "http://someproxy:8043"
+ $ flyscrape run example.js --proxy "http://someproxy:8043"
# Follow paginated links.
$ flyscrape run example.js --depth 5 --follow ".next-button > a"
@@ -134,23 +148,52 @@ Below is an example scraping script that showcases the capabilities of flyscrape
```javascript
export const config = {
- url: "https://example.com/", // Specify the URL to start scraping from.
- urls: [ // Specify the URL(s) to start scraping from. If both `url` and `urls`
- "https://example.com/foo", // are provided, all of the specified URLs will be scraped.
- "https://example.com/bar",
+ // Specify the URL to start scraping from.
+ url: "https://example.com/",
+
+ // Specify multiple URLs to start scraping from. (default = [])
+ urls: [
+ "https://anothersite.com/",
+ "https://yetanother.com/",
],
- depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
- follow: [], // Specify the CSS selectors to follow (default = ["a[href]"])
- allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
- blockedDomains: [], // Specify the blocked domains. (default = none)
- allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
- blockedURLs: [], // Specify the blocked URLs as regex. (default = none)
- rate: 100, // Specify the rate in requests per second. (default = no rate limit)
- proxies: [], // Specify the HTTP(S) proxy URLs. (default = no proxy)
- cache: "file", // Enable file-based request caching. (default = no cache)
- headers: { // Specify the HTTP request headers. (default = none)
- "Authorization": "Basic ZGVtbzpwQDU1dzByZA==",
- "User-Agent": "Gecko/1.0",
+
+ // Specify how deep links should be followed. (default = 0, no follow)
+ depth: 5,
+
+ // Specify the CSS selectors to follow. (default = ["a[href]"])
+ follow: [".next > a", ".related a"],
+
+ // Specify the allowed domains. ['*'] for all. (default = domain from url)
+ allowedDomains: ["example.com", "anothersite.com"],
+
+ // Specify the blocked domains. (default = none)
+ blockedDomains: ["somesite.com"],
+
+ // Specify the allowed URLs as regex. (default = all allowed)
+ allowedURLs: ["/posts", "/articles/\\d+"],
+
+ // Specify the blocked URLs as regex. (default = none)
+ blockedURLs: ["/admin"],
+
+ // Specify the rate in requests per second. (default = no rate limit)
+ rate: 100,
+
+ // Specify a single HTTP(S) proxy URL. (default = no proxy)
+ proxy: "http://someproxy.com:8043",
+
+ // Specify multiple HTTP(S) proxy URLs. (default = no proxy)
+ proxies: [
+ "http://someproxy.com:8043",
+ "http://someotherproxy.com:8043",
+ ],
+
+ // Enable file-based request caching. (default = no cache)
+ cache: "file",
+
+ // Specify the HTTP request headers. (default = none)
+ headers: {
+ "Authorization": "Bearer ...",
+ "User-Agent": "Mozilla ...",
},
};
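For reference, the `proxy` and `proxies` options above reach the Go module in the next diff through JSON struct tags. A minimal, self-contained sketch of that mapping, using a local struct that mirrors the `Module` fields from the diff (the actual plumbing inside flyscrape may differ):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// proxyConfig mirrors the json-tagged fields of the proxy Module below.
type proxyConfig struct {
	Proxy   string   `json:"proxy"`
	Proxies []string `json:"proxies"`
}

func main() {
	// The proxy-related subset of a script's exported config.
	raw := []byte(`{"proxy": "http://someproxy.com:8043", "proxies": ["http://someotherproxy.com:8043"]}`)

	var cfg proxyConfig
	if err := json.Unmarshal(raw, &cfg); err != nil {
		panic(err)
	}
	fmt.Println(cfg.Proxy, cfg.Proxies) // both options may be set at the same time
}
```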
diff --git a/modules/proxy/proxy.go b/modules/proxy/proxy.go
index 120a856..ff9aa5c 100644
--- a/modules/proxy/proxy.go
+++ b/modules/proxy/proxy.go
@@ -19,6 +19,7 @@ func init() {
type Module struct {
Proxies []string `json:"proxies"`
+ Proxy string `json:"proxy"`
transports []*http.Transport
}
@@ -35,13 +36,14 @@ func (m *Module) Provision(ctx flyscrape.Context) {
return
}
- for _, purl := range m.Proxies {
+ proxies := m.Proxies
+ if m.Proxy != "" {
+ proxies = append(proxies, m.Proxy)
+ }
+ for _, purl := range proxies {
if parsed, err := url.Parse(purl); err == nil {
m.transports = append(m.transports, &http.Transport{
Proxy: http.ProxyURL(parsed),
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
})
}
}
}
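`AdaptTransport` is not part of this commit, but the updated test below expects traffic to rotate across every provisioned transport, including the one built from `Proxy`. A minimal sketch of a round-robin `http.RoundTripper` over such a transport list; this is an illustration under that assumption, not flyscrape's actual implementation:

```go
package proxy

import (
	"net/http"
	"sync/atomic"
)

// roundRobin cycles requests across a fixed set of per-proxy transports,
// so each configured proxy serves a share of the traffic. That is the
// behavior TestProxyMultiple asserts with its per-proxy call counters.
type roundRobin struct {
	transports []*http.Transport
	next       atomic.Uint64
}

func (r *roundRobin) RoundTrip(req *http.Request) (*http.Response, error) {
	// Pick the next transport in sequence; the modulo wraps the counter
	// around the slice so the rotation never runs out of entries.
	i := r.next.Add(1) % uint64(len(r.transports))
	return r.transports[i].RoundTrip(req)
}
```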
diff --git a/modules/proxy/proxy_test.go b/modules/proxy/proxy_test.go
index 62da23a..219649e 100644
--- a/modules/proxy/proxy_test.go
+++ b/modules/proxy/proxy_test.go
@@ -35,19 +35,21 @@ func TestProxy(t *testing.T) {
}
func TestProxyMultiple(t *testing.T) {
- calls := []int{0, 0}
+ calls := []int{0, 0, 0}
p0 := newProxy(func() { calls[0]++ })
p1 := newProxy(func() { calls[1]++ })
+ p2 := newProxy(func() { calls[2]++ })
defer p0.Close()
defer p1.Close()
+ defer p2.Close()
- mod := &proxy.Module{Proxies: []string{p0.URL, p1.URL}}
+ mod := &proxy.Module{Proxies: []string{p0.URL, p1.URL}, Proxy: p2.URL}
mod.Provision(nil)
trans := mod.AdaptTransport(nil)
req := httptest.NewRequest("GET", "http://www.example.com/", nil)
- for i := 0; i < 10; i++ {
+ for i := 0; i < 50; i++ {
resp, err := trans.RoundTrip(req)
require.NoError(t, err)
require.Equal(t, http.StatusOK, resp.StatusCode)
@@ -55,7 +57,8 @@ func TestProxyMultiple(t *testing.T) {
require.Greater(t, calls[0], 1)
require.Greater(t, calls[1], 1)
- require.Equal(t, 10, calls[0]+calls[1])
+ require.Greater(t, calls[2], 1)
+ require.Equal(t, 50, calls[0]+calls[1]+calls[2])
}
func newProxy(f func()) *httptest.Server {
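The diff cuts off before the body of `newProxy`. A plausible sketch, assuming it simply stands up an `httptest.Server` that invokes the hit-counter callback and answers 200, which is all the assertions above rely on:

```go
func newProxy(f func()) *httptest.Server {
	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		f()                          // count this hit for the test's per-proxy tally
		w.WriteHeader(http.StatusOK) // stub proxy: reply 200 to any request
	}))
}
```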