summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md28
-rw-r--r--cmd/main.go1
-rw-r--r--examples/custom_headers.js26
-rw-r--r--module.go1
-rw-r--r--modules/headers/headers.go42
-rw-r--r--modules/headers/headers_test.go94
6 files changed, 180 insertions, 12 deletions
diff --git a/README.md b/README.md
index 2f84843..cb7aacc 100644
--- a/README.md
+++ b/README.md
@@ -134,20 +134,24 @@ Below is an example scraping script that showcases the capabilities of flyscrape
```javascript
export const config = {
- url: "https://example.com/", // Specify the URL to start scraping from.
- urls: [ // Specify the URL(s) to start scraping from. If both `url` and `urls`
- "https://example.com/foo", // are provided, all of the specified URLs will be scraped.
+ url: "https://example.com/", // Specify the URL to start scraping from.
+ urls: [ // Specify the URL(s) to start scraping from. If both `url` and `urls`
+ "https://example.com/foo", // are provided, all of the specified URLs will be scraped.
"https://example.com/bar",
],
- depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
- follow: [], // Speficy the css selectors to follow (default = ["a[href]"])
- allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
- blockedDomains: [], // Specify the blocked domains. (default = none)
- allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
- blockedURLs: [], // Specify the blocked URLs as regex. (default = none)
- rate: 100, // Specify the rate in requests per second. (default = no rate limit)
- proxies: [], // Specify the HTTP(S) proxy URLs. (default = no proxy)
- cache: "file", // Enable file-based request caching. (default = no cache)
+ depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
+  follow: [],                 // Specify the css selectors to follow (default = ["a[href]"])
+ allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
+ blockedDomains: [], // Specify the blocked domains. (default = none)
+ allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
+ blockedURLs: [], // Specify the blocked URLs as regex. (default = none)
+ rate: 100, // Specify the rate in requests per second. (default = no rate limit)
+ proxies: [], // Specify the HTTP(S) proxy URLs. (default = no proxy)
+ cache: "file", // Enable file-based request caching. (default = no cache)
+  headers: {                  // Specify the HTTP request headers. (default = none)
+ "Authorization": "Basic ZGVtbzpwQDU1dzByZA==",
+ "User-Agent": "Gecko/1.0",
+ },
};
export function setup() {
diff --git a/cmd/main.go b/cmd/main.go
index 7c49dbf..eab3e03 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -16,6 +16,7 @@ import (
_ "github.com/philippta/flyscrape/modules/depth"
_ "github.com/philippta/flyscrape/modules/domainfilter"
_ "github.com/philippta/flyscrape/modules/followlinks"
+ _ "github.com/philippta/flyscrape/modules/headers"
_ "github.com/philippta/flyscrape/modules/jsonprint"
_ "github.com/philippta/flyscrape/modules/proxy"
_ "github.com/philippta/flyscrape/modules/ratelimit"
diff --git a/examples/custom_headers.js b/examples/custom_headers.js
new file mode 100644
index 0000000..6435a39
--- /dev/null
+++ b/examples/custom_headers.js
@@ -0,0 +1,26 @@
+export const config = {
+ url: "https://news.ycombinator.com/",
+ headers: {
+ "Authorization": "Basic ZGVtbzpwQDU1dzByZA==",
+ "User-Agent": "Gecko/1.0",
+ }
+};
+
+export default function({ doc, absoluteURL }) {
+ const posts = doc.find(".athing");
+
+ return {
+ posts: posts.map((post) => {
+ const link = post.find(".titleline > a");
+ const meta = post.next();
+
+ return {
+ url: absoluteURL(link.attr("href")),
+ user: meta.find(".hnuser").text(),
+ title: link.text(),
+ points: meta.find(".score").text().replace(" points", ""),
+ created: meta.find(".age").attr("title"),
+ };
+ }),
+ };
+}
diff --git a/module.go b/module.go
index 9b33de4..9947e76 100644
--- a/module.go
+++ b/module.go
@@ -97,5 +97,6 @@ var (
"proxy",
"ratelimit",
"cache",
+ "headers",
}
)
diff --git a/modules/headers/headers.go b/modules/headers/headers.go
new file mode 100644
index 0000000..877b370
--- /dev/null
+++ b/modules/headers/headers.go
@@ -0,0 +1,42 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package headers
+
+import (
+ "net/http"
+
+ "github.com/philippta/flyscrape"
+)
+
+func init() {
+ flyscrape.RegisterModule(Module{})
+}
+
+type Module struct {
+ Headers map[string]string `json:"headers"`
+}
+
+func (Module) ModuleInfo() flyscrape.ModuleInfo {
+ return flyscrape.ModuleInfo{
+ ID: "headers",
+ New: func() flyscrape.Module { return new(Module) },
+ }
+}
+
+func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
+ if len(m.Headers) == 0 {
+ return t
+ }
+
+ return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
+ for k, v := range m.Headers {
+ r.Header.Set(k, v)
+ }
+
+ return t.RoundTrip(r)
+ })
+}
+
+var _ flyscrape.TransportAdapter = Module{}
diff --git a/modules/headers/headers_test.go b/modules/headers/headers_test.go
new file mode 100644
index 0000000..72b9001
--- /dev/null
+++ b/modules/headers/headers_test.go
@@ -0,0 +1,94 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package headers_test
+
+import (
+ "fmt"
+ "net/http"
+ "reflect"
+ "testing"
+
+ "github.com/philippta/flyscrape"
+ "github.com/philippta/flyscrape/modules/headers"
+ "github.com/philippta/flyscrape/modules/hook"
+ "github.com/philippta/flyscrape/modules/starturl"
+ "github.com/stretchr/testify/require"
+)
+
+func TestHeaders(t *testing.T) {
+ testCases := []struct {
+ name string
+ headersFn func() headers.Module
+ wantHeaders map[string][]string
+ }{
+ {
+ name: "empty custom headers",
+ headersFn: func() headers.Module {
+ return headers.Module{
+ Headers: map[string]string{},
+ }
+ },
+ wantHeaders: map[string][]string{"User-Agent": {"flyscrape/0.1"}},
+ },
+ {
+ name: "no duplicate headers between default and custom",
+ headersFn: func() headers.Module {
+ return headers.Module{
+ Headers: map[string]string{
+ "Authorization": "Basic ZGVtbzpwQDU1dzByZA==",
+ },
+ }
+ },
+ wantHeaders: map[string][]string{
+ "Authorization": {"Basic ZGVtbzpwQDU1dzByZA=="},
+ "User-Agent": {"flyscrape/0.1"},
+ },
+ },
+ {
+ name: "duplicate headers between default and custom",
+ headersFn: func() headers.Module {
+ return headers.Module{
+ Headers: map[string]string{
+ "Authorization": "Basic ZGVtbzpwQDU1dzByZA==",
+ "User-Agent": "Gecko/1.0",
+ },
+ }
+ },
+ wantHeaders: map[string][]string{
+ "Authorization": {"Basic ZGVtbzpwQDU1dzByZA=="},
+ "User-Agent": {"Gecko/1.0"},
+ },
+ },
+ }
+
+ for _, tc := range testCases {
+ t.Run(tc.name, func(t *testing.T) {
+ var headers map[string][]string
+
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: "http://www.example.com"},
+ hook.Module{
+ AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+ return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
+ headers = r.Header
+ return rt.RoundTrip(r)
+ })
+ },
+ },
+ tc.headersFn(),
+ }
+
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
+ scraper.Run()
+
+ require.Truef(
+ t,
+ reflect.DeepEqual(tc.wantHeaders, headers),
+ fmt.Sprintf("expected: %v; actual: %v", tc.wantHeaders, headers),
+ )
+ })
+ }
+}