From cbdbbd249239345f88bea031beb55e84c2f47688 Mon Sep 17 00:00:00 2001
From: Rafi Ramadhana <42462215+rafiramadhana@users.noreply.github.com>
Date: Thu, 23 Nov 2023 18:58:41 +0700
Subject: Add custom request header (#18)

---
 README.md                       | 28 ++++++------
 cmd/main.go                     |  1 +
 examples/custom_headers.js      | 26 ++++++++++++
 module.go                       |  1 +
 modules/headers/headers.go      | 42 ++++++++++++++++++
 modules/headers/headers_test.go | 94 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 180 insertions(+), 12 deletions(-)
 create mode 100644 examples/custom_headers.js
 create mode 100644 modules/headers/headers.go
 create mode 100644 modules/headers/headers_test.go

diff --git a/README.md b/README.md
index 2f84843..cb7aacc 100644
--- a/README.md
+++ b/README.md
@@ -134,20 +134,24 @@ Below is an example scraping script that showcases the capabilities of flyscrape
 
 ```javascript
 export const config = {
-    url: "https://example.com/", // Specify the URL to start scraping from.
-    urls: [ // Specify the URL(s) to start scraping from. If both `url` and `urls`
-        "https://example.com/foo", // are provided, all of the specified URLs will be scraped.
+    url: "https://example.com/",  // Specify the URL to start scraping from.
+    urls: [                       // Specify the URL(s) to start scraping from. If both `url` and `urls`
+        "https://example.com/foo",// are provided, all of the specified URLs will be scraped.
         "https://example.com/bar",
     ],
-    depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
-    follow: [], // Speficy the css selectors to follow (default = ["a[href]"])
-    allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
-    blockedDomains: [], // Specify the blocked domains. (default = none)
-    allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
-    blockedURLs: [], // Specify the blocked URLs as regex. (default = none)
-    rate: 100, // Specify the rate in requests per second. (default = no rate limit)
-    proxies: [], // Specify the HTTP(S) proxy URLs. (default = no proxy)
-    cache: "file", // Enable file-based request caching. (default = no cache)
+    depth: 0,            // Specify how deep links should be followed. (default = 0, no follow)
+    follow: [],          // Specify the css selectors to follow (default = ["a[href]"])
+    allowedDomains: [],  // Specify the allowed domains. ['*'] for all. (default = domain from url)
+    blockedDomains: [],  // Specify the blocked domains. (default = none)
+    allowedURLs: [],     // Specify the allowed URLs as regex. (default = all allowed)
+    blockedURLs: [],     // Specify the blocked URLs as regex. (default = none)
+    rate: 100,           // Specify the rate in requests per second. (default = no rate limit)
+    proxies: [],         // Specify the HTTP(S) proxy URLs. (default = no proxy)
+    cache: "file",       // Enable file-based request caching. (default = no cache)
+    headers: {           // Specify the HTTP request header. (default = none)
+        "Authorization": "Basic ZGVtbzpwQDU1dzByZA==",
+        "User-Agent": "Gecko/1.0",
+    },
 };
 
 export function setup() {
diff --git a/cmd/main.go b/cmd/main.go
index 7c49dbf..eab3e03 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -16,6 +16,7 @@ import (
 	_ "github.com/philippta/flyscrape/modules/depth"
 	_ "github.com/philippta/flyscrape/modules/domainfilter"
 	_ "github.com/philippta/flyscrape/modules/followlinks"
+	_ "github.com/philippta/flyscrape/modules/headers"
 	_ "github.com/philippta/flyscrape/modules/jsonprint"
 	_ "github.com/philippta/flyscrape/modules/proxy"
 	_ "github.com/philippta/flyscrape/modules/ratelimit"
diff --git a/examples/custom_headers.js b/examples/custom_headers.js
new file mode 100644
index 0000000..6435a39
--- /dev/null
+++ b/examples/custom_headers.js
@@ -0,0 +1,26 @@
+export const config = {
+    url: "https://news.ycombinator.com/",
+    headers: {
+        "Authorization": "Basic ZGVtbzpwQDU1dzByZA==",
+        "User-Agent": "Gecko/1.0",
+    }
+};
+
+export default function({ doc, absoluteURL }) {
+    const posts = doc.find(".athing");
+
+    return {
+        posts: posts.map((post) => {
+            const link = post.find(".titleline > a");
+            const meta = post.next();
+
+            return {
+                url: absoluteURL(link.attr("href")),
+                user: meta.find(".hnuser").text(),
+                title: link.text(),
+                points: meta.find(".score").text().replace(" points", ""),
+                created: meta.find(".age").attr("title"),
+            };
+        }),
+    };
+}
diff --git a/module.go b/module.go
index 9b33de4..9947e76 100644
--- a/module.go
+++ b/module.go
@@ -97,5 +97,6 @@ var (
 		"proxy",
 		"ratelimit",
 		"cache",
+		"headers",
 	}
 )
diff --git a/modules/headers/headers.go b/modules/headers/headers.go
new file mode 100644
index 0000000..877b370
--- /dev/null
+++ b/modules/headers/headers.go
@@ -0,0 +1,42 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package headers
+
+import (
+	"net/http"
+
+	"github.com/philippta/flyscrape"
+)
+
+func init() {
+	flyscrape.RegisterModule(Module{})
+}
+
+type Module struct {
+	Headers map[string]string `json:"headers"`
+}
+
+func (Module) ModuleInfo() flyscrape.ModuleInfo {
+	return flyscrape.ModuleInfo{
+		ID:  "headers",
+		New: func() flyscrape.Module { return new(Module) },
+	}
+}
+
+func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
+	if len(m.Headers) == 0 {
+		return t
+	}
+
+	return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
+		for k, v := range m.Headers {
+			r.Header.Set(k, v)
+		}
+
+		return t.RoundTrip(r)
+	})
+}
+
+var _ flyscrape.TransportAdapter = Module{}
diff --git a/modules/headers/headers_test.go b/modules/headers/headers_test.go
new file mode 100644
index 0000000..72b9001
--- /dev/null
+++ b/modules/headers/headers_test.go
@@ -0,0 +1,94 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package headers_test
+
+import (
+	"fmt"
+	"net/http"
+	"reflect"
+	"testing"
+
+	"github.com/philippta/flyscrape"
+	"github.com/philippta/flyscrape/modules/headers"
+	"github.com/philippta/flyscrape/modules/hook"
+	"github.com/philippta/flyscrape/modules/starturl"
+	"github.com/stretchr/testify/require"
+)
+
+func TestHeaders(t *testing.T) {
+	testCases := []struct {
+		name        string
+		headersFn   func() headers.Module
+		wantHeaders map[string][]string
+	}{
+		{
+			name: "empty custom headers",
+			headersFn: func() headers.Module {
+				return headers.Module{
+					Headers: map[string]string{},
+				}
+			},
+			wantHeaders: map[string][]string{"User-Agent": {"flyscrape/0.1"}},
+		},
+		{
+			name: "no duplicate headers between default and custom",
+			headersFn: func() headers.Module {
+				return headers.Module{
+					Headers: map[string]string{
+						"Authorization": "Basic ZGVtbzpwQDU1dzByZA==",
+					},
+				}
+			},
+			wantHeaders: map[string][]string{
+				"Authorization": {"Basic ZGVtbzpwQDU1dzByZA=="},
+				"User-Agent":    {"flyscrape/0.1"},
+			},
+		},
+		{
+			name: "duplicate headers between default and custom",
+			headersFn: func() headers.Module {
+				return headers.Module{
+					Headers: map[string]string{
+						"Authorization": "Basic ZGVtbzpwQDU1dzByZA==",
+						"User-Agent":    "Gecko/1.0",
+					},
+				}
+			},
+			wantHeaders: map[string][]string{
+				"Authorization": {"Basic ZGVtbzpwQDU1dzByZA=="},
+				"User-Agent":    {"Gecko/1.0"},
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			var headers map[string][]string
+
+			mods := []flyscrape.Module{
+				&starturl.Module{URL: "http://www.example.com"},
+				hook.Module{
+					AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
+						return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
+							headers = r.Header
+							return rt.RoundTrip(r)
+						})
+					},
+				},
+				tc.headersFn(),
+			}
+
+			scraper := flyscrape.NewScraper()
+			scraper.Modules = mods
+			scraper.Run()
+
+			require.Truef(
+				t,
+				reflect.DeepEqual(tc.wantHeaders, headers),
+				fmt.Sprintf("expected: %v; actual: %v", tc.wantHeaders, headers),
+			)
+		})
+	}
+}
-- 
cgit v1.2.3