diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-10-17 19:19:38 +0200 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-10-17 19:19:38 +0200 |
| commit | 03b3be0c3bbc70584e8988e1810dc28eacf4521f (patch) | |
| tree | 8eb1071aec0815b1cc8d34a4482907455ae5e8bd | |
| parent | 11d73f57a80bb65b7507ec80433b8f035ed226c2 (diff) | |
Add HTTP(S) Proxy support
| -rw-r--r-- | README.md | 2 | ||||
| -rw-r--r-- | cmd/flyscrape/main.go | 1 | ||||
| -rw-r--r-- | module.go | 12 | ||||
| -rw-r--r-- | modules/proxy/proxy.go | 61 | ||||
| -rw-r--r-- | modules/proxy/proxy_test.go | 62 |
5 files changed, 129 insertions, 9 deletions
@@ -24,6 +24,7 @@ - Depth control - Request caching - Rate limiting +- HTTP(s) Proxy support - Development mode - Single binary executable @@ -146,6 +147,7 @@ export const config = { allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed) blockedURLs: [], // Specify the blocked URLs as regex. (default = none) rate: 100, // Specify the rate in requests per second. (default = no rate limit) + proxies: [], // Specify the HTTP(s) proxy URLs. (default = no proxy) cache: "file", // Enable file-based request caching. (default = no cache) }; diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go index 4450cd7..5ea140a 100644 --- a/cmd/flyscrape/main.go +++ b/cmd/flyscrape/main.go @@ -17,6 +17,7 @@ import ( _ "github.com/philippta/flyscrape/modules/domainfilter" _ "github.com/philippta/flyscrape/modules/followlinks" _ "github.com/philippta/flyscrape/modules/jsonprint" + _ "github.com/philippta/flyscrape/modules/proxy" _ "github.com/philippta/flyscrape/modules/ratelimit" _ "github.com/philippta/flyscrape/modules/starturl" _ "github.com/philippta/flyscrape/modules/urlfilter" @@ -89,16 +89,10 @@ var ( modulesMu sync.RWMutex moduleOrder = []string{ - // Transport Adapters + // Transport adapters must be loaded in a specific order. + // All other modules can be loaded in any order. + "proxy", "ratelimit", "cache", - - // Rest - "starturl", - "followlinks", - "depth", - "domainfilter", - "urlfilter", - "jsonprint", } ) diff --git a/modules/proxy/proxy.go b/modules/proxy/proxy.go new file mode 100644 index 0000000..120a856 --- /dev/null +++ b/modules/proxy/proxy.go @@ -0,0 +1,61 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package proxy + +import ( + "crypto/tls" + "math/rand" + "net/http" + "net/url" + + "github.com/philippta/flyscrape" +) + +func init() { + flyscrape.RegisterModule(Module{}) +} + +type Module struct { + Proxies []string `json:"proxies"` + + transports []*http.Transport +} + +func (Module) ModuleInfo() flyscrape.ModuleInfo { + return flyscrape.ModuleInfo{ + ID: "proxy", + New: func() flyscrape.Module { return new(Module) }, + } +} + +func (m *Module) Provision(ctx flyscrape.Context) { + if m.disabled() { + return + } + + for _, purl := range m.Proxies { + if parsed, err := url.Parse(purl); err == nil { + m.transports = append(m.transports, &http.Transport{ + Proxy: http.ProxyURL(parsed), + TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + }) + } + } +} + +func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { + if m.disabled() { + return t + } + + return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { + transport := m.transports[rand.Intn(len(m.transports))] + return transport.RoundTrip(r) + }) +} + +func (m *Module) disabled() bool { + return len(m.Proxies) == 0 +} diff --git a/modules/proxy/proxy_test.go b/modules/proxy/proxy_test.go new file mode 100644 index 0000000..e6058b8 --- /dev/null +++ b/modules/proxy/proxy_test.go @@ -0,0 +1,62 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package proxy_test + +import ( + "net/http" + "net/http/httptest" + "testing" + + "github.com/philippta/flyscrape" + "github.com/philippta/flyscrape/modules/proxy" + "github.com/philippta/flyscrape/modules/starturl" + "github.com/stretchr/testify/require" +) + +func TestProxy(t *testing.T) { + var called bool + p := newProxy(func() { called = true }) + defer p.Close() + + scraper := flyscrape.NewScraper() + scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"}) + scraper.LoadModule(&proxy.Module{ + Proxies: []string{p.URL}, + }) + + scraper.Run() + require.True(t, called) +} + +func TestProxyMultiple(t *testing.T) { + calls := []int{0, 0} + p0 := newProxy(func() { calls[0]++ }) + p1 := newProxy(func() { calls[1]++ }) + defer p0.Close() + defer p1.Close() + + mod := &proxy.Module{Proxies: []string{p0.URL, p1.URL}} + mod.Provision(nil) + trans := mod.AdaptTransport(nil) + + req := httptest.NewRequest("GET", "http://www.example.com/", nil) + + for i := 0; i < 10; i++ { + resp, err := trans.RoundTrip(req) + require.NoError(t, err) + require.Equal(t, http.StatusOK, resp.StatusCode) + } + + require.Greater(t, calls[0], 1) + require.Greater(t, calls[1], 1) + require.Equal(t, 10, calls[0]+calls[1]) +} + +func newProxy(f func()) *httptest.Server { + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + f() + w.Write([]byte("response from proxy")) + })) +} |