summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Philipp Tanlak <philipp.tanlak@gmail.com> 2023-10-17 19:19:38 +0200
committer: Philipp Tanlak <philipp.tanlak@gmail.com> 2023-10-17 19:19:38 +0200
commit: 03b3be0c3bbc70584e8988e1810dc28eacf4521f (patch)
tree: 8eb1071aec0815b1cc8d34a4482907455ae5e8bd
parent: 11d73f57a80bb65b7507ec80433b8f035ed226c2 (diff)
Add HTTP(S) Proxy support
-rw-r--r-- README.md 2
-rw-r--r-- cmd/flyscrape/main.go 1
-rw-r--r-- module.go 12
-rw-r--r-- modules/proxy/proxy.go 61
-rw-r--r-- modules/proxy/proxy_test.go 62
5 files changed, 129 insertions, 9 deletions
diff --git a/README.md b/README.md
index 1802a56..eb8cce2 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,7 @@
- Depth control
- Request caching
- Rate limiting
+- HTTP(s) Proxy support
- Development mode
- Single binary executable
@@ -146,6 +147,7 @@ export const config = {
allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
blockedURLs: [], // Specify the blocked URLs as regex. (default = none)
rate: 100, // Specify the rate in requests per second. (default = no rate limit)
+ proxies: [], // Specify the HTTP(s) proxy URLs. (default = no proxy)
cache: "file", // Enable file-based request caching. (default = no cache)
};
diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go
index 4450cd7..5ea140a 100644
--- a/cmd/flyscrape/main.go
+++ b/cmd/flyscrape/main.go
@@ -17,6 +17,7 @@ import (
_ "github.com/philippta/flyscrape/modules/domainfilter"
_ "github.com/philippta/flyscrape/modules/followlinks"
_ "github.com/philippta/flyscrape/modules/jsonprint"
+ _ "github.com/philippta/flyscrape/modules/proxy"
_ "github.com/philippta/flyscrape/modules/ratelimit"
_ "github.com/philippta/flyscrape/modules/starturl"
_ "github.com/philippta/flyscrape/modules/urlfilter"
diff --git a/module.go b/module.go
index da81ffb..0540c91 100644
--- a/module.go
+++ b/module.go
@@ -89,16 +89,10 @@ var (
modulesMu sync.RWMutex
moduleOrder = []string{
- // Transport Adapters
+ // Transport adapters must be loaded in a specific order.
+ // All other modules can be loaded in any order.
+ "proxy",
"ratelimit",
"cache",
-
- // Rest
- "starturl",
- "followlinks",
- "depth",
- "domainfilter",
- "urlfilter",
- "jsonprint",
}
)
diff --git a/modules/proxy/proxy.go b/modules/proxy/proxy.go
new file mode 100644
index 0000000..120a856
--- /dev/null
+++ b/modules/proxy/proxy.go
@@ -0,0 +1,61 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package proxy
+
+import (
+	"crypto/tls"
+	"errors"
+	"math/rand"
+	"net/http"
+	"net/url"
+
+	"github.com/philippta/flyscrape"
+)
+
+func init() {
+ flyscrape.RegisterModule(Module{})
+}
+
+// Module sends outgoing requests through one of a set of HTTP(S) proxies.
+type Module struct {
+ // Proxies holds the proxy URLs, decoded from the "proxies" JSON key.
+ Proxies []string `json:"proxies"`
+
+ // transports holds one transport per proxy URL that parsed successfully.
+ // It is filled in by Provision.
+ transports []*http.Transport
+}
+
+// ModuleInfo identifies the module as "proxy" and supplies a constructor
+// that returns a fresh, zero-valued *Module.
+func (Module) ModuleInfo() flyscrape.ModuleInfo {
+	newModule := func() flyscrape.Module {
+		return &Module{}
+	}
+	return flyscrape.ModuleInfo{ID: "proxy", New: newModule}
+}
+
+// Provision builds one HTTP transport per configured proxy URL. Entries that
+// url.Parse rejects are skipped without reporting an error. It does nothing
+// when no proxies are configured. The ctx argument is not used.
+//
+// NOTE(review): every transport sets InsecureSkipVerify, which turns off TLS
+// certificate verification for connections made through it. Confirm this is
+// intentional before relying on the module for HTTPS targets.
+func (m *Module) Provision(ctx flyscrape.Context) {
+	if m.disabled() {
+		return
+	}
+
+	for _, raw := range m.Proxies {
+		proxyURL, err := url.Parse(raw)
+		if err != nil {
+			// Unparseable entries are dropped; the remaining proxies stay usable.
+			continue
+		}
+
+		tr := &http.Transport{
+			Proxy:           http.ProxyURL(proxyURL),
+			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
+		}
+		m.transports = append(m.transports, tr)
+	}
+}
+
+// AdaptTransport returns a round tripper that sends each request through one
+// of the proxy transports built by Provision, picked at random per request.
+//
+// When no proxies are configured, t is returned unchanged. When proxies are
+// configured, t is not consulted: requests go straight to the selected proxy
+// transport.
+//
+// If proxies are configured but none of their URLs could be parsed, every
+// request fails with an error. Without this check rand.Intn would be called
+// with 0 and panic. Falling back to t instead would silently send traffic
+// unproxied, so the request is rejected.
+func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
+	if m.disabled() {
+		return t
+	}
+
+	return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
+		// Checked per request so the result does not depend on whether
+		// Provision ran before or after AdaptTransport.
+		if len(m.transports) == 0 {
+			return nil, errors.New("proxy: none of the configured proxy URLs could be parsed")
+		}
+
+		transport := m.transports[rand.Intn(len(m.transports))]
+		return transport.RoundTrip(r)
+	})
+}
+
+// disabled reports whether the module has no proxy URLs configured.
+func (m *Module) disabled() bool {
+	return len(m.Proxies) < 1
+}
diff --git a/modules/proxy/proxy_test.go b/modules/proxy/proxy_test.go
new file mode 100644
index 0000000..e6058b8
--- /dev/null
+++ b/modules/proxy/proxy_test.go
@@ -0,0 +1,62 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package proxy_test
+
+import (
+ "net/http"
+ "net/http/httptest"
+ "testing"
+
+ "github.com/philippta/flyscrape"
+ "github.com/philippta/flyscrape/modules/proxy"
+ "github.com/philippta/flyscrape/modules/starturl"
+ "github.com/stretchr/testify/require"
+)
+
+func TestProxy(t *testing.T) {
+ var called bool
+ p := newProxy(func() { called = true })
+ defer p.Close()
+
+ scraper := flyscrape.NewScraper()
+ scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
+ scraper.LoadModule(&proxy.Module{
+ Proxies: []string{p.URL},
+ })
+
+ scraper.Run()
+ require.True(t, called)
+}
+
+func TestProxyMultiple(t *testing.T) {
+ calls := []int{0, 0}
+ p0 := newProxy(func() { calls[0]++ })
+ p1 := newProxy(func() { calls[1]++ })
+ defer p0.Close()
+ defer p1.Close()
+
+ mod := &proxy.Module{Proxies: []string{p0.URL, p1.URL}}
+ mod.Provision(nil)
+ trans := mod.AdaptTransport(nil)
+
+ req := httptest.NewRequest("GET", "http://www.example.com/", nil)
+
+ for i := 0; i < 10; i++ {
+ resp, err := trans.RoundTrip(req)
+ require.NoError(t, err)
+ require.Equal(t, http.StatusOK, resp.StatusCode)
+ }
+
+ require.Greater(t, calls[0], 1)
+ require.Greater(t, calls[1], 1)
+ require.Equal(t, 10, calls[0]+calls[1])
+}
+
+// newProxy starts a test HTTP server standing in for a proxy. Every request
+// it receives invokes f and is answered with a fixed body. The caller is
+// responsible for closing the returned server.
+func newProxy(f func()) *httptest.Server {
+	handler := func(w http.ResponseWriter, _ *http.Request) {
+		f()
+		w.Write([]byte("response from proxy"))
+	}
+	return httptest.NewServer(http.HandlerFunc(handler))
+}