diff options
Diffstat (limited to 'modules')
| -rw-r--r-- | modules/browser/browser.go | 135 | ||||
| -rw-r--r-- | modules/browser/browser_test.go | 160 | ||||
| -rw-r--r-- | modules/cookies/cookies.go | 2 | ||||
| -rw-r--r-- | modules/ratelimit/ratelimit.go | 13 |
4 files changed, 307 insertions, 3 deletions
diff --git a/modules/browser/browser.go b/modules/browser/browser.go new file mode 100644 index 0000000..5802d24 --- /dev/null +++ b/modules/browser/browser.go @@ -0,0 +1,135 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package browser + +import ( + "fmt" + "io" + "log" + "net/http" + "os" + "strings" + "sync" + "time" + + "github.com/go-rod/rod" + "github.com/go-rod/rod/lib/launcher" + "github.com/go-rod/rod/lib/proto" + "github.com/philippta/flyscrape" +) + +func init() { + flyscrape.RegisterModule(Module{}) +} + +type Module struct { + Browser bool `json:"browser"` + Headless *bool `json:"headless"` +} + +func (Module) ModuleInfo() flyscrape.ModuleInfo { + return flyscrape.ModuleInfo{ + ID: "browser", + New: func() flyscrape.Module { return new(Module) }, + } +} + +func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { + if !m.Browser { + return t + } + + headless := true + if m.Headless != nil { + headless = *m.Headless + } + + ct, err := chromeTransport(headless) + if err != nil { + log.Println(err) + os.Exit(1) + } + + return ct +} + +func chromeTransport(headless bool) (flyscrape.RoundTripFunc, error) { + serviceURL, err := launcher.New(). + Headless(headless). + Launch() + if err != nil { + return nil, fmt.Errorf("failed to launch browser: %w", err) + } + + browser := rod.New().ControlURL(serviceURL).NoDefaultDevice() + if err := browser.Connect(); err != nil { + return nil, fmt.Errorf("failed to connect to browser: %w", err) + } + + return func(r *http.Request) (*http.Response, error) { + select { + case <-r.Context().Done(): + return nil, r.Context().Err() + default: + } + + page := browser.MustPage() + defer page.Close() + + var once sync.Once + var networkResponse *proto.NetworkResponse + go page.EachEvent(func(e *proto.NetworkResponseReceived) { + if e.Type != proto.NetworkResourceTypeDocument { + return + } + once.Do(func() { + networkResponse = e.Response + }) + })() + + page = page.Context(r.Context()) + + for h := range r.Header { + if h == "User-Agent" && strings.HasPrefix(r.UserAgent(), "flyscrape") { + continue + } + page.MustSetExtraHeaders(h, r.Header.Get(h)) + } + + if err := page.Navigate(r.URL.String()); err != nil { + return nil, err + } + + if err := page.WaitStable(time.Second); err != nil { + return nil, err + } + + html, err := page.HTML() + if err != nil { + return nil, err + } + + resp := &http.Response{ + StatusCode: 200, + Status: "200 OK", + Body: io.NopCloser(strings.NewReader(html)), + Header: http.Header{"Content-Type": []string{"text/html"}}, + } + + if networkResponse != nil { + resp.StatusCode = networkResponse.Status + resp.Status = networkResponse.StatusText + resp.Header = http.Header{} + + for k, v := range networkResponse.Headers { + resp.Header.Set(k, v.String()) + } + } + + return resp, err + }, nil +} + +var _ flyscrape.TransportAdapter = Module{} diff --git a/modules/browser/browser_test.go b/modules/browser/browser_test.go new file mode 100644 index 0000000..f7fe22a --- /dev/null +++ b/modules/browser/browser_test.go @@ -0,0 +1,160 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package browser_test + +import ( + "fmt" + "net/http" + "net/http/httptest" + "testing" + + "github.com/philippta/flyscrape" + "github.com/philippta/flyscrape/modules/browser" + "github.com/philippta/flyscrape/modules/headers" + "github.com/philippta/flyscrape/modules/hook" + "github.com/philippta/flyscrape/modules/starturl" + "github.com/stretchr/testify/require" +) + +func TestBrowser(t *testing.T) { + var called bool + + srv := newServer(func(w http.ResponseWriter, r *http.Request) { + called = true + w.Write([]byte(`<h1>Hello Browser</h1><a href="foo">Foo</a>`)) + }) + defer srv.Close() + + var body string + + mods := []flyscrape.Module{ + &starturl.Module{URL: srv.URL}, + &browser.Module{Browser: true}, + &hook.Module{ + ReceiveResponseFn: func(r *flyscrape.Response) { + body = string(r.Body) + }, + }, + } + + scraper := flyscrape.NewScraper() + scraper.Modules = mods + scraper.Run() + + require.True(t, called) + require.Contains(t, body, "Hello Browser") +} +func TestBrowserStatusCode(t *testing.T) { + srv := newServer(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(404) + }) + defer srv.Close() + + var statusCode int + + mods := []flyscrape.Module{ + &starturl.Module{URL: srv.URL}, + &browser.Module{Browser: true}, + &hook.Module{ + ReceiveResponseFn: func(r *flyscrape.Response) { + statusCode = r.StatusCode + }, + }, + } + + scraper := flyscrape.NewScraper() + scraper.Modules = mods + scraper.Run() + + require.Equal(t, 404, statusCode) +} + +func TestBrowserRequestHeader(t *testing.T) { + srv := newServer(func(w http.ResponseWriter, r *http.Request) { + w.Write([]byte(r.Header.Get("User-Agent"))) + }) + defer srv.Close() + + var body string + + mods := []flyscrape.Module{ + &starturl.Module{URL: srv.URL}, + &browser.Module{Browser: true}, + &headers.Module{ + Headers: map[string]string{ + "User-Agent": "custom-headers", + }, + }, + &hook.Module{ + ReceiveResponseFn: func(r *flyscrape.Response) { + body = string(r.Body) + }, + }, + } + + scraper := flyscrape.NewScraper() + scraper.Modules = mods + scraper.Run() + + require.Contains(t, body, "custom-headers") +} + +func TestBrowserResponseHeader(t *testing.T) { + srv := newServer(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Foo", "bar") + }) + defer srv.Close() + + var header string + + mods := []flyscrape.Module{ + &starturl.Module{URL: srv.URL}, + &browser.Module{Browser: true}, + &hook.Module{ + ReceiveResponseFn: func(r *flyscrape.Response) { + header = r.Headers.Get("Foo") + }, + }, + } + + scraper := flyscrape.NewScraper() + scraper.Modules = mods + scraper.Run() + + require.Equal(t, header, "bar") +} + +func TestBrowserUnsetFlyscrapeUserAgent(t *testing.T) { + srv := newServer(func(w http.ResponseWriter, r *http.Request) { + w.Write([]byte(r.Header.Get("User-Agent"))) + }) + defer srv.Close() + + var body string + + mods := []flyscrape.Module{ + &starturl.Module{URL: srv.URL}, + &browser.Module{Browser: true}, + &hook.Module{ + ReceiveResponseFn: func(r *flyscrape.Response) { + body = string(r.Body) + }, + }, + } + + scraper := flyscrape.NewScraper() + scraper.Modules = mods + scraper.Run() + + fmt.Println(body) + require.Contains(t, body, "Mozilla/5.0") + require.NotContains(t, body, "flyscrape") +} + +func newServer(f func(http.ResponseWriter, *http.Request)) *httptest.Server { + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + f(w, r) + })) +} diff --git a/modules/cookies/cookies.go b/modules/cookies/cookies.go index 4be344d..2f57a3f 100644 --- a/modules/cookies/cookies.go +++ b/modules/cookies/cookies.go @@ -43,7 +43,7 @@ func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { var stores []kooky.CookieStore for _, store := range kooky.FindAllCookieStores() { - if store.Browser() == m.Cookies { + if store.Browser() == m.Cookies && store.IsDefaultProfile() { stores = append(stores, store) } } diff --git a/modules/ratelimit/ratelimit.go b/modules/ratelimit/ratelimit.go index 152c6fd..f68f8e9 100644 --- a/modules/ratelimit/ratelimit.go +++ b/modules/ratelimit/ratelimit.go @@ -17,8 +17,9 @@ func init() { } type Module struct { - Rate int `json:"rate"` - Concurrency int `json:"concurrency"` + Rate int `json:"rate"` + Concurrency int `json:"concurrency"` + Browser bool `json:"browser"` ticker *time.Ticker ratelimit chan struct{} @@ -46,6 +47,10 @@ func (m *Module) Provision(v flyscrape.Context) { }() } + if m.browserEnabled() && !m.concurrencyEnabled() { + m.Concurrency = 1 + } + if m.concurrencyEnabled() { m.concurrency = make(chan struct{}, m.Concurrency) for i := 0; i < m.Concurrency; i++ { @@ -83,6 +88,10 @@ func (m *Module) concurrencyEnabled() bool { return m.Concurrency > 0 } +func (m *Module) browserEnabled() bool { + return m.Browser +} + var ( _ flyscrape.TransportAdapter = (*Module)(nil) _ flyscrape.Provisioner = (*Module)(nil) |