summaryrefslogtreecommitdiff
path: root/modules
diff options
context:
space:
mode:
author Philipp Tanlak <philipp.tanlak@gmail.com> 2024-02-24 13:59:29 +0100
committer Philipp Tanlak <philipp.tanlak@gmail.com> 2024-02-24 13:59:29 +0100
commit 6cc19d5c412b8adc89092702d4bc21b416fc4fae (patch)
tree 3142b426395a409647a519270e31145fa9000f65 /modules
parent 3dc39e9eba495b18dad2a7d79d45dcb634729dd7 (diff)
Browser rendering
Diffstat (limited to 'modules')
-rw-r--r-- modules/browser/browser.go 135
-rw-r--r-- modules/browser/browser_test.go 160
-rw-r--r-- modules/cookies/cookies.go 2
-rw-r--r-- modules/ratelimit/ratelimit.go 13
4 files changed, 307 insertions, 3 deletions
diff --git a/modules/browser/browser.go b/modules/browser/browser.go
new file mode 100644
index 0000000..5802d24
--- /dev/null
+++ b/modules/browser/browser.go
@@ -0,0 +1,135 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package browser
+
+import (
+ "fmt"
+ "io"
+ "log"
+ "net/http"
+ "os"
+ "strings"
+ "sync"
+ "time"
+
+ "github.com/go-rod/rod"
+ "github.com/go-rod/rod/lib/launcher"
+ "github.com/go-rod/rod/lib/proto"
+ "github.com/philippta/flyscrape"
+)
+
+func init() {
+ flyscrape.RegisterModule(Module{})
+}
+
+type Module struct { // Module holds the JSON-configurable options of the "browser" module.
+ Browser bool `json:"browser"` // when false, AdaptTransport returns the given transport unchanged
+ Headless *bool `json:"headless"` // optional; AdaptTransport treats nil as true
+}
+
+// ModuleInfo describes the module to flyscrape: its ID is "browser" and its
+// constructor hands out a pointer to a fresh, zero-value Module.
+func (Module) ModuleInfo() flyscrape.ModuleInfo {
+	construct := func() flyscrape.Module {
+		return &Module{}
+	}
+
+	return flyscrape.ModuleInfo{ID: "browser", New: construct}
+}
+
+// AdaptTransport swaps the given transport for a Chrome-backed one when the
+// Browser option is set; otherwise t is returned untouched.
+//
+// The browser runs headless unless Headless is explicitly set to false. If
+// the browser transport cannot be created, the error is logged and the
+// process exits with status 1.
+func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
+	if !m.Browser {
+		return t
+	}
+
+	// A missing Headless option means "headless".
+	runHeadless := m.Headless == nil || *m.Headless
+
+	transport, err := chromeTransport(runHeadless)
+	if err != nil {
+		log.Println(err)
+		os.Exit(1)
+	}
+
+	return transport
+}
+
+// chromeTransport launches a Chrome instance through rod's launcher, connects
+// to it and returns a round-trip function that loads every request in a
+// fresh browser page.
+//
+// headless is passed to the launcher's Headless option. An error is returned
+// when the browser cannot be launched or connected to.
+//
+// The returned function answers with the page's HTML as body, read after
+// WaitStable(time.Second) has returned. Status and headers are copied from
+// the first document-type network response observed on the page; if none
+// was observed, the response defaults to "200 OK" with a text/html content
+// type.
+func chromeTransport(headless bool) (flyscrape.RoundTripFunc, error) {
+	serviceURL, err := launcher.New().
+		Headless(headless).
+		Launch()
+	if err != nil {
+		return nil, fmt.Errorf("failed to launch browser: %w", err)
+	}
+
+	browser := rod.New().ControlURL(serviceURL).NoDefaultDevice()
+	if err := browser.Connect(); err != nil {
+		return nil, fmt.Errorf("failed to connect to browser: %w", err)
+	}
+
+	return func(r *http.Request) (*http.Response, error) {
+		// Skip all browser work for requests that are already cancelled.
+		if err := r.Context().Err(); err != nil {
+			return nil, err
+		}
+
+		// Page reports failures as an error, where MustPage would panic.
+		tab, err := browser.Page(proto.TargetCreateTarget{})
+		if err != nil {
+			return nil, fmt.Errorf("failed to open page: %w", err)
+		}
+		// Close is deferred on the page handle that is not bound to the
+		// request context.
+		defer tab.Close()
+
+		// The event callback runs on its own goroutine, so the captured
+		// response is guarded by a mutex.
+		var (
+			mu          sync.Mutex
+			docResponse *proto.NetworkResponse
+		)
+		go tab.EachEvent(func(e *proto.NetworkResponseReceived) bool {
+			if e.Type != proto.NetworkResourceTypeDocument {
+				return false
+			}
+			mu.Lock()
+			docResponse = e.Response
+			mu.Unlock()
+			// Only the first document response is wanted; returning true
+			// ends the event subscription.
+			return true
+		})()
+
+		page := tab.Context(r.Context())
+
+		// Collect all headers and apply them in a single call: a later call
+		// to SetExtraHeaders replaces the headers set by an earlier one.
+		var extra []string
+		for name := range r.Header {
+			// Drop flyscrape's own User-Agent so the browser's is sent.
+			if name == "User-Agent" && strings.HasPrefix(r.UserAgent(), "flyscrape") {
+				continue
+			}
+			extra = append(extra, name, r.Header.Get(name))
+		}
+		if len(extra) > 0 {
+			if _, err := page.SetExtraHeaders(extra); err != nil {
+				return nil, err
+			}
+		}
+
+		if err := page.Navigate(r.URL.String()); err != nil {
+			return nil, err
+		}
+
+		if err := page.WaitStable(time.Second); err != nil {
+			return nil, err
+		}
+
+		html, err := page.HTML()
+		if err != nil {
+			return nil, err
+		}
+
+		resp := &http.Response{
+			StatusCode: http.StatusOK,
+			Status:     "200 OK",
+			Body:       io.NopCloser(strings.NewReader(html)),
+			Header:     http.Header{"Content-Type": []string{"text/html"}},
+		}
+
+		mu.Lock()
+		observed := docResponse
+		mu.Unlock()
+
+		if observed != nil {
+			// net/http formats Status as "<code> <text>"; fall back to the
+			// standard text when the browser reports none.
+			text := observed.StatusText
+			if text == "" {
+				text = http.StatusText(observed.Status)
+			}
+
+			resp.StatusCode = observed.Status
+			resp.Status = fmt.Sprintf("%d %s", observed.Status, text)
+			resp.Header = http.Header{}
+
+			for k, v := range observed.Headers {
+				resp.Header.Set(k, v.String())
+			}
+		}
+
+		return resp, nil
+	}, nil
+}
+
+var _ flyscrape.TransportAdapter = Module{} // compile-time check that Module implements flyscrape.TransportAdapter
diff --git a/modules/browser/browser_test.go b/modules/browser/browser_test.go
new file mode 100644
index 0000000..f7fe22a
--- /dev/null
+++ b/modules/browser/browser_test.go
@@ -0,0 +1,160 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package browser_test
+
+import (
+ "fmt"
+ "net/http"
+ "net/http/httptest"
+ "testing"
+
+ "github.com/philippta/flyscrape"
+ "github.com/philippta/flyscrape/modules/browser"
+ "github.com/philippta/flyscrape/modules/headers"
+ "github.com/philippta/flyscrape/modules/hook"
+ "github.com/philippta/flyscrape/modules/starturl"
+ "github.com/stretchr/testify/require"
+)
+
+// TestBrowser checks that the test server is actually requested when the
+// browser module is enabled and that the page body reaches the response hook.
+func TestBrowser(t *testing.T) {
+	var (
+		requested bool
+		got       string
+	)
+
+	server := newServer(func(w http.ResponseWriter, r *http.Request) {
+		requested = true
+		w.Write([]byte(`<h1>Hello Browser</h1><a href="foo">Foo</a>`))
+	})
+	defer server.Close()
+
+	s := flyscrape.NewScraper()
+	s.Modules = []flyscrape.Module{
+		&starturl.Module{URL: server.URL},
+		&browser.Module{Browser: true},
+		&hook.Module{
+			ReceiveResponseFn: func(resp *flyscrape.Response) {
+				got = string(resp.Body)
+			},
+		},
+	}
+	s.Run()
+
+	require.True(t, requested)
+	require.Contains(t, got, "Hello Browser")
+}
+// TestBrowserStatusCode checks that a 404 sent by the server is reported as
+// the status code of the scraped response.
+func TestBrowserStatusCode(t *testing.T) {
+	server := newServer(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(404)
+	})
+	defer server.Close()
+
+	var got int
+
+	s := flyscrape.NewScraper()
+	s.Modules = []flyscrape.Module{
+		&starturl.Module{URL: server.URL},
+		&browser.Module{Browser: true},
+		&hook.Module{
+			ReceiveResponseFn: func(resp *flyscrape.Response) {
+				got = resp.StatusCode
+			},
+		},
+	}
+	s.Run()
+
+	require.Equal(t, 404, got)
+}
+
+// TestBrowserRequestHeader checks that a User-Agent configured through the
+// headers module is the one the server receives. The server echoes the
+// header back as the page body.
+func TestBrowserRequestHeader(t *testing.T) {
+	server := newServer(func(w http.ResponseWriter, r *http.Request) {
+		w.Write([]byte(r.Header.Get("User-Agent")))
+	})
+	defer server.Close()
+
+	var got string
+
+	s := flyscrape.NewScraper()
+	s.Modules = []flyscrape.Module{
+		&starturl.Module{URL: server.URL},
+		&browser.Module{Browser: true},
+		&headers.Module{
+			Headers: map[string]string{
+				"User-Agent": "custom-headers",
+			},
+		},
+		&hook.Module{
+			ReceiveResponseFn: func(resp *flyscrape.Response) {
+				got = string(resp.Body)
+			},
+		},
+	}
+	s.Run()
+
+	require.Contains(t, got, "custom-headers")
+}
+
+// TestBrowserResponseHeader checks that a header set by the server is
+// available on the scraped response.
+func TestBrowserResponseHeader(t *testing.T) {
+	srv := newServer(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Foo", "bar")
+	})
+	defer srv.Close()
+
+	var header string
+
+	mods := []flyscrape.Module{
+		&starturl.Module{URL: srv.URL},
+		&browser.Module{Browser: true},
+		&hook.Module{
+			ReceiveResponseFn: func(r *flyscrape.Response) {
+				header = r.Headers.Get("Foo")
+			},
+		},
+	}
+
+	scraper := flyscrape.NewScraper()
+	scraper.Modules = mods
+	scraper.Run()
+
+	// require.Equal takes (expected, actual); the reverse order mislabels
+	// the two values in the failure output.
+	require.Equal(t, "bar", header)
+}
+
+// TestBrowserUnsetFlyscrapeUserAgent checks that, without a custom
+// User-Agent, the server sees a "Mozilla/5.0" agent and no "flyscrape" one.
+// The server echoes the header back as the page body.
+func TestBrowserUnsetFlyscrapeUserAgent(t *testing.T) {
+	server := newServer(func(w http.ResponseWriter, r *http.Request) {
+		w.Write([]byte(r.Header.Get("User-Agent")))
+	})
+	defer server.Close()
+
+	var got string
+
+	s := flyscrape.NewScraper()
+	s.Modules = []flyscrape.Module{
+		&starturl.Module{URL: server.URL},
+		&browser.Module{Browser: true},
+		&hook.Module{
+			ReceiveResponseFn: func(resp *flyscrape.Response) {
+				got = string(resp.Body)
+			},
+		},
+	}
+	s.Run()
+
+	fmt.Println(got)
+	require.Contains(t, got, "Mozilla/5.0")
+	require.NotContains(t, got, "flyscrape")
+}
+
+// newServer starts an httptest server that answers every request with f.
+// The caller is responsible for closing the returned server.
+func newServer(f func(http.ResponseWriter, *http.Request)) *httptest.Server {
+	handler := http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
+		f(rw, req)
+	})
+
+	return httptest.NewServer(handler)
+}
diff --git a/modules/cookies/cookies.go b/modules/cookies/cookies.go
index 4be344d..2f57a3f 100644
--- a/modules/cookies/cookies.go
+++ b/modules/cookies/cookies.go
@@ -43,7 +43,7 @@ func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
var stores []kooky.CookieStore
for _, store := range kooky.FindAllCookieStores() {
- if store.Browser() == m.Cookies {
+ if store.Browser() == m.Cookies && store.IsDefaultProfile() {
stores = append(stores, store)
}
}
diff --git a/modules/ratelimit/ratelimit.go b/modules/ratelimit/ratelimit.go
index 152c6fd..f68f8e9 100644
--- a/modules/ratelimit/ratelimit.go
+++ b/modules/ratelimit/ratelimit.go
@@ -17,8 +17,9 @@ func init() {
}
type Module struct {
- Rate int `json:"rate"`
- Concurrency int `json:"concurrency"`
+ Rate int `json:"rate"`
+ Concurrency int `json:"concurrency"`
+ Browser bool `json:"browser"`
ticker *time.Ticker
ratelimit chan struct{}
@@ -46,6 +47,10 @@ func (m *Module) Provision(v flyscrape.Context) {
}()
}
+ if m.browserEnabled() && !m.concurrencyEnabled() {
+ m.Concurrency = 1
+ }
+
if m.concurrencyEnabled() {
m.concurrency = make(chan struct{}, m.Concurrency)
for i := 0; i < m.Concurrency; i++ {
@@ -83,6 +88,10 @@ func (m *Module) concurrencyEnabled() bool {
return m.Concurrency > 0
}
+func (m *Module) browserEnabled() bool { // reports the value of the Browser option
+ return m.Browser
+}
+
var (
_ flyscrape.TransportAdapter = (*Module)(nil)
_ flyscrape.Provisioner = (*Module)(nil)