diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2024-02-24 13:59:29 +0100 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2024-02-24 13:59:29 +0100 |
| commit | 6cc19d5c412b8adc89092702d4bc21b416fc4fae (patch) | |
| tree | 3142b426395a409647a519270e31145fa9000f65 /modules/browser/browser.go | |
| parent | 3dc39e9eba495b18dad2a7d79d45dcb634729dd7 (diff) | |
Browser rendering
Diffstat (limited to 'modules/browser/browser.go')
| -rw-r--r-- | modules/browser/browser.go | 135 |
1 files changed, 135 insertions, 0 deletions
diff --git a/modules/browser/browser.go b/modules/browser/browser.go new file mode 100644 index 0000000..5802d24 --- /dev/null +++ b/modules/browser/browser.go @@ -0,0 +1,135 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package browser + +import ( + "fmt" + "io" + "log" + "net/http" + "os" + "strings" + "sync" + "time" + + "github.com/go-rod/rod" + "github.com/go-rod/rod/lib/launcher" + "github.com/go-rod/rod/lib/proto" + "github.com/philippta/flyscrape" +) + +func init() { + flyscrape.RegisterModule(Module{}) +} + +type Module struct { + Browser bool `json:"browser"` + Headless *bool `json:"headless"` +} + +func (Module) ModuleInfo() flyscrape.ModuleInfo { + return flyscrape.ModuleInfo{ + ID: "browser", + New: func() flyscrape.Module { return new(Module) }, + } +} + +func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { + if !m.Browser { + return t + } + + headless := true + if m.Headless != nil { + headless = *m.Headless + } + + ct, err := chromeTransport(headless) + if err != nil { + log.Println(err) + os.Exit(1) + } + + return ct +} + +func chromeTransport(headless bool) (flyscrape.RoundTripFunc, error) { + serviceURL, err := launcher.New(). + Headless(headless). + Launch() + if err != nil { + return nil, fmt.Errorf("failed to launch browser: %w", err) + } + + browser := rod.New().ControlURL(serviceURL).NoDefaultDevice() + if err := browser.Connect(); err != nil { + return nil, fmt.Errorf("failed to connect to browser: %w", err) + } + + return func(r *http.Request) (*http.Response, error) { + select { + case <-r.Context().Done(): + return nil, r.Context().Err() + default: + } + + page := browser.MustPage() + defer page.Close() + + var once sync.Once + var networkResponse *proto.NetworkResponse + go page.EachEvent(func(e *proto.NetworkResponseReceived) { + if e.Type != proto.NetworkResourceTypeDocument { + return + } + once.Do(func() { + networkResponse = e.Response + }) + })() + + page = page.Context(r.Context()) + + for h := range r.Header { + if h == "User-Agent" && strings.HasPrefix(r.UserAgent(), "flyscrape") { + continue + } + page.MustSetExtraHeaders(h, r.Header.Get(h)) + } + + if err := page.Navigate(r.URL.String()); err != nil { + return nil, err + } + + if err := page.WaitStable(time.Second); err != nil { + return nil, err + } + + html, err := page.HTML() + if err != nil { + return nil, err + } + + resp := &http.Response{ + StatusCode: 200, + Status: "200 OK", + Body: io.NopCloser(strings.NewReader(html)), + Header: http.Header{"Content-Type": []string{"text/html"}}, + } + + if networkResponse != nil { + resp.StatusCode = networkResponse.Status + resp.Status = networkResponse.StatusText + resp.Header = http.Header{} + + for k, v := range networkResponse.Headers { + resp.Header.Set(k, v.String()) + } + } + + return resp, err + }, nil +} + +var _ flyscrape.TransportAdapter = Module{} |