// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package browser
import (
"fmt"
"io"
"log"
"net/http"
"os"
"strings"
"sync"
"time"
"github.com/go-rod/rod"
"github.com/go-rod/rod/lib/launcher"
"github.com/go-rod/rod/lib/proto"
"github.com/philippta/flyscrape"
)
func init() {
flyscrape.RegisterModule(Module{})
}
// Module is the flyscrape module that can route requests through a browser
// controlled via rod instead of the regular HTTP transport.
type Module struct {
	// Browser turns the module on. While false, AdaptTransport hands back the
	// transport it was given without modification.
	Browser bool `json:"browser"`

	// Headless selects headless mode for the launched browser. A nil value is
	// treated as true by AdaptTransport.
	Headless *bool `json:"headless"`

	// browser is the instance started by AdaptTransport; Finalize closes it.
	browser *rod.Browser
}
// ModuleInfo describes the module to flyscrape: its ID is "browser" and its
// constructor yields a pointer to a fresh zero-value Module.
func (Module) ModuleInfo() flyscrape.ModuleInfo {
	construct := func() flyscrape.Module { return &Module{} }

	return flyscrape.ModuleInfo{
		ID:  "browser",
		New: construct,
	}
}
// AdaptTransport swaps the given transport for one backed by a browser when
// the Browser option is set; otherwise t is returned unchanged.
//
// The browser runs headless unless Headless is explicitly set to false. If the
// browser cannot be started, the error is logged and the process exits with
// status 1.
func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
	if !m.Browser {
		return t
	}

	// An unset Headless option means "headless".
	headless := m.Headless == nil || *m.Headless

	instance, err := newBrowser(headless)
	if err != nil {
		log.Println(err)
		os.Exit(1)
	}

	m.browser = instance
	return chromeTransport(instance)
}
// Finalize closes the browser started by AdaptTransport. It does nothing when
// no browser was started.
func (m *Module) Finalize() {
	if m.browser == nil {
		return
	}
	m.browser.Close()
}
// newBrowser launches a browser process (headless or not, as requested) and
// returns a rod client connected to it with no default device emulation.
//
// A non-nil error means either the launch or the connection failed; the
// returned browser is nil in that case.
func newBrowser(headless bool) (*rod.Browser, error) {
	chrome := launcher.New().Headless(headless)

	controlURL, err := chrome.Launch()
	if err != nil {
		return nil, fmt.Errorf("failed to launch browser: %w", err)
	}

	client := rod.New().ControlURL(controlURL).NoDefaultDevice()
	err = client.Connect()
	if err != nil {
		return nil, fmt.Errorf("failed to connect to browser: %w", err)
	}

	return client, nil
}
// chromeTransport returns a RoundTripFunc that answers every request by
// loading its URL in a fresh page of browser and returning the rendered HTML
// as the response body.
//
// Request headers other than Cookie are forwarded as extra HTTP headers, except
// a User-Agent that starts with "flyscrape". Cookies are converted with
// parseCookies and installed on the page. When a document-type network
// response is observed, its status and headers replace the "200 OK" and
// text/html defaults of the returned response. The page is closed before the
// round trip returns.
//
// Errors are returned when the request context is already done, or when the
// page cannot be opened, the headers cannot be set, navigation fails or the
// HTML cannot be read. Failing to set cookies is only logged, and the load
// waits are best-effort: whatever HTML is present after at most 10 seconds is
// returned.
func chromeTransport(browser *rod.Browser) flyscrape.RoundTripFunc {
	return func(r *http.Request) (*http.Response, error) {
		// Do not open a page for a request that is already cancelled.
		if err := r.Context().Err(); err != nil {
			return nil, err
		}

		// Report a failed page creation as an error instead of panicking
		// inside the transport.
		page, err := browser.Page(proto.TargetCreateTarget{})
		if err != nil {
			return nil, fmt.Errorf("failed to open browser page: %w", err)
		}
		defer page.Close()

		// The listener gets its own cancellable context so its goroutine is
		// guaranteed to end when this round trip returns.
		listener, stopListening := page.WithCancel()
		defer stopListening()

		// networkResponse is written by the event goroutine and read below, so
		// every access goes through mu.
		var (
			mu              sync.Mutex
			networkResponse *proto.NetworkResponse
		)
		go listener.EachEvent(func(e *proto.NetworkResponseReceived) {
			if e.Type != proto.NetworkResourceTypeDocument {
				return
			}
			mu.Lock()
			defer mu.Unlock()
			// Keep only the first document response.
			if networkResponse == nil {
				networkResponse = e.Response
			}
		})()

		page = page.Context(r.Context())

		// Collect all headers and set them in a single call: the underlying
		// Network.setExtraHTTPHeaders command replaces the whole header set
		// each time, so setting them one by one keeps only the last one.
		var extraHeaders []string
		for name := range r.Header {
			if name == "Cookie" {
				continue
			}
			if name == "User-Agent" && strings.HasPrefix(r.UserAgent(), "flyscrape") {
				continue
			}
			extraHeaders = append(extraHeaders, name, r.Header.Get(name))
		}
		if len(extraHeaders) > 0 {
			if _, err := page.SetExtraHeaders(extraHeaders); err != nil {
				return nil, fmt.Errorf("failed to set request headers: %w", err)
			}
		}

		// Cookies stay best-effort: a failure is reported but does not abort
		// the request.
		// NOTE(review): rod's Page.SetCookies appears to clear all browser
		// cookies when given nil (i.e. when the request carries no usable
		// cookies) - confirm this is intended.
		if err := page.SetCookies(parseCookies(r)); err != nil {
			log.Printf("failed to set cookies for %s: %v", r.URL, err)
		}

		if err := page.Navigate(r.URL.String()); err != nil {
			return nil, err
		}

		// All waits share one 10 second budget. Their errors (typically the
		// budget running out) are deliberately ignored so that a slow page
		// still yields its current HTML.
		timeout := page.Timeout(10 * time.Second)
		// WaitRequestIdle only returns a wait function; it has to be invoked
		// for any waiting to happen. It is created right after navigation so
		// requests issued during loading are tracked.
		waitRequestIdle := timeout.WaitRequestIdle(time.Second, nil, nil, nil)
		_ = timeout.WaitLoad()
		_ = timeout.WaitDOMStable(300*time.Millisecond, 0)
		waitRequestIdle()

		html, err := page.HTML()
		if err != nil {
			return nil, err
		}

		resp := &http.Response{
			StatusCode: http.StatusOK,
			Status:     "200 OK",
			Body:       io.NopCloser(strings.NewReader(html)),
			Header:     http.Header{"Content-Type": []string{"text/html"}},
		}

		mu.Lock()
		docResponse := networkResponse
		mu.Unlock()

		if docResponse != nil {
			// http.Response.Status has the form "200 OK"; fall back to the
			// standard reason phrase when the browser reports none.
			statusText := docResponse.StatusText
			if statusText == "" {
				statusText = http.StatusText(docResponse.Status)
			}
			resp.StatusCode = docResponse.Status
			resp.Status = strings.TrimSpace(fmt.Sprintf("%d %s", docResponse.Status, statusText))

			resp.Header = make(http.Header, len(docResponse.Headers))
			for k, v := range docResponse.Headers {
				resp.Header.Set(k, v.String())
			}
		}

		return resp, nil
	}
}
// parseCookies converts the first Cookie header of r into cookie parameters
// for the browser.
//
// Each cookie is scoped to the last two labels of the request's host name
// (prefixed with a dot), path "/", SameSite "Lax", with Expires set to -1, and
// carries the full request URL. The result is nil when there is no Cookie
// header, when the host name has fewer than two dot-separated labels, or when
// the header yields no cookies.
func parseCookies(r *http.Request) []*proto.NetworkCookieParam {
	raw := r.Header.Get("Cookie")
	if raw == "" {
		return nil
	}

	labels := strings.Split(r.URL.Hostname(), ".")
	if len(labels) < 2 {
		return nil
	}
	domain := "." + strings.Join(labels[len(labels)-2:], ".")

	// A throwaway request lets net/http do the Cookie header parsing.
	carrier := http.Request{Header: http.Header{"Cookie": []string{raw}}}
	target := r.URL.String()

	// Left as a nil slice on purpose: callers receive nil, not an empty slice,
	// when nothing could be parsed.
	var params []*proto.NetworkCookieParam
	for _, c := range carrier.Cookies() {
		param := &proto.NetworkCookieParam{
			Name:     c.Name,
			Value:    c.Value,
			Domain:   domain,
			Path:     "/",
			Secure:   false,
			HTTPOnly: false,
			SameSite: "Lax",
			Expires:  -1,
			URL:      target,
		}
		params = append(params, param)
	}
	return params
}
// Compile-time checks that *Module implements the flyscrape interfaces it is
// meant to satisfy.
var (
	_ flyscrape.TransportAdapter = (*Module)(nil)
	_ flyscrape.Finalizer        = (*Module)(nil)
)