From 92baa1671dd2a9dcc43d14f3a893f0e7f9a4b34d Mon Sep 17 00:00:00 2001
From: Philipp Tanlak
Date: Mon, 26 Feb 2024 21:05:35 +0100
Subject: Improve cookie support for browser mode

---
 modules/browser/browser.go | 76 ++++++++++++++++++++++++++++++++++++++++------
 modules/cookies/cookies.go |  4 +++
 modules/proxy/proxy.go     |  2 +-
 3 files changed, 72 insertions(+), 10 deletions(-)

(limited to 'modules')

diff --git a/modules/browser/browser.go b/modules/browser/browser.go
index 5802d24..55b7a81 100644
--- a/modules/browser/browser.go
+++ b/modules/browser/browser.go
@@ -27,6 +27,8 @@ func init() {
 type Module struct {
 	Browser  bool  `json:"browser"`
 	Headless *bool `json:"headless"`
+
+	browser *rod.Browser
 }
 
 func (Module) ModuleInfo() flyscrape.ModuleInfo {
@@ -36,7 +38,7 @@ func (Module) ModuleInfo() flyscrape.ModuleInfo {
 	}
 }
 
-func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
+func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
 	if !m.Browser {
 		return t
 	}
@@ -46,16 +48,24 @@ func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
 		headless = *m.Headless
 	}
 
-	ct, err := chromeTransport(headless)
+	browser, err := newBrowser(headless)
 	if err != nil {
 		log.Println(err)
 		os.Exit(1)
 	}
 
-	return ct
+	m.browser = browser
+
+	return chromeTransport(browser)
+}
+
+func (m *Module) Finalize() {
+	if m.browser != nil {
+		m.browser.Close()
+	}
 }
 
-func chromeTransport(headless bool) (flyscrape.RoundTripFunc, error) {
+func newBrowser(headless bool) (*rod.Browser, error) {
 	serviceURL, err := launcher.New().
 		Headless(headless).
 		Launch()
@@ -68,6 +78,10 @@ func chromeTransport(headless bool) (flyscrape.RoundTripFunc, error) {
 		return nil, fmt.Errorf("failed to connect to browser: %w", err)
 	}
 
+	return browser, nil
+}
+
+func chromeTransport(browser *rod.Browser) flyscrape.RoundTripFunc {
 	return func(r *http.Request) (*http.Response, error) {
 		select {
 		case <-r.Context().Done():
@@ -92,19 +106,25 @@ func chromeTransport(headless bool) (flyscrape.RoundTripFunc, error) {
 		page = page.Context(r.Context())
 
 		for h := range r.Header {
+			if h == "Cookie" {
+				continue
+			}
 			if h == "User-Agent" && strings.HasPrefix(r.UserAgent(), "flyscrape") {
 				continue
 			}
 			page.MustSetExtraHeaders(h, r.Header.Get(h))
 		}
 
+		page.SetCookies(parseCookies(r))
+
 		if err := page.Navigate(r.URL.String()); err != nil {
 			return nil, err
 		}
 
-		if err := page.WaitStable(time.Second); err != nil {
-			return nil, err
-		}
+		timeout := page.Timeout(10 * time.Second)
+		timeout.WaitLoad()
+		timeout.WaitDOMStable(300*time.Millisecond, 0)
+		timeout.WaitRequestIdle(time.Second, nil, nil, nil)
 
 		html, err := page.HTML()
 		if err != nil {
@@ -129,7 +149,45 @@ func chromeTransport(headless bool) (flyscrape.RoundTripFunc, error) {
 		}
 
 		return resp, err
-	}, nil
+	}
 }
 
-var _ flyscrape.TransportAdapter = Module{}
+func parseCookies(r *http.Request) []*proto.NetworkCookieParam {
+	rawCookie := r.Header.Get("Cookie")
+	if rawCookie == "" {
+		return nil
+	}
+
+	header := http.Header{}
+	header.Add("Cookie", rawCookie)
+	request := http.Request{Header: header}
+
+	domainSegs := strings.Split(r.URL.Hostname(), ".")
+	if len(domainSegs) < 2 {
+		return nil
+	}
+
+	domain := "." + strings.Join(domainSegs[len(domainSegs)-2:], ".")
+
+	var cookies []*proto.NetworkCookieParam
+	for _, cookie := range request.Cookies() {
+		cookies = append(cookies, &proto.NetworkCookieParam{
+			Name:     cookie.Name,
+			Value:    cookie.Value,
+			Domain:   domain,
+			Path:     "/",
+			Secure:   false,
+			HTTPOnly: false,
+			SameSite: "Lax",
+			Expires:  -1,
+			URL:      r.URL.String(),
+		})
+	}
+
+	return cookies
+}
+
+var (
+	_ flyscrape.TransportAdapter = &Module{}
+	_ flyscrape.Finalizer        = &Module{}
+)
diff --git a/modules/cookies/cookies.go b/modules/cookies/cookies.go
index 2f57a3f..2af2a27 100644
--- a/modules/cookies/cookies.go
+++ b/modules/cookies/cookies.go
@@ -51,6 +51,10 @@ func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
 	return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
 		for _, store := range stores {
 			for _, cookie := range store.Cookies(r.URL) {
+				// Unquote cookie value: drop exactly one leading and one trailing double quote.
+				if len(cookie.Value) >= 2 && cookie.Value[0] == '"' && cookie.Value[len(cookie.Value)-1] == '"' {
+					cookie.Value = cookie.Value[1 : len(cookie.Value)-1]
+				}
 				r.AddCookie(cookie)
 			}
 		}
diff --git a/modules/proxy/proxy.go b/modules/proxy/proxy.go
index 832bc7a..a8f4963 100644
--- a/modules/proxy/proxy.go
+++ b/modules/proxy/proxy.go
@@ -62,5 +62,5 @@ func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
 }
 
 func (m *Module) disabled() bool {
-	return len(m.Proxies) == 0
+	return len(m.Proxies) == 0 && m.Proxy == ""
 }
-- 
cgit v1.2.3