author     Philipp Tanlak <philipp.tanlak@gmail.com>   2024-02-26 21:05:35 +0100
committer  Philipp Tanlak <philipp.tanlak@gmail.com>   2024-02-26 21:05:35 +0100
commit     92baa1671dd2a9dcc43d14f3a893f0e7f9a4b34d
tree       6b6d25e81d0ac2b3ae53ea777ea1482f4be94563
parent     74dbcb0f58fc402a17799c0f6a6c3c775df0b760
Improve cookie support for browser mode
-rw-r--r--  modules/browser/browser.go | 76
-rw-r--r--  modules/cookies/cookies.go |  4
-rw-r--r--  modules/proxy/proxy.go     |  2
-rw-r--r--  scrape.go                  | 19
4 files changed, 85 insertions(+), 16 deletions(-)
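The diff below builds on flyscrape's transport-adapter pattern: each module wraps the shared http.RoundTripper to inspect or rewrite requests before they go out. A minimal, self-contained sketch of that pattern using only net/http (the flyscrape-specific names are taken from the diff; everything else here, including the header, is illustrative):

package main

import (
	"fmt"
	"net/http"
)

// RoundTripFunc lets an ordinary function satisfy http.RoundTripper,
// mirroring the flyscrape.RoundTripFunc used throughout the diff.
type RoundTripFunc func(*http.Request) (*http.Response, error)

func (f RoundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) {
	return f(r)
}

// adaptTransport has the same shape as the modules' AdaptTransport methods:
// wrap the incoming transport, adjust the request, then delegate to it.
func adaptTransport(t http.RoundTripper) http.RoundTripper {
	return RoundTripFunc(func(r *http.Request) (*http.Response, error) {
		r.Header.Set("X-Example", "demo") // hypothetical header, illustration only
		return t.RoundTrip(r)
	})
}

func main() {
	client := &http.Client{Transport: adaptTransport(http.DefaultTransport)}
	resp, err := client.Get("https://example.com")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}

The browser module below uses the same shape, but when browser mode is on it ignores the wrapped transport entirely and serves responses from a rod-driven Chrome page instead.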
diff --git a/modules/browser/browser.go b/modules/browser/browser.go
index 5802d24..55b7a81 100644
--- a/modules/browser/browser.go
+++ b/modules/browser/browser.go
@@ -27,6 +27,8 @@ func init() {
type Module struct {
Browser bool `json:"browser"`
Headless *bool `json:"headless"`
+
+ browser *rod.Browser
}
func (Module) ModuleInfo() flyscrape.ModuleInfo {
@@ -36,7 +38,7 @@ func (Module) ModuleInfo() flyscrape.ModuleInfo {
}
}
-func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
+func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
if !m.Browser {
return t
}
@@ -46,16 +48,24 @@ func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
headless = *m.Headless
}
- ct, err := chromeTransport(headless)
+ browser, err := newBrowser(headless)
if err != nil {
log.Println(err)
os.Exit(1)
}
- return ct
+ m.browser = browser
+
+ return chromeTransport(browser)
+}
+
+func (m *Module) Finalize() {
+ if m.browser != nil {
+ m.browser.Close()
+ }
}
-func chromeTransport(headless bool) (flyscrape.RoundTripFunc, error) {
+func newBrowser(headless bool) (*rod.Browser, error) {
serviceURL, err := launcher.New().
Headless(headless).
Launch()
@@ -68,6 +78,10 @@ func chromeTransport(headless bool) (flyscrape.RoundTripFunc, error) {
return nil, fmt.Errorf("failed to connect to browser: %w", err)
}
+ return browser, nil
+}
+
+func chromeTransport(browser *rod.Browser) flyscrape.RoundTripFunc {
return func(r *http.Request) (*http.Response, error) {
select {
case <-r.Context().Done():
@@ -92,19 +106,25 @@ func chromeTransport(headless bool) (flyscrape.RoundTripFunc, error) {
page = page.Context(r.Context())
for h := range r.Header {
+ if h == "Cookie" {
+ continue
+ }
if h == "User-Agent" && strings.HasPrefix(r.UserAgent(), "flyscrape") {
continue
}
page.MustSetExtraHeaders(h, r.Header.Get(h))
}
+ page.SetCookies(parseCookies(r))
+
if err := page.Navigate(r.URL.String()); err != nil {
return nil, err
}
- if err := page.WaitStable(time.Second); err != nil {
- return nil, err
- }
+ timeout := page.Timeout(10 * time.Second)
+ timeout.WaitLoad()
+ timeout.WaitDOMStable(300*time.Millisecond, 0)
+ timeout.WaitRequestIdle(time.Second, nil, nil, nil)
html, err := page.HTML()
if err != nil {
@@ -129,7 +149,45 @@ func chromeTransport(headless bool) (flyscrape.RoundTripFunc, error) {
}
return resp, err
- }, nil
+ }
}
-var _ flyscrape.TransportAdapter = Module{}
+func parseCookies(r *http.Request) []*proto.NetworkCookieParam {
+ rawCookie := r.Header.Get("Cookie")
+ if rawCookie == "" {
+ return nil
+ }
+
+ header := http.Header{}
+ header.Add("Cookie", rawCookie)
+ request := http.Request{Header: header}
+
+ domainSegs := strings.Split(r.URL.Hostname(), ".")
+ if len(domainSegs) < 2 {
+ return nil
+ }
+
+ domain := "." + strings.Join(domainSegs[len(domainSegs)-2:], ".")
+
+ var cookies []*proto.NetworkCookieParam
+ for _, cookie := range request.Cookies() {
+ cookies = append(cookies, &proto.NetworkCookieParam{
+ Name: cookie.Name,
+ Value: cookie.Value,
+ Domain: domain,
+ Path: "/",
+ Secure: false,
+ HTTPOnly: false,
+ SameSite: "Lax",
+ Expires: -1,
+ URL: r.URL.String(),
+ })
+ }
+
+ return cookies
+}
+
+var (
+ _ flyscrape.TransportAdapter = &Module{}
+ _ flyscrape.Finalizer = &Module{}
+)
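parseCookies above reuses net/http's own header parsing by wrapping the raw Cookie header in a throwaway http.Request before translating each cookie into a rod proto.NetworkCookieParam. The trick in isolation, with made-up header values:

package main

import (
	"fmt"
	"net/http"
)

func main() {
	// A raw Cookie header as it would appear on the outgoing request.
	raw := "session=abc123; theme=dark"

	// Wrap it in a throwaway request so net/http does the parsing.
	header := http.Header{}
	header.Add("Cookie", raw)
	req := http.Request{Header: header}

	for _, c := range req.Cookies() {
		fmt.Printf("%s = %s\n", c.Name, c.Value)
	}
	// Output:
	// session = abc123
	// theme = dark
}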
diff --git a/modules/cookies/cookies.go b/modules/cookies/cookies.go
index 2f57a3f..2af2a27 100644
--- a/modules/cookies/cookies.go
+++ b/modules/cookies/cookies.go
@@ -51,6 +51,10 @@ func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
for _, store := range stores {
for _, cookie := range store.Cookies(r.URL) {
+ // Unquote cookie value
+ if len(cookie.Value) >= 2 && cookie.Value[0] == '"' && cookie.Value[len(cookie.Value)-1] == '"' {
+ cookie.Value = cookie.Value[1 : len(cookie.Value)-1]
+ }
r.AddCookie(cookie)
}
}
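The new branch strips one pair of surrounding double quotes from cookie values before they are added to the request, since cookie stores sometimes hold values in quoted form. The slicing in isolation, with made-up values (a sketch, not flyscrape code):

package main

import "fmt"

// unquote removes one pair of surrounding double quotes,
// mirroring the branch added above.
func unquote(v string) string {
	if len(v) >= 2 && v[0] == '"' && v[len(v)-1] == '"' {
		return v[1 : len(v)-1]
	}
	return v
}

func main() {
	fmt.Println(unquote(`"abc123"`)) // abc123
	fmt.Println(unquote(`plain`))    // plain (unchanged)
}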
diff --git a/modules/proxy/proxy.go b/modules/proxy/proxy.go
index 832bc7a..a8f4963 100644
--- a/modules/proxy/proxy.go
+++ b/modules/proxy/proxy.go
@@ -62,5 +62,5 @@ func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
}
func (m *Module) disabled() bool {
- return len(m.Proxies) == 0
+ return len(m.Proxies) == 0 && m.Proxy == ""
}
diff --git a/scrape.go b/scrape.go
index cb7f18c..d12a5e0 100644
--- a/scrape.go
+++ b/scrape.go
@@ -123,7 +123,6 @@ func (s *Scraper) scrape() {
for i := 0; i < 500; i++ {
go func() {
for job := range s.jobs {
- job := job
s.process(job.url, job.depth)
s.wg.Done()
}
@@ -197,11 +196,19 @@ func (s *Scraper) process(url string, depth int) {
}
if s.ScrapeFunc != nil {
- response.Data, err = s.ScrapeFunc(ScrapeParams{HTML: string(response.Body), URL: request.URL})
- if err != nil {
- response.Error = err
- return
- }
+ func() {
+ defer func() {
+ if r := recover(); r != nil {
+ log.Println(r)
+ }
+ }()
+
+ response.Data, err = s.ScrapeFunc(ScrapeParams{HTML: string(response.Body), URL: request.URL})
+ if err != nil {
+ response.Error = err
+ return
+ }
+ }()
}
}
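The scrape.go change wraps the user-supplied ScrapeFunc in a recover, so a panicking scrape callback logs the panic instead of killing the worker goroutine. The pattern in isolation (the function names here are illustrative, not flyscrape API):

package main

import (
	"fmt"
	"log"
)

// runSafely invokes fn and converts a panic into a logged message,
// the same shape the scraper now uses around ScrapeFunc.
func runSafely(fn func() (any, error)) (data any, err error) {
	defer func() {
		if r := recover(); r != nil {
			log.Println(r)
		}
	}()
	return fn()
}

func main() {
	data, err := runSafely(func() (any, error) {
		panic("script blew up") // a misbehaving scrape callback
	})
	fmt.Println(data, err) // <nil> <nil>: the panic was contained
}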