diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2024-02-24 13:59:29 +0100 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2024-02-24 13:59:29 +0100 |
| commit | 6cc19d5c412b8adc89092702d4bc21b416fc4fae (patch) | |
| tree | 3142b426395a409647a519270e31145fa9000f65 | |
| parent | 3dc39e9eba495b18dad2a7d79d45dcb634729dd7 (diff) | |
Browser rendering
| -rw-r--r-- | README.md | 57 | ||||
| -rw-r--r-- | cmd/flyscrape/main.go | 1 | ||||
| -rw-r--r-- | examples/browser.js | 19 | ||||
| -rw-r--r-- | go.mod | 6 | ||||
| -rw-r--r-- | go.sum | 16 | ||||
| -rw-r--r-- | module.go | 1 | ||||
| -rw-r--r-- | modules/browser/browser.go | 135 | ||||
| -rw-r--r-- | modules/browser/browser_test.go | 160 | ||||
| -rw-r--r-- | modules/cookies/cookies.go | 2 | ||||
| -rw-r--r-- | modules/ratelimit/ratelimit.go | 13 | ||||
| -rw-r--r-- | template.js | 8 |
11 files changed, 379 insertions, 39 deletions
@@ -24,13 +24,11 @@ ## Features -- **Highly Configurable:** 13 options to fine-tune your scraper. -- **Standalone:** flyscrape comes as a single binary executable. +- **Standalone:** Flyscrape comes as a single binary executable. +- **jQuery-like:** Extract data from HTML pages with a familiar API. - **Scriptable:** Use JavaScript to write your data extraction logic. -- **Simple API:** Extract data from HTML pages with a familiar API. -- **Fast Iteration:** Use the development mode to get quick feedback. -- **Request Caching:** Re-run scripts on websites you already scraped. -- **Zero Dependencies:** No need to fill up your disk with npm packages. +- **Tons of features:** 20 features to configure the scraping behavior. +- **Browser Mode:** Render JavaScript heavy pages using a headless Browser. ## Overview @@ -60,8 +58,16 @@ export const config = { "https://news.ycombinator.com/show", "https://news.ycombinator.com/ask", ], - depth: 5, + + // Cache request for later. cache: "file", + + // Enable JavaScript rendering. + browser: true, + headless: false, + + // Follow pagination 5 times. + depth: 5, follow: ["a.morelink[href]"], } @@ -181,6 +187,12 @@ export const config = { "https://yetanother.com/", ], + // Enable rendering with headless browser. (default = false) + browser: true, + + // Specify if browser should be headless or not. (default = true) + headless: false, + // Specify how deep links should be followed. (default = 0, no follow) depth: 5, @@ -206,9 +218,11 @@ export const config = { concurrency: 1, // Specify a single HTTP(S) proxy URL. (default = no proxy) + // Note: Not compatible with browser mode. proxy: "http://someproxy.com:8043", // Specify multiple HTTP(S) proxy URLs. (default = no proxy) + // Note: Not compatible with browser mode. proxies: [ "http://someproxy.com:8043", "http://someotherproxy.com:8043", @@ -288,35 +302,6 @@ const doc = parse(`<div class="foo">bar</div>`); const text = doc.find(".foo").text(); ``` -### Basic HTTP Requests - -```javascript -import http from "flyscrape/http"; - -const response = http.get("https://example.com") - -const response = http.postForm("https://example.com", { - "username": "foo", - "password": "bar", -}) - -const response = http.postJSON("https://example.com", { - "username": "foo", - "password": "bar", -}) - -// Contents of response -{ - body: "<html>...</html>", - status: 200, - headers: { - "Content-Type": "text/html", - // ... - }, - error": "", -} -``` - ### File Downloads ```javascript diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go index aff7ec4..7419f6e 100644 --- a/cmd/flyscrape/main.go +++ b/cmd/flyscrape/main.go @@ -11,6 +11,7 @@ import ( "os" "github.com/philippta/flyscrape/cmd" + _ "github.com/philippta/flyscrape/modules/browser" _ "github.com/philippta/flyscrape/modules/cache" _ "github.com/philippta/flyscrape/modules/cookies" _ "github.com/philippta/flyscrape/modules/depth" diff --git a/examples/browser.js b/examples/browser.js new file mode 100644 index 0000000..de26bd5 --- /dev/null +++ b/examples/browser.js @@ -0,0 +1,19 @@ +export const config = { + url: "https://www.airbnb.com/", + browser: true, + // headless: false, +}; + +export default function ({ doc, absoluteURL }) { + const rooms = doc.find("[itemprop=itemListElement]"); + + return { + listings: rooms.map(room => { + const link = "https://" + room.find("meta[itemprop=url]").attr("content"); + const image = room.find("img").attr("src"); + const desc = new Set(room.find("[role=group] > div > div > div").map(d => d.text()).filter(Boolean)); + + return { link, image, desc } + }), + } +} @@ -10,6 +10,7 @@ require ( github.com/dop251/goja_nodejs v0.0.0-20230914102007-198ba9a8b098 github.com/evanw/esbuild v0.18.14 github.com/fsnotify/fsnotify v1.6.0 + github.com/go-rod/rod v0.114.7 github.com/inancgumus/screen v0.0.0-20190314163918-06e984b86ed3 github.com/mattn/go-sqlite3 v1.14.17 github.com/nlnwa/whatwg-url v0.4.0 @@ -40,6 +41,11 @@ require ( github.com/tidwall/gjson v1.17.0 // indirect github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.1 // indirect + github.com/ysmood/fetchup v0.2.3 // indirect + github.com/ysmood/goob v0.4.0 // indirect + github.com/ysmood/got v0.34.1 // indirect + github.com/ysmood/gson v0.7.3 // indirect + github.com/ysmood/leakless v0.8.0 // indirect github.com/zalando/go-keyring v0.2.3 // indirect golang.org/x/crypto v0.19.0 // indirect golang.org/x/net v0.21.0 // indirect @@ -48,6 +48,8 @@ github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4 github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= github.com/go-ini/ini v1.67.0 h1:z6ZrTEZqSWOTyH2FlglNbNgARyHG8oLW9gMELqKr06A= github.com/go-ini/ini v1.67.0/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8= +github.com/go-rod/rod v0.114.7 h1:h4pimzSOUnw7Eo41zdJA788XsawzHjJMyzCE3BrBww0= +github.com/go-rod/rod v0.114.7/go.mod h1:aiedSEFg5DwG/fnNbUOTPMTTWX3MRj6vIs/a684Mthw= github.com/go-sourcemap/sourcemap v2.1.3+incompatible h1:W1iEw64niKVGogNgBN3ePyLFfuisuzeidWPMPWmECqU= github.com/go-sourcemap/sourcemap v2.1.3+incompatible/go.mod h1:F8jJfvm2KbVjc5NqelyYJmf/v5J0dwNLS2mL4sNA1Jg= github.com/go-sqlite/sqlite3 v0.0.0-20180313105335-53dd8e640ee7 h1:ow5vK9Q/DSKkxbEIJHBST6g+buBDwdaDIyk1dGGwpQo= @@ -108,6 +110,20 @@ github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= +github.com/ysmood/fetchup v0.2.3 h1:ulX+SonA0Vma5zUFXtv52Kzip/xe7aj4vqT5AJwQ+ZQ= +github.com/ysmood/fetchup v0.2.3/go.mod h1:xhibcRKziSvol0H1/pj33dnKrYyI2ebIvz5cOOkYGns= +github.com/ysmood/goob v0.4.0 h1:HsxXhyLBeGzWXnqVKtmT9qM7EuVs/XOgkX7T6r1o1AQ= +github.com/ysmood/goob v0.4.0/go.mod h1:u6yx7ZhS4Exf2MwciFr6nIM8knHQIE22lFpWHnfql18= +github.com/ysmood/gop v0.0.2 h1:VuWweTmXK+zedLqYufJdh3PlxDNBOfFHjIZlPT2T5nw= +github.com/ysmood/gop v0.0.2/go.mod h1:rr5z2z27oGEbyB787hpEcx4ab8cCiPnKxn0SUHt6xzk= +github.com/ysmood/got v0.34.1 h1:IrV2uWLs45VXNvZqhJ6g2nIhY+pgIG1CUoOcqfXFl1s= +github.com/ysmood/got v0.34.1/go.mod h1:yddyjq/PmAf08RMLSwDjPyCvHvYed+WjHnQxpH851LM= +github.com/ysmood/gotrace v0.6.0 h1:SyI1d4jclswLhg7SWTL6os3L1WOKeNn/ZtzVQF8QmdY= +github.com/ysmood/gotrace v0.6.0/go.mod h1:TzhIG7nHDry5//eYZDYcTzuJLYQIkykJzCRIo4/dzQM= +github.com/ysmood/gson v0.7.3 h1:QFkWbTH8MxyUTKPkVWAENJhxqdBa4lYTQWqZCiLG6kE= +github.com/ysmood/gson v0.7.3/go.mod h1:3Kzs5zDl21g5F/BlLTNcuAGAYLKt2lV5G8D1zF3RNmg= +github.com/ysmood/leakless v0.8.0 h1:BzLrVoiwxikpgEQR0Lk8NyBN5Cit2b1z+u0mgL4ZJak= +github.com/ysmood/leakless v0.8.0/go.mod h1:R8iAXPRaG97QJwqxs74RdwzcRHT1SWCGTNqY8q0JvMQ= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/zalando/go-keyring v0.2.3 h1:v9CUu9phlABObO4LPWycf+zwMG7nlbb3t/B5wa97yms= github.com/zalando/go-keyring v0.2.3/go.mod h1:HL4k+OXQfJUWaMnqyuSOc0drfGPX2b51Du6K+MRgZMk= @@ -95,6 +95,7 @@ var ( // Transport adapters must be loaded in a specific order. // All other modules can be loaded in any order. "proxy", + "browser", "retry", "ratelimit", "cache", diff --git a/modules/browser/browser.go b/modules/browser/browser.go new file mode 100644 index 0000000..5802d24 --- /dev/null +++ b/modules/browser/browser.go @@ -0,0 +1,135 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package browser + +import ( + "fmt" + "io" + "log" + "net/http" + "os" + "strings" + "sync" + "time" + + "github.com/go-rod/rod" + "github.com/go-rod/rod/lib/launcher" + "github.com/go-rod/rod/lib/proto" + "github.com/philippta/flyscrape" +) + +func init() { + flyscrape.RegisterModule(Module{}) +} + +type Module struct { + Browser bool `json:"browser"` + Headless *bool `json:"headless"` +} + +func (Module) ModuleInfo() flyscrape.ModuleInfo { + return flyscrape.ModuleInfo{ + ID: "browser", + New: func() flyscrape.Module { return new(Module) }, + } +} + +func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { + if !m.Browser { + return t + } + + headless := true + if m.Headless != nil { + headless = *m.Headless + } + + ct, err := chromeTransport(headless) + if err != nil { + log.Println(err) + os.Exit(1) + } + + return ct +} + +func chromeTransport(headless bool) (flyscrape.RoundTripFunc, error) { + serviceURL, err := launcher.New(). + Headless(headless). + Launch() + if err != nil { + return nil, fmt.Errorf("failed to launch browser: %w", err) + } + + browser := rod.New().ControlURL(serviceURL).NoDefaultDevice() + if err := browser.Connect(); err != nil { + return nil, fmt.Errorf("failed to connect to browser: %w", err) + } + + return func(r *http.Request) (*http.Response, error) { + select { + case <-r.Context().Done(): + return nil, r.Context().Err() + default: + } + + page := browser.MustPage() + defer page.Close() + + var once sync.Once + var networkResponse *proto.NetworkResponse + go page.EachEvent(func(e *proto.NetworkResponseReceived) { + if e.Type != proto.NetworkResourceTypeDocument { + return + } + once.Do(func() { + networkResponse = e.Response + }) + })() + + page = page.Context(r.Context()) + + for h := range r.Header { + if h == "User-Agent" && strings.HasPrefix(r.UserAgent(), "flyscrape") { + continue + } + page.MustSetExtraHeaders(h, r.Header.Get(h)) + } + + if err := page.Navigate(r.URL.String()); err != nil { + return nil, err + } + + if err := page.WaitStable(time.Second); err != nil { + return nil, err + } + + html, err := page.HTML() + if err != nil { + return nil, err + } + + resp := &http.Response{ + StatusCode: 200, + Status: "200 OK", + Body: io.NopCloser(strings.NewReader(html)), + Header: http.Header{"Content-Type": []string{"text/html"}}, + } + + if networkResponse != nil { + resp.StatusCode = networkResponse.Status + resp.Status = networkResponse.StatusText + resp.Header = http.Header{} + + for k, v := range networkResponse.Headers { + resp.Header.Set(k, v.String()) + } + } + + return resp, err + }, nil +} + +var _ flyscrape.TransportAdapter = Module{} diff --git a/modules/browser/browser_test.go b/modules/browser/browser_test.go new file mode 100644 index 0000000..f7fe22a --- /dev/null +++ b/modules/browser/browser_test.go @@ -0,0 +1,160 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package browser_test + +import ( + "fmt" + "net/http" + "net/http/httptest" + "testing" + + "github.com/philippta/flyscrape" + "github.com/philippta/flyscrape/modules/browser" + "github.com/philippta/flyscrape/modules/headers" + "github.com/philippta/flyscrape/modules/hook" + "github.com/philippta/flyscrape/modules/starturl" + "github.com/stretchr/testify/require" +) + +func TestBrowser(t *testing.T) { + var called bool + + srv := newServer(func(w http.ResponseWriter, r *http.Request) { + called = true + w.Write([]byte(`<h1>Hello Browser</h1><a href="foo">Foo</a>`)) + }) + defer srv.Close() + + var body string + + mods := []flyscrape.Module{ + &starturl.Module{URL: srv.URL}, + &browser.Module{Browser: true}, + &hook.Module{ + ReceiveResponseFn: func(r *flyscrape.Response) { + body = string(r.Body) + }, + }, + } + + scraper := flyscrape.NewScraper() + scraper.Modules = mods + scraper.Run() + + require.True(t, called) + require.Contains(t, body, "Hello Browser") +} +func TestBrowserStatusCode(t *testing.T) { + srv := newServer(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(404) + }) + defer srv.Close() + + var statusCode int + + mods := []flyscrape.Module{ + &starturl.Module{URL: srv.URL}, + &browser.Module{Browser: true}, + &hook.Module{ + ReceiveResponseFn: func(r *flyscrape.Response) { + statusCode = r.StatusCode + }, + }, + } + + scraper := flyscrape.NewScraper() + scraper.Modules = mods + scraper.Run() + + require.Equal(t, 404, statusCode) +} + +func TestBrowserRequestHeader(t *testing.T) { + srv := newServer(func(w http.ResponseWriter, r *http.Request) { + w.Write([]byte(r.Header.Get("User-Agent"))) + }) + defer srv.Close() + + var body string + + mods := []flyscrape.Module{ + &starturl.Module{URL: srv.URL}, + &browser.Module{Browser: true}, + &headers.Module{ + Headers: map[string]string{ + "User-Agent": "custom-headers", + }, + }, + &hook.Module{ + ReceiveResponseFn: func(r *flyscrape.Response) { + body = string(r.Body) + }, + }, + } + + scraper := flyscrape.NewScraper() + scraper.Modules = mods + scraper.Run() + + require.Contains(t, body, "custom-headers") +} + +func TestBrowserResponseHeader(t *testing.T) { + srv := newServer(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Foo", "bar") + }) + defer srv.Close() + + var header string + + mods := []flyscrape.Module{ + &starturl.Module{URL: srv.URL}, + &browser.Module{Browser: true}, + &hook.Module{ + ReceiveResponseFn: func(r *flyscrape.Response) { + header = r.Headers.Get("Foo") + }, + }, + } + + scraper := flyscrape.NewScraper() + scraper.Modules = mods + scraper.Run() + + require.Equal(t, header, "bar") +} + +func TestBrowserUnsetFlyscrapeUserAgent(t *testing.T) { + srv := newServer(func(w http.ResponseWriter, r *http.Request) { + w.Write([]byte(r.Header.Get("User-Agent"))) + }) + defer srv.Close() + + var body string + + mods := []flyscrape.Module{ + &starturl.Module{URL: srv.URL}, + &browser.Module{Browser: true}, + &hook.Module{ + ReceiveResponseFn: func(r *flyscrape.Response) { + body = string(r.Body) + }, + }, + } + + scraper := flyscrape.NewScraper() + scraper.Modules = mods + scraper.Run() + + fmt.Println(body) + require.Contains(t, body, "Mozilla/5.0") + require.NotContains(t, body, "flyscrape") +} + +func newServer(f func(http.ResponseWriter, *http.Request)) *httptest.Server { + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + f(w, r) + })) +} diff --git a/modules/cookies/cookies.go b/modules/cookies/cookies.go index 4be344d..2f57a3f 100644 --- a/modules/cookies/cookies.go +++ b/modules/cookies/cookies.go @@ -43,7 +43,7 @@ func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { var stores []kooky.CookieStore for _, store := range kooky.FindAllCookieStores() { - if store.Browser() == m.Cookies { + if store.Browser() == m.Cookies && store.IsDefaultProfile() { stores = append(stores, store) } } diff --git a/modules/ratelimit/ratelimit.go b/modules/ratelimit/ratelimit.go index 152c6fd..f68f8e9 100644 --- a/modules/ratelimit/ratelimit.go +++ b/modules/ratelimit/ratelimit.go @@ -17,8 +17,9 @@ func init() { } type Module struct { - Rate int `json:"rate"` - Concurrency int `json:"concurrency"` + Rate int `json:"rate"` + Concurrency int `json:"concurrency"` + Browser bool `json:"browser"` ticker *time.Ticker ratelimit chan struct{} @@ -46,6 +47,10 @@ func (m *Module) Provision(v flyscrape.Context) { }() } + if m.browserEnabled() && !m.concurrencyEnabled() { + m.Concurrency = 1 + } + if m.concurrencyEnabled() { m.concurrency = make(chan struct{}, m.Concurrency) for i := 0; i < m.Concurrency; i++ { @@ -83,6 +88,10 @@ func (m *Module) concurrencyEnabled() bool { return m.Concurrency > 0 } +func (m *Module) browserEnabled() bool { + return m.Browser +} + var ( _ flyscrape.TransportAdapter = (*Module)(nil) _ flyscrape.Provisioner = (*Module)(nil) diff --git a/template.js b/template.js index a7b4384..b466a4e 100644 --- a/template.js +++ b/template.js @@ -2,6 +2,12 @@ export const config = { // Specify the URL to start scraping from. url: "https://example.com/", + // Enable rendering with headless browser. (default = false) + // browser: true, + + // Specify if browser should be headless or not. (default = true) + // headless: false, + // Specify the multiple URLs to start scraping from. (default = []) // urls: [ // "https://anothersite.com/", @@ -33,9 +39,11 @@ export const config = { // concurrency: 1, // Specify a single HTTP(S) proxy URL. (default = no proxy) + // Note: Not compatible with browser mode. // proxy: "http://someproxy.com:8043", // Specify multiple HTTP(S) proxy URLs. (default = no proxy) + // Note: Not compatible with browser mode. // proxies: [ // "http://someproxy.com:8043", // "http://someotherproxy.com:8043", |