summary refs log tree commit diff
diff options
context:
space:
mode:
author Philipp Tanlak <philipp.tanlak@gmail.com> 2024-02-24 13:59:29 +0100
committer Philipp Tanlak <philipp.tanlak@gmail.com> 2024-02-24 13:59:29 +0100
commit 6cc19d5c412b8adc89092702d4bc21b416fc4fae (patch)
tree 3142b426395a409647a519270e31145fa9000f65
parent 3dc39e9eba495b18dad2a7d79d45dcb634729dd7 (diff)
Browser rendering
-rw-r--r-- README.md 57
-rw-r--r-- cmd/flyscrape/main.go 1
-rw-r--r-- examples/browser.js 19
-rw-r--r-- go.mod 6
-rw-r--r-- go.sum 16
-rw-r--r-- module.go 1
-rw-r--r-- modules/browser/browser.go 135
-rw-r--r-- modules/browser/browser_test.go 160
-rw-r--r-- modules/cookies/cookies.go 2
-rw-r--r-- modules/ratelimit/ratelimit.go 13
-rw-r--r-- template.js 8
11 files changed, 379 insertions, 39 deletions
diff --git a/README.md b/README.md
index f7b4948..d4df30e 100644
--- a/README.md
+++ b/README.md
@@ -24,13 +24,11 @@
## Features
-- **Highly Configurable:** 13 options to fine-tune your scraper.
-- **Standalone:** flyscrape comes as a single binary executable.
+- **Standalone:** Flyscrape comes as a single binary executable.
+- **jQuery-like:** Extract data from HTML pages with a familiar API.
- **Scriptable:** Use JavaScript to write your data extraction logic.
-- **Simple API:** Extract data from HTML pages with a familiar API.
-- **Fast Iteration:** Use the development mode to get quick feedback.
-- **Request Caching:** Re-run scripts on websites you already scraped.
-- **Zero Dependencies:** No need to fill up your disk with npm packages.
+- **Tons of features:** 20 features to configure the scraping behavior.
+- **Browser Mode:** Render JavaScript-heavy pages using a headless browser.
## Overview
@@ -60,8 +58,16 @@ export const config = {
"https://news.ycombinator.com/show",
"https://news.ycombinator.com/ask",
],
- depth: 5,
+
+ // Cache requests for later.
cache: "file",
+
+ // Enable JavaScript rendering.
+ browser: true,
+ headless: false,
+
+ // Follow pagination 5 times.
+ depth: 5,
follow: ["a.morelink[href]"],
}
@@ -181,6 +187,12 @@ export const config = {
"https://yetanother.com/",
],
+ // Enable rendering with a headless browser. (default = false)
+ browser: true,
+
+ // Specify if browser should be headless or not. (default = true)
+ headless: false,
+
// Specify how deep links should be followed. (default = 0, no follow)
depth: 5,
@@ -206,9 +218,11 @@ export const config = {
concurrency: 1,
// Specify a single HTTP(S) proxy URL. (default = no proxy)
+ // Note: Not compatible with browser mode.
proxy: "http://someproxy.com:8043",
// Specify multiple HTTP(S) proxy URLs. (default = no proxy)
+ // Note: Not compatible with browser mode.
proxies: [
"http://someproxy.com:8043",
"http://someotherproxy.com:8043",
@@ -288,35 +302,6 @@ const doc = parse(`<div class="foo">bar</div>`);
const text = doc.find(".foo").text();
```
-### Basic HTTP Requests
-
-```javascript
-import http from "flyscrape/http";
-
-const response = http.get("https://example.com")
-
-const response = http.postForm("https://example.com", {
- "username": "foo",
- "password": "bar",
-})
-
-const response = http.postJSON("https://example.com", {
- "username": "foo",
- "password": "bar",
-})
-
-// Contents of response
-{
- body: "<html>...</html>",
- status: 200,
- headers: {
- "Content-Type": "text/html",
- // ...
- },
- error": "",
-}
-```
-
### File Downloads
```javascript
diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go
index aff7ec4..7419f6e 100644
--- a/cmd/flyscrape/main.go
+++ b/cmd/flyscrape/main.go
@@ -11,6 +11,7 @@ import (
"os"
"github.com/philippta/flyscrape/cmd"
+ _ "github.com/philippta/flyscrape/modules/browser"
_ "github.com/philippta/flyscrape/modules/cache"
_ "github.com/philippta/flyscrape/modules/cookies"
_ "github.com/philippta/flyscrape/modules/depth"
diff --git a/examples/browser.js b/examples/browser.js
new file mode 100644
index 0000000..de26bd5
--- /dev/null
+++ b/examples/browser.js
@@ -0,0 +1,19 @@
+export const config = {
+ url: "https://www.airbnb.com/",
+ browser: true,
+ // headless: false,
+};
+
+export default function ({ doc, absoluteURL }) {
+ const rooms = doc.find("[itemprop=itemListElement]");
+
+ return {
+ listings: rooms.map(room => {
+ const link = "https://" + room.find("meta[itemprop=url]").attr("content");
+ const image = room.find("img").attr("src");
+ const desc = new Set(room.find("[role=group] > div > div > div").map(d => d.text()).filter(Boolean));
+
+ return { link, image, desc }
+ }),
+ }
+}
diff --git a/go.mod b/go.mod
index 3899dd2..4a2048c 100644
--- a/go.mod
+++ b/go.mod
@@ -10,6 +10,7 @@ require (
github.com/dop251/goja_nodejs v0.0.0-20230914102007-198ba9a8b098
github.com/evanw/esbuild v0.18.14
github.com/fsnotify/fsnotify v1.6.0
+ github.com/go-rod/rod v0.114.7
github.com/inancgumus/screen v0.0.0-20190314163918-06e984b86ed3
github.com/mattn/go-sqlite3 v1.14.17
github.com/nlnwa/whatwg-url v0.4.0
@@ -40,6 +41,11 @@ require (
github.com/tidwall/gjson v1.17.0 // indirect
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.1 // indirect
+ github.com/ysmood/fetchup v0.2.3 // indirect
+ github.com/ysmood/goob v0.4.0 // indirect
+ github.com/ysmood/got v0.34.1 // indirect
+ github.com/ysmood/gson v0.7.3 // indirect
+ github.com/ysmood/leakless v0.8.0 // indirect
github.com/zalando/go-keyring v0.2.3 // indirect
golang.org/x/crypto v0.19.0 // indirect
golang.org/x/net v0.21.0 // indirect
diff --git a/go.sum b/go.sum
index f6e61e6..19875e2 100644
--- a/go.sum
+++ b/go.sum
@@ -48,6 +48,8 @@ github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4
github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw=
github.com/go-ini/ini v1.67.0 h1:z6ZrTEZqSWOTyH2FlglNbNgARyHG8oLW9gMELqKr06A=
github.com/go-ini/ini v1.67.0/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8=
+github.com/go-rod/rod v0.114.7 h1:h4pimzSOUnw7Eo41zdJA788XsawzHjJMyzCE3BrBww0=
+github.com/go-rod/rod v0.114.7/go.mod h1:aiedSEFg5DwG/fnNbUOTPMTTWX3MRj6vIs/a684Mthw=
github.com/go-sourcemap/sourcemap v2.1.3+incompatible h1:W1iEw64niKVGogNgBN3ePyLFfuisuzeidWPMPWmECqU=
github.com/go-sourcemap/sourcemap v2.1.3+incompatible/go.mod h1:F8jJfvm2KbVjc5NqelyYJmf/v5J0dwNLS2mL4sNA1Jg=
github.com/go-sqlite/sqlite3 v0.0.0-20180313105335-53dd8e640ee7 h1:ow5vK9Q/DSKkxbEIJHBST6g+buBDwdaDIyk1dGGwpQo=
@@ -108,6 +110,20 @@ github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY=
github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28=
+github.com/ysmood/fetchup v0.2.3 h1:ulX+SonA0Vma5zUFXtv52Kzip/xe7aj4vqT5AJwQ+ZQ=
+github.com/ysmood/fetchup v0.2.3/go.mod h1:xhibcRKziSvol0H1/pj33dnKrYyI2ebIvz5cOOkYGns=
+github.com/ysmood/goob v0.4.0 h1:HsxXhyLBeGzWXnqVKtmT9qM7EuVs/XOgkX7T6r1o1AQ=
+github.com/ysmood/goob v0.4.0/go.mod h1:u6yx7ZhS4Exf2MwciFr6nIM8knHQIE22lFpWHnfql18=
+github.com/ysmood/gop v0.0.2 h1:VuWweTmXK+zedLqYufJdh3PlxDNBOfFHjIZlPT2T5nw=
+github.com/ysmood/gop v0.0.2/go.mod h1:rr5z2z27oGEbyB787hpEcx4ab8cCiPnKxn0SUHt6xzk=
+github.com/ysmood/got v0.34.1 h1:IrV2uWLs45VXNvZqhJ6g2nIhY+pgIG1CUoOcqfXFl1s=
+github.com/ysmood/got v0.34.1/go.mod h1:yddyjq/PmAf08RMLSwDjPyCvHvYed+WjHnQxpH851LM=
+github.com/ysmood/gotrace v0.6.0 h1:SyI1d4jclswLhg7SWTL6os3L1WOKeNn/ZtzVQF8QmdY=
+github.com/ysmood/gotrace v0.6.0/go.mod h1:TzhIG7nHDry5//eYZDYcTzuJLYQIkykJzCRIo4/dzQM=
+github.com/ysmood/gson v0.7.3 h1:QFkWbTH8MxyUTKPkVWAENJhxqdBa4lYTQWqZCiLG6kE=
+github.com/ysmood/gson v0.7.3/go.mod h1:3Kzs5zDl21g5F/BlLTNcuAGAYLKt2lV5G8D1zF3RNmg=
+github.com/ysmood/leakless v0.8.0 h1:BzLrVoiwxikpgEQR0Lk8NyBN5Cit2b1z+u0mgL4ZJak=
+github.com/ysmood/leakless v0.8.0/go.mod h1:R8iAXPRaG97QJwqxs74RdwzcRHT1SWCGTNqY8q0JvMQ=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/zalando/go-keyring v0.2.3 h1:v9CUu9phlABObO4LPWycf+zwMG7nlbb3t/B5wa97yms=
github.com/zalando/go-keyring v0.2.3/go.mod h1:HL4k+OXQfJUWaMnqyuSOc0drfGPX2b51Du6K+MRgZMk=
diff --git a/module.go b/module.go
index 7091309..47ccd31 100644
--- a/module.go
+++ b/module.go
@@ -95,6 +95,7 @@ var (
// Transport adapters must be loaded in a specific order.
// All other modules can be loaded in any order.
"proxy",
+ "browser",
"retry",
"ratelimit",
"cache",
diff --git a/modules/browser/browser.go b/modules/browser/browser.go
new file mode 100644
index 0000000..5802d24
--- /dev/null
+++ b/modules/browser/browser.go
@@ -0,0 +1,135 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package browser
+
+import (
+ "fmt"
+ "io"
+ "log"
+ "net/http"
+ "os"
+ "strings"
+ "sync"
+ "time"
+
+ "github.com/go-rod/rod"
+ "github.com/go-rod/rod/lib/launcher"
+ "github.com/go-rod/rod/lib/proto"
+ "github.com/philippta/flyscrape"
+)
+
+func init() {
+ flyscrape.RegisterModule(Module{})
+}
+
+type Module struct {
+ Browser bool `json:"browser"`
+ Headless *bool `json:"headless"`
+}
+
+func (Module) ModuleInfo() flyscrape.ModuleInfo {
+ return flyscrape.ModuleInfo{
+ ID: "browser",
+ New: func() flyscrape.Module { return new(Module) },
+ }
+}
+
+func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
+ if !m.Browser {
+ return t
+ }
+
+ headless := true
+ if m.Headless != nil {
+ headless = *m.Headless
+ }
+
+ ct, err := chromeTransport(headless)
+ if err != nil {
+ log.Println(err)
+ os.Exit(1)
+ }
+
+ return ct
+}
+
+func chromeTransport(headless bool) (flyscrape.RoundTripFunc, error) {
+ serviceURL, err := launcher.New().
+ Headless(headless).
+ Launch()
+ if err != nil {
+ return nil, fmt.Errorf("failed to launch browser: %w", err)
+ }
+
+ browser := rod.New().ControlURL(serviceURL).NoDefaultDevice()
+ if err := browser.Connect(); err != nil {
+ return nil, fmt.Errorf("failed to connect to browser: %w", err)
+ }
+
+ return func(r *http.Request) (*http.Response, error) {
+ select {
+ case <-r.Context().Done():
+ return nil, r.Context().Err()
+ default:
+ }
+
+ page := browser.MustPage()
+ defer page.Close()
+
+ var once sync.Once
+ var networkResponse *proto.NetworkResponse
+ go page.EachEvent(func(e *proto.NetworkResponseReceived) {
+ if e.Type != proto.NetworkResourceTypeDocument {
+ return
+ }
+ once.Do(func() {
+ networkResponse = e.Response
+ })
+ })()
+
+ page = page.Context(r.Context())
+
+ for h := range r.Header {
+ if h == "User-Agent" && strings.HasPrefix(r.UserAgent(), "flyscrape") {
+ continue
+ }
+ page.MustSetExtraHeaders(h, r.Header.Get(h))
+ }
+
+ if err := page.Navigate(r.URL.String()); err != nil {
+ return nil, err
+ }
+
+ if err := page.WaitStable(time.Second); err != nil {
+ return nil, err
+ }
+
+ html, err := page.HTML()
+ if err != nil {
+ return nil, err
+ }
+
+ resp := &http.Response{
+ StatusCode: 200,
+ Status: "200 OK",
+ Body: io.NopCloser(strings.NewReader(html)),
+ Header: http.Header{"Content-Type": []string{"text/html"}},
+ }
+
+ if networkResponse != nil {
+ resp.StatusCode = networkResponse.Status
+ resp.Status = networkResponse.StatusText
+ resp.Header = http.Header{}
+
+ for k, v := range networkResponse.Headers {
+ resp.Header.Set(k, v.String())
+ }
+ }
+
+ return resp, err
+ }, nil
+}
+
+var _ flyscrape.TransportAdapter = Module{}
diff --git a/modules/browser/browser_test.go b/modules/browser/browser_test.go
new file mode 100644
index 0000000..f7fe22a
--- /dev/null
+++ b/modules/browser/browser_test.go
@@ -0,0 +1,160 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package browser_test
+
+import (
+ "fmt"
+ "net/http"
+ "net/http/httptest"
+ "testing"
+
+ "github.com/philippta/flyscrape"
+ "github.com/philippta/flyscrape/modules/browser"
+ "github.com/philippta/flyscrape/modules/headers"
+ "github.com/philippta/flyscrape/modules/hook"
+ "github.com/philippta/flyscrape/modules/starturl"
+ "github.com/stretchr/testify/require"
+)
+
+func TestBrowser(t *testing.T) {
+ var called bool
+
+ srv := newServer(func(w http.ResponseWriter, r *http.Request) {
+ called = true
+ w.Write([]byte(`<h1>Hello Browser</h1><a href="foo">Foo</a>`))
+ })
+ defer srv.Close()
+
+ var body string
+
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: srv.URL},
+ &browser.Module{Browser: true},
+ &hook.Module{
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ body = string(r.Body)
+ },
+ },
+ }
+
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
+ scraper.Run()
+
+ require.True(t, called)
+ require.Contains(t, body, "Hello Browser")
+}
+func TestBrowserStatusCode(t *testing.T) {
+ srv := newServer(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(404)
+ })
+ defer srv.Close()
+
+ var statusCode int
+
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: srv.URL},
+ &browser.Module{Browser: true},
+ &hook.Module{
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ statusCode = r.StatusCode
+ },
+ },
+ }
+
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
+ scraper.Run()
+
+ require.Equal(t, 404, statusCode)
+}
+
+func TestBrowserRequestHeader(t *testing.T) {
+ srv := newServer(func(w http.ResponseWriter, r *http.Request) {
+ w.Write([]byte(r.Header.Get("User-Agent")))
+ })
+ defer srv.Close()
+
+ var body string
+
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: srv.URL},
+ &browser.Module{Browser: true},
+ &headers.Module{
+ Headers: map[string]string{
+ "User-Agent": "custom-headers",
+ },
+ },
+ &hook.Module{
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ body = string(r.Body)
+ },
+ },
+ }
+
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
+ scraper.Run()
+
+ require.Contains(t, body, "custom-headers")
+}
+
+func TestBrowserResponseHeader(t *testing.T) {
+ srv := newServer(func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Foo", "bar")
+ })
+ defer srv.Close()
+
+ var header string
+
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: srv.URL},
+ &browser.Module{Browser: true},
+ &hook.Module{
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ header = r.Headers.Get("Foo")
+ },
+ },
+ }
+
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
+ scraper.Run()
+
+ require.Equal(t, header, "bar")
+}
+
+func TestBrowserUnsetFlyscrapeUserAgent(t *testing.T) {
+ srv := newServer(func(w http.ResponseWriter, r *http.Request) {
+ w.Write([]byte(r.Header.Get("User-Agent")))
+ })
+ defer srv.Close()
+
+ var body string
+
+ mods := []flyscrape.Module{
+ &starturl.Module{URL: srv.URL},
+ &browser.Module{Browser: true},
+ &hook.Module{
+ ReceiveResponseFn: func(r *flyscrape.Response) {
+ body = string(r.Body)
+ },
+ },
+ }
+
+ scraper := flyscrape.NewScraper()
+ scraper.Modules = mods
+ scraper.Run()
+
+ fmt.Println(body)
+ require.Contains(t, body, "Mozilla/5.0")
+ require.NotContains(t, body, "flyscrape")
+}
+
+func newServer(f func(http.ResponseWriter, *http.Request)) *httptest.Server {
+ return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ f(w, r)
+ }))
+}
diff --git a/modules/cookies/cookies.go b/modules/cookies/cookies.go
index 4be344d..2f57a3f 100644
--- a/modules/cookies/cookies.go
+++ b/modules/cookies/cookies.go
@@ -43,7 +43,7 @@ func (m Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
var stores []kooky.CookieStore
for _, store := range kooky.FindAllCookieStores() {
- if store.Browser() == m.Cookies {
+ if store.Browser() == m.Cookies && store.IsDefaultProfile() {
stores = append(stores, store)
}
}
diff --git a/modules/ratelimit/ratelimit.go b/modules/ratelimit/ratelimit.go
index 152c6fd..f68f8e9 100644
--- a/modules/ratelimit/ratelimit.go
+++ b/modules/ratelimit/ratelimit.go
@@ -17,8 +17,9 @@ func init() {
}
type Module struct {
- Rate int `json:"rate"`
- Concurrency int `json:"concurrency"`
+ Rate int `json:"rate"`
+ Concurrency int `json:"concurrency"`
+ Browser bool `json:"browser"`
ticker *time.Ticker
ratelimit chan struct{}
@@ -46,6 +47,10 @@ func (m *Module) Provision(v flyscrape.Context) {
}()
}
+ if m.browserEnabled() && !m.concurrencyEnabled() {
+ m.Concurrency = 1
+ }
+
if m.concurrencyEnabled() {
m.concurrency = make(chan struct{}, m.Concurrency)
for i := 0; i < m.Concurrency; i++ {
@@ -83,6 +88,10 @@ func (m *Module) concurrencyEnabled() bool {
return m.Concurrency > 0
}
+func (m *Module) browserEnabled() bool {
+ return m.Browser
+}
+
var (
_ flyscrape.TransportAdapter = (*Module)(nil)
_ flyscrape.Provisioner = (*Module)(nil)
diff --git a/template.js b/template.js
index a7b4384..b466a4e 100644
--- a/template.js
+++ b/template.js
@@ -2,6 +2,12 @@ export const config = {
// Specify the URL to start scraping from.
url: "https://example.com/",
+ // Enable rendering with a headless browser. (default = false)
+ // browser: true,
+
+ // Specify if browser should be headless or not. (default = true)
+ // headless: false,
+
// Specify the multiple URLs to start scraping from. (default = [])
// urls: [
// "https://anothersite.com/",
@@ -33,9 +39,11 @@ export const config = {
// concurrency: 1,
// Specify a single HTTP(S) proxy URL. (default = no proxy)
+ // Note: Not compatible with browser mode.
// proxy: "http://someproxy.com:8043",
// Specify multiple HTTP(S) proxy URLs. (default = no proxy)
+ // Note: Not compatible with browser mode.
// proxies: [
// "http://someproxy.com:8043",
// "http://someotherproxy.com:8043",