| field | value | date |
|---|---|---|
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-11-01 21:20:40 +0100 |
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-11-01 21:20:40 +0100 |
| commit | aadd80b3b213988aa5701075f2650198e4066349 (patch) | |
| tree | 23d61848106682a0c72338c07c37d6b9f87a1b47 | |
| parent | 2d3cd6584dedce45ea709d1757a28ce7537f3472 (diff) | |
Add login functionality
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | examples/hackernews_login.js | 20 |
| -rw-r--r-- | flyscrape.go | 14 |
| -rw-r--r-- | js.go | 6 |
| -rw-r--r-- | js_lib.go | 71 |
| -rw-r--r-- | js_lib_test.go | 91 |
| -rw-r--r-- | scrape.go | 5 |
6 files changed, 201 insertions, 6 deletions
```diff
diff --git a/examples/hackernews_login.js b/examples/hackernews_login.js
new file mode 100644
index 0000000..371b8b7
--- /dev/null
+++ b/examples/hackernews_login.js
@@ -0,0 +1,20 @@
+import { submitForm } from "flyscrape"
+
+export const config = {
+  url: "https://news.ycombinator.com",
+}
+
+export function login() {
+  const formData = {
+    "acct": "my-username",
+    "pw": "my-password",
+  }
+
+  submitForm("https://news.ycombinator.com/login", formData)
+}
+
+export default function ({ doc }) {
+  return {
+    karma: doc.find("#karma").text()
+  }
+}
diff --git a/flyscrape.go b/flyscrape.go
index bb4ee30..81c9346 100644
--- a/flyscrape.go
+++ b/flyscrape.go
@@ -25,16 +25,17 @@ func Run(file string) error {
 
 	client := &http.Client{}
 
-	script, err := Compile(string(src), nil)
+	exports, err := Compile(string(src), NewJSLibrary(client))
 	if err != nil {
 		return fmt.Errorf("failed to compile script: %w", err)
 	}
 
 	scraper := NewScraper()
-	scraper.ScrapeFunc = script.Scrape
+	scraper.ScrapeFunc = exports.Scrape
+	scraper.LoginFunc = exports.Login
 	scraper.Script = file
 	scraper.Client = client
-	scraper.Modules = LoadModules(script.Config())
+	scraper.Modules = LoadModules(exports.Config())
 
 	scraper.Run()
 	return nil
@@ -53,18 +54,19 @@ func Dev(file string) error {
 	fn := func(s string) error {
 		client := &http.Client{}
 
-		script, err := Compile(s, nil)
+		exports, err := Compile(s, NewJSLibrary(client))
 		if err != nil {
 			printCompileErr(file, err)
 			return nil
 		}
 
-		cfg := script.Config()
+		cfg := exports.Config()
 		cfg = updateCfg(cfg, "depth", 0)
 		cfg = updateCfg(cfg, "cache", "file:"+cachefile)
 
 		scraper := NewScraper()
-		scraper.ScrapeFunc = script.Scrape
+		scraper.ScrapeFunc = exports.Scrape
+		scraper.LoginFunc = exports.Login
 		scraper.Script = file
 		scraper.Client = client
 		scraper.Modules = LoadModules(cfg)
diff --git a/js.go b/js.go
--- a/js.go
+++ b/js.go
@@ -55,6 +55,12 @@ func (e Exports) Scrape(p ScrapeParams) (any, error) {
 	return fn(p)
 }
 
+func (e Exports) Login() {
+	if fn, ok := e["login"].(func(goja.FunctionCall) goja.Value); ok {
+		fn(goja.FunctionCall{})
+	}
+}
+
 type Imports map[string]map[string]any
 
 func Compile(src string, imports Imports) (Exports, error) {
diff --git a/js_lib.go b/js_lib.go
new file mode 100644
index 0000000..1ba2a57
--- /dev/null
+++ b/js_lib.go
@@ -0,0 +1,71 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package flyscrape
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+	gourl "net/url"
+	"strings"
+)
+
+func NewJSLibrary(client *http.Client) Imports {
+	return Imports{
+		"flyscrape": map[string]any{
+			"fetchDocument": jsFetchDocument(client),
+			"submitForm":    jsSubmitForm(client),
+		},
+	}
+}
+
+func jsFetchDocument(client *http.Client) func(url string) map[string]any {
+	return func(url string) map[string]any {
+		resp, err := client.Get(url)
+		if err != nil {
+			return nil
+		}
+		defer resp.Body.Close()
+
+		var b strings.Builder
+		if _, err := io.Copy(&b, resp.Body); err != nil {
+			return nil
+		}
+
+		doc, err := DocumentFromString(b.String())
+		if err != nil {
+			return nil
+		}
+
+		return doc
+	}
+}
+
+func jsSubmitForm(client *http.Client) func(url string, data map[string]any) map[string]any {
+	return func(url string, data map[string]any) map[string]any {
+		form := gourl.Values{}
+		for k, v := range data {
+			form.Set(k, fmt.Sprintf("%v", v))
+		}
+
+		resp, err := client.PostForm(url, form)
+		if err != nil {
+			return nil
+		}
+		defer resp.Body.Close()
+
+		var b strings.Builder
+		if _, err := io.Copy(&b, resp.Body); err != nil {
+			return nil
+		}
+
+		doc, err := DocumentFromString(b.String())
+		if err != nil {
+			return nil
+		}
+
+		return doc
+	}
+}
diff --git a/js_lib_test.go b/js_lib_test.go
new file mode 100644
index 0000000..8d58c33
--- /dev/null
+++ b/js_lib_test.go
@@ -0,0 +1,91 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package flyscrape_test
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+	"net/http/cookiejar"
+	"net/url"
+	"strings"
+	"testing"
+
+	"github.com/philippta/flyscrape"
+	"github.com/stretchr/testify/require"
+)
+
+func TestJSLibFetchDocument(t *testing.T) {
+	script := `
+    import { fetchDocument } from "flyscrape"
+
+    const doc = fetchDocument("https://example.com")
+    export const headline = doc.find("h1").text()
+    `
+
+	client := &http.Client{
+		Transport: flyscrape.MockTransport(200, html),
+	}
+
+	exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+	require.NoError(t, err)
+
+	h, ok := exports["headline"].(string)
+	require.True(t, ok)
+	require.Equal(t, "headline", h)
+}
+
+func TestJSLibSubmitForm(t *testing.T) {
+	script := `
+    import { submitForm } from "flyscrape"
+
+    const doc = submitForm("https://example.com", {
+        "username": "foo",
+        "password": "bar",
+    })
+
+    export const text = doc.find("div").text()
+    `
+
+	var username, password string
+
+	jar, _ := cookiejar.New(nil)
+	client := &http.Client{
+		Jar: jar,
+		Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
+			username = r.FormValue("username")
+			password = r.FormValue("password")
+
+			resp := &http.Response{
+				StatusCode: 200,
+				Status:     fmt.Sprintf("%d %s", 200, http.StatusText(200)),
+				Body:       io.NopCloser(strings.NewReader(`<div>Login successful</div>`)),
+				Header:     http.Header{},
+			}
+
+			cookie := http.Cookie{
+				Name:   "example",
+				Value:  "Hello world!",
+				Path:   "/",
+				MaxAge: 3600,
+			}
+
+			resp.Header.Add("Set-Cookie", cookie.String())
+			return resp, nil
+		}),
+	}
+
+	exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+	require.NoError(t, err)
+
+	text, ok := exports["text"].(string)
+	require.True(t, ok)
+	require.Equal(t, "Login successful", text)
+	require.Equal(t, "foo", username)
+	require.Equal(t, "bar", password)
+
+	u, _ := url.Parse("https://example.com")
+	require.NotEmpty(t, jar.Cookies(u))
+}
diff --git a/scrape.go b/scrape.go
--- a/scrape.go
+++ b/scrape.go
@@ -53,6 +53,7 @@ func NewScraper() *Scraper {
 
 type Scraper struct {
 	ScrapeFunc ScrapeFunc
+	LoginFunc  func()
 	Script     string
 	Modules    []Module
 	Client     *http.Client
@@ -96,6 +97,10 @@ func (s *Scraper) Run() {
 		}
 	}
 
+	if s.LoginFunc != nil {
+		s.LoginFunc()
+	}
+
 	go s.scrape()
 	s.wg.Wait()
 	close(s.jobs)
```
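For orientation, here is a minimal sketch of how the new pieces fit together outside of `Run`/`Dev`: `NewJSLibrary` injects `fetchDocument` and `submitForm` into the script, `Compile` returns the script's exports, and `Scraper.Run` calls `LoginFunc` once before scraping starts (see the scrape.go hunk above). The explicit cookie jar and the hard-coded script path are illustrative assumptions, not part of this commit; the commit's own `Run` constructs a bare `http.Client`.

```go
// Sketch of the login flow introduced by this commit, mirroring the updated
// Run function. Cookie jar and script path are illustrative, not from the diff.
package main

import (
	"fmt"
	"net/http"
	"net/http/cookiejar"
	"os"

	"github.com/philippta/flyscrape"
)

func main() {
	src, err := os.ReadFile("examples/hackernews_login.js")
	if err != nil {
		panic(err)
	}

	// Sharing one client between submitForm and later scrape requests is what
	// lets a session cookie set during login() carry over; the jar makes that
	// explicit here (an assumption on top of the commit).
	jar, _ := cookiejar.New(nil)
	client := &http.Client{Jar: jar}

	// NewJSLibrary exposes fetchDocument/submitForm to the script; Compile
	// returns its exports (config, login, and the default scrape function).
	exports, err := flyscrape.Compile(string(src), flyscrape.NewJSLibrary(client))
	if err != nil {
		panic(fmt.Errorf("failed to compile script: %w", err))
	}

	scraper := flyscrape.NewScraper()
	scraper.ScrapeFunc = exports.Scrape
	scraper.LoginFunc = exports.Login // exported login() runs once before scraping
	scraper.Script = "examples/hackernews_login.js"
	scraper.Client = client
	scraper.Modules = flyscrape.LoadModules(exports.Config())
	scraper.Run()
}
```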