summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- examples/hackernews_login.js 20
-rw-r--r-- flyscrape.go 14
-rw-r--r-- js.go 6
-rw-r--r-- js_lib.go 71
-rw-r--r-- js_lib_test.go 91
-rw-r--r-- scrape.go 5
6 files changed, 201 insertions, 6 deletions
diff --git a/examples/hackernews_login.js b/examples/hackernews_login.js
new file mode 100644
index 0000000..371b8b7
--- /dev/null
+++ b/examples/hackernews_login.js
@@ -0,0 +1,20 @@
+import { submitForm } from "flyscrape"
+
+// Scraper configuration exported by this script.
+// NOTE(review): `url` is presumably the page the scraper starts from — confirm
+// against the flyscrape configuration reference.
+export const config = {
+ url: "https://news.ycombinator.com",
+}
+
+export function login() {
+ const formData = {
+ "acct": "my-username",
+ "pw": "my-password",
+ }
+
+ submitForm("https://news.ycombinator.com/login", formData)
+}
+
+export default function ({ doc }) {
+ return {
+ karma: doc.find("#karma").text()
+ }
+}
diff --git a/flyscrape.go b/flyscrape.go
index bb4ee30..81c9346 100644
--- a/flyscrape.go
+++ b/flyscrape.go
@@ -25,16 +25,17 @@ func Run(file string) error {
client := &http.Client{}
- script, err := Compile(string(src), nil)
+ exports, err := Compile(string(src), NewJSLibrary(client))
if err != nil {
return fmt.Errorf("failed to compile script: %w", err)
}
scraper := NewScraper()
- scraper.ScrapeFunc = script.Scrape
+ scraper.ScrapeFunc = exports.Scrape
+ scraper.LoginFunc = exports.Login
scraper.Script = file
scraper.Client = client
- scraper.Modules = LoadModules(script.Config())
+ scraper.Modules = LoadModules(exports.Config())
scraper.Run()
return nil
@@ -53,18 +54,19 @@ func Dev(file string) error {
fn := func(s string) error {
client := &http.Client{}
- script, err := Compile(s, nil)
+ exports, err := Compile(s, NewJSLibrary(client))
if err != nil {
printCompileErr(file, err)
return nil
}
- cfg := script.Config()
+ cfg := exports.Config()
cfg = updateCfg(cfg, "depth", 0)
cfg = updateCfg(cfg, "cache", "file:"+cachefile)
scraper := NewScraper()
- scraper.ScrapeFunc = script.Scrape
+ scraper.ScrapeFunc = exports.Scrape
+ scraper.LoginFunc = exports.Login
scraper.Script = file
scraper.Client = client
scraper.Modules = LoadModules(cfg)
diff --git a/js.go b/js.go
index 7b5630b..5170800 100644
--- a/js.go
+++ b/js.go
@@ -55,6 +55,12 @@ func (e Exports) Scrape(p ScrapeParams) (any, error) {
return fn(p)
}
+// Login calls the script's exported "login" function with an empty
+// goja.FunctionCall and discards its result. It does nothing when the script
+// has no "login" export or the export is not a native goja function.
+func (e Exports) Login() {
+	login, ok := e["login"].(func(goja.FunctionCall) goja.Value)
+	if !ok {
+		return
+	}
+	login(goja.FunctionCall{})
+}
+
type Imports map[string]map[string]any
func Compile(src string, imports Imports) (Exports, error) {
diff --git a/js_lib.go b/js_lib.go
new file mode 100644
index 0000000..1ba2a57
--- /dev/null
+++ b/js_lib.go
@@ -0,0 +1,71 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package flyscrape
+
+import (
+ "fmt"
+ "io"
+ "net/http"
+ gourl "net/url"
+ "strings"
+)
+
+// NewJSLibrary returns the Imports that make up the "flyscrape" module a
+// script can import from. The module provides fetchDocument and submitForm,
+// both of which issue their HTTP requests through client.
+func NewJSLibrary(client *http.Client) Imports {
+	lib := map[string]any{}
+	lib["fetchDocument"] = jsFetchDocument(client)
+	lib["submitForm"] = jsSubmitForm(client)
+
+	return Imports{"flyscrape": lib}
+}
+
+// jsFetchDocument builds the "fetchDocument" script function on top of client.
+//
+// The returned function issues a GET request for url, reads the whole
+// response body and parses it with DocumentFromString. If the request, the
+// read or the parse fails, it returns nil and the error is discarded.
+func jsFetchDocument(client *http.Client) func(url string) map[string]any {
+	fetch := func(url string) map[string]any {
+		res, err := client.Get(url)
+		if err != nil {
+			return nil
+		}
+		defer res.Body.Close()
+
+		var body strings.Builder
+		_, err = io.Copy(&body, res.Body)
+		if err != nil {
+			return nil
+		}
+
+		parsed, err := DocumentFromString(body.String())
+		if err != nil {
+			return nil
+		}
+		return parsed
+	}
+
+	return fetch
+}
+
+// jsSubmitForm builds the "submitForm" script function on top of client.
+//
+// The returned function encodes data as form fields, sends them to url with
+// client.PostForm (a POST with a URL-encoded body) and parses the response
+// body with DocumentFromString. If the request, the read or the parse fails,
+// it returns nil and the error is discarded.
+//
+// Field encoding: a nil value is sent as an empty string rather than the
+// literal "<nil>", a slice value is sent as one repeated field per element,
+// and any other value uses its default fmt formatting.
+func jsSubmitForm(client *http.Client) func(url string, data map[string]any) map[string]any {
+	// field renders a single scalar form value.
+	field := func(v any) string {
+		if v == nil {
+			return ""
+		}
+		return fmt.Sprint(v)
+	}
+
+	return func(url string, data map[string]any) map[string]any {
+		form := make(gourl.Values, len(data))
+		for k, v := range data {
+			// NOTE(review): assumes script arrays arrive here as []any —
+			// confirm against how Compile converts script values.
+			if items, ok := v.([]any); ok {
+				for _, item := range items {
+					form.Add(k, field(item))
+				}
+				continue
+			}
+			form.Set(k, field(v))
+		}
+
+		resp, err := client.PostForm(url, form)
+		if err != nil {
+			return nil
+		}
+		defer resp.Body.Close()
+
+		var b strings.Builder
+		if _, err := io.Copy(&b, resp.Body); err != nil {
+			return nil
+		}
+
+		doc, err := DocumentFromString(b.String())
+		if err != nil {
+			return nil
+		}
+
+		return doc
+	}
+}
diff --git a/js_lib_test.go b/js_lib_test.go
new file mode 100644
index 0000000..8d58c33
--- /dev/null
+++ b/js_lib_test.go
@@ -0,0 +1,91 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package flyscrape_test
+
+import (
+ "fmt"
+ "io"
+ "net/http"
+ "net/http/cookiejar"
+ "net/url"
+ "strings"
+ "testing"
+
+ "github.com/philippta/flyscrape"
+ "github.com/stretchr/testify/require"
+)
+
+func TestJSLibFetchDocument(t *testing.T) {
+ script := `
+ import { fetchDocument } from "flyscrape"
+
+ const doc = fetchDocument("https://example.com")
+ export const headline = doc.find("h1").text()
+ `
+
+ client := &http.Client{
+ Transport: flyscrape.MockTransport(200, html),
+ }
+
+ exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+ require.NoError(t, err)
+
+ h, ok := exports["headline"].(string)
+ require.True(t, ok)
+ require.Equal(t, "headline", h)
+}
+
+// TestJSLibSubmitForm compiles a script that calls submitForm at module level
+// and verifies three things: the form fields reach the transport, the
+// response document is parsed and usable from the script, and a cookie set by
+// the response ends up in the client's cookie jar.
+func TestJSLibSubmitForm(t *testing.T) {
+	script := `
+ import { submitForm } from "flyscrape"
+
+ const doc = submitForm("https://example.com", {
+ "username": "foo",
+ "password": "bar",
+ })
+
+ export const text = doc.find("div").text()
+ `
+
+	// Captured by the fake transport below so the submitted fields can be
+	// asserted after the script has run.
+	var username, password string
+
+	jar, err := cookiejar.New(nil)
+	require.NoError(t, err)
+
+	client := &http.Client{
+		Jar: jar,
+		Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
+			username = r.FormValue("username")
+			password = r.FormValue("password")
+
+			resp := &http.Response{
+				StatusCode: 200,
+				Status:     fmt.Sprintf("%d %s", 200, http.StatusText(200)),
+				Body:       io.NopCloser(strings.NewReader(`<div>Login successful</div>`)),
+				Header:     http.Header{},
+			}
+
+			cookie := http.Cookie{
+				Name:   "example",
+				Value:  "Hello world!",
+				Path:   "/",
+				MaxAge: 3600,
+			}
+
+			resp.Header.Add("Set-Cookie", cookie.String())
+			return resp, nil
+		}),
+	}
+
+	exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+	require.NoError(t, err)
+
+	text, ok := exports["text"].(string)
+	require.True(t, ok)
+	require.Equal(t, "Login successful", text)
+	require.Equal(t, "foo", username)
+	require.Equal(t, "bar", password)
+
+	u, err := url.Parse("https://example.com")
+	require.NoError(t, err)
+	require.NotEmpty(t, jar.Cookies(u))
+}
diff --git a/scrape.go b/scrape.go
index 764ef39..73908b8 100644
--- a/scrape.go
+++ b/scrape.go
@@ -53,6 +53,7 @@ func NewScraper() *Scraper {
type Scraper struct {
ScrapeFunc ScrapeFunc
+ LoginFunc func()
Script string
Modules []Module
Client *http.Client
@@ -96,6 +97,10 @@ func (s *Scraper) Run() {
}
}
+ if s.LoginFunc != nil {
+ s.LoginFunc()
+ }
+
go s.scrape()
s.wg.Wait()
close(s.jobs)