diff options
| -rw-r--r-- | examples/hackernews_login.js | 11 | ||||
| -rw-r--r-- | flyscrape.go | 4 | ||||
| -rw-r--r-- | js.go | 4 | ||||
| -rw-r--r-- | js_lib.go | 109 | ||||
| -rw-r--r-- | js_lib_test.go | 166 | ||||
| -rw-r--r-- | scrape.go | 6 | ||||
| -rw-r--r-- | utils.go | 1 |
7 files changed, 213 insertions, 88 deletions
diff --git a/examples/hackernews_login.js b/examples/hackernews_login.js index 371b8b7..aea9442 100644 --- a/examples/hackernews_login.js +++ b/examples/hackernews_login.js @@ -1,16 +1,15 @@ -import { submitForm } from "flyscrape" +import { parse } from "flyscrape" +import http from "flyscrape/http" export const config = { url: "https://news.ycombinator.com", } -export function login() { - const formData = { +export function setup() { + http.postForm("https://news.ycombinator.com/login", { "acct": "my-username", "pw": "my-password", - } - - submitForm("https://news.ycombinator.com/login", formData) + }) } export default function ({ doc }) { diff --git a/flyscrape.go b/flyscrape.go index 81c9346..8e79c2a 100644 --- a/flyscrape.go +++ b/flyscrape.go @@ -32,7 +32,7 @@ func Run(file string) error { scraper := NewScraper() scraper.ScrapeFunc = exports.Scrape - scraper.LoginFunc = exports.Login + scraper.SetupFunc = exports.Setup scraper.Script = file scraper.Client = client scraper.Modules = LoadModules(exports.Config()) @@ -66,7 +66,7 @@ func Dev(file string) error { scraper := NewScraper() scraper.ScrapeFunc = exports.Scrape - scraper.LoginFunc = exports.Login + scraper.SetupFunc = exports.Setup scraper.Script = file scraper.Client = client scraper.Modules = LoadModules(cfg) @@ -55,8 +55,8 @@ func (e Exports) Scrape(p ScrapeParams) (any, error) { return fn(p) } -func (e Exports) Login() { - if fn, ok := e["login"].(func(goja.FunctionCall) goja.Value); ok { +func (e Exports) Setup() { + if fn, ok := e["setup"].(func(goja.FunctionCall) goja.Value); ok { fn(goja.FunctionCall{}) } } @@ -5,6 +5,8 @@ package flyscrape import ( + "bytes" + "encoding/json" "fmt" "io" "net/http" @@ -15,57 +17,104 @@ import ( func NewJSLibrary(client *http.Client) Imports { return Imports{ "flyscrape": map[string]any{ - "fetchDocument": jsFetchDocument(client), - "submitForm": jsSubmitForm(client), + "parse": jsParse(), + }, + "flyscrape/http": map[string]any{ + "get": jsHTTPGet(client), + "postForm": jsHTTPPostForm(client), + "postJSON": jsHTTPPostJSON(client), }, } } -func jsFetchDocument(client *http.Client) func(url string) map[string]any { - return func(url string) map[string]any { - resp, err := client.Get(url) +func jsParse() func(html string) map[string]any { + return func(html string) map[string]any { + doc, err := DocumentFromString(html) if err != nil { return nil } - defer resp.Body.Close() - - var b strings.Builder - if _, err := io.Copy(&b, resp.Body); err != nil { - return nil - } + return doc + } +} - doc, err := DocumentFromString(b.String()) +func jsHTTPGet(client *http.Client) func(url string) map[string]any { + return func(url string) map[string]any { + req, err := http.NewRequest("GET", url, nil) if err != nil { - return nil + return map[string]any{"error": err.Error()} } - - return doc + return jsFetch(client, req) } } -func jsSubmitForm(client *http.Client) func(url string, data map[string]any) map[string]any { - return func(url string, data map[string]any) map[string]any { - form := gourl.Values{} - for k, v := range data { - form.Set(k, fmt.Sprintf("%v", v)) +func jsHTTPPostForm(client *http.Client) func(url string, form map[string]any) map[string]any { + return func(url string, form map[string]any) map[string]any { + vals := gourl.Values{} + for k, v := range form { + switch v := v.(type) { + case []any: + for _, v := range v { + vals.Add(k, fmt.Sprintf("%v", v)) + } + default: + vals.Add(k, fmt.Sprintf("%v", v)) + } } - resp, err := client.PostForm(url, form) + req, err := http.NewRequest("POST", url, strings.NewReader(vals.Encode())) if err != nil { - return nil + return map[string]any{"error": err.Error()} } - defer resp.Body.Close() + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") - var b strings.Builder - if _, err := io.Copy(&b, resp.Body); err != nil { - return nil - } + return jsFetch(client, req) + } +} + +func jsHTTPPostJSON(client *http.Client) func(url string, data any) map[string]any { + return func(url string, data any) map[string]any { + b, _ := json.Marshal(data) - doc, err := DocumentFromString(b.String()) + req, err := http.NewRequest("POST", url, bytes.NewReader(b)) if err != nil { - return nil + return map[string]any{"error": err.Error()} } + req.Header.Set("Content-Type", "application/json") - return doc + return jsFetch(client, req) + } +} + +func jsFetch(client *http.Client, req *http.Request) (obj map[string]any) { + obj = map[string]any{ + "body": "", + "status": 0, + "headers": map[string]any{}, + "error": "", + } + + resp, err := client.Do(req) + if err != nil { + obj["error"] = err.Error() + return + } + defer resp.Body.Close() + + obj["status"] = resp.StatusCode + + b, err := io.ReadAll(resp.Body) + if err != nil { + obj["error"] = err.Error() + return } + + obj["body"] = string(b) + + headers := map[string]any{} + for name := range resp.Header { + headers[name] = resp.Header.Get(name) + } + obj["headers"] = headers + + return } diff --git a/js_lib_test.go b/js_lib_test.go index 8d58c33..3682813 100644 --- a/js_lib_test.go +++ b/js_lib_test.go @@ -5,24 +5,20 @@ package flyscrape_test import ( - "fmt" - "io" + "encoding/json" "net/http" - "net/http/cookiejar" - "net/url" - "strings" "testing" "github.com/philippta/flyscrape" "github.com/stretchr/testify/require" ) -func TestJSLibFetchDocument(t *testing.T) { +func TestJSLibParse(t *testing.T) { script := ` - import { fetchDocument } from "flyscrape" + import { parse } from "flyscrape" - const doc = fetchDocument("https://example.com") - export const headline = doc.find("h1").text() + const doc = parse('<div class=foo>Hello world</div>') + export const text = doc.find(".foo").text() ` client := &http.Client{ @@ -32,60 +28,140 @@ func TestJSLibFetchDocument(t *testing.T) { exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client)) require.NoError(t, err) - h, ok := exports["headline"].(string) + h, ok := exports["text"].(string) require.True(t, ok) - require.Equal(t, "headline", h) + require.Equal(t, "Hello world", h) } -func TestJSLibSubmitForm(t *testing.T) { +func TestJSLibHTTPGet(t *testing.T) { script := ` - import { submitForm } from "flyscrape" + import http from "flyscrape/http" - const doc = submitForm("https://example.com", { - "username": "foo", - "password": "bar", + const res = http.get("https://example.com") + + export const body = res.body; + export const status = res.status; + export const error = res.error; + export const headers = res.headers; + ` + + client := &http.Client{ + Transport: flyscrape.MockTransport(200, html), + } + + exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client)) + require.NoError(t, err) + + body, ok := exports["body"].(string) + require.True(t, ok) + require.Equal(t, html, body) + + status, ok := exports["status"].(int64) + require.True(t, ok) + require.Equal(t, int64(200), status) + + error, ok := exports["error"].(string) + require.True(t, ok) + require.Equal(t, "", error) + + headers, ok := exports["headers"].(map[string]any) + require.True(t, ok) + require.NotEmpty(t, headers) +} + +func TestJSLibHTTPPostForm(t *testing.T) { + script := ` + import http from "flyscrape/http" + + const res = http.postForm("https://example.com", { + username: "foo", + password: "bar", + arr: [1,2,3], }) - export const text = doc.find("div").text() + export const body = res.body; + export const status = res.status; + export const error = res.error; + export const headers = res.headers; ` - var username, password string + client := &http.Client{ + Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { + require.Equal(t, "POST", r.Method) + require.Equal(t, "application/x-www-form-urlencoded", r.Header.Get("Content-Type")) + require.Equal(t, "foo", r.FormValue("username")) + require.Equal(t, "bar", r.FormValue("password")) + require.Len(t, r.Form["arr"], 3) + + return flyscrape.MockResponse(400, "Bad Request") + }), + } + + exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client)) + require.NoError(t, err) + + body, ok := exports["body"].(string) + require.True(t, ok) + require.Equal(t, "Bad Request", body) + + status, ok := exports["status"].(int64) + require.True(t, ok) + require.Equal(t, int64(400), status) + + error, ok := exports["error"].(string) + require.True(t, ok) + require.Equal(t, "", error) + + headers, ok := exports["headers"].(map[string]any) + require.True(t, ok) + require.NotEmpty(t, headers) +} + +func TestJSLibHTTPPostJSON(t *testing.T) { + script := ` + import http from "flyscrape/http" + + const res = http.postJSON("https://example.com", { + username: "foo", + password: "bar", + }) + + export const body = res.body; + export const status = res.status; + export const error = res.error; + export const headers = res.headers; + ` - jar, _ := cookiejar.New(nil) client := &http.Client{ - Jar: jar, Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { - username = r.FormValue("username") - password = r.FormValue("password") - - resp := &http.Response{ - StatusCode: 200, - Status: fmt.Sprintf("%d %s", 200, http.StatusText(200)), - Body: io.NopCloser(strings.NewReader(`<div>Login successful</div>`)), - Header: http.Header{}, - } - - cookie := http.Cookie{ - Name: "example", - Value: "Hello world!", - Path: "/", - MaxAge: 3600, - } - - resp.Header.Add("Set-Cookie", cookie.String()) - return resp, nil + require.Equal(t, "POST", r.Method) + require.Equal(t, "application/json", r.Header.Get("Content-Type")) + + m := map[string]any{} + json.NewDecoder(r.Body).Decode(&m) + require.Equal(t, "foo", m["username"]) + require.Equal(t, "bar", m["password"]) + + return flyscrape.MockResponse(400, "Bad Request") }), } exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client)) require.NoError(t, err) - text, ok := exports["text"].(string) + body, ok := exports["body"].(string) + require.True(t, ok) + require.Equal(t, "Bad Request", body) + + status, ok := exports["status"].(int64) require.True(t, ok) - require.Equal(t, "Login successful", text) - require.Equal(t, "foo", username) - require.Equal(t, "bar", password) + require.Equal(t, int64(400), status) - u, _ := url.Parse("https://example.com") - require.NotEmpty(t, jar.Cookies(u)) + error, ok := exports["error"].(string) + require.True(t, ok) + require.Equal(t, "", error) + + headers, ok := exports["headers"].(map[string]any) + require.True(t, ok) + require.NotEmpty(t, headers) } @@ -53,7 +53,7 @@ func NewScraper() *Scraper { type Scraper struct { ScrapeFunc ScrapeFunc - LoginFunc func() + SetupFunc func() Script string Modules []Module Client *http.Client @@ -97,8 +97,8 @@ func (s *Scraper) Run() { } } - if s.LoginFunc != nil { - s.LoginFunc() + if s.SetupFunc != nil { + s.SetupFunc() } go s.scrape() @@ -39,5 +39,6 @@ func MockResponse(statusCode int, html string) (*http.Response, error) { StatusCode: statusCode, Status: fmt.Sprintf("%d %s", statusCode, http.StatusText(statusCode)), Body: io.NopCloser(strings.NewReader(html)), + Header: http.Header{"Content-Type": []string{"text/html"}}, }, nil } |