summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--examples/hackernews_login.js11
-rw-r--r--flyscrape.go4
-rw-r--r--js.go4
-rw-r--r--js_lib.go109
-rw-r--r--js_lib_test.go166
-rw-r--r--scrape.go6
-rw-r--r--utils.go1
7 files changed, 213 insertions, 88 deletions
diff --git a/examples/hackernews_login.js b/examples/hackernews_login.js
index 371b8b7..aea9442 100644
--- a/examples/hackernews_login.js
+++ b/examples/hackernews_login.js
@@ -1,16 +1,15 @@
-import { submitForm } from "flyscrape"
+import { parse } from "flyscrape"
+import http from "flyscrape/http"
export const config = {
url: "https://news.ycombinator.com",
}
-export function login() {
- const formData = {
+export function setup() {
+ http.postForm("https://news.ycombinator.com/login", {
"acct": "my-username",
"pw": "my-password",
- }
-
- submitForm("https://news.ycombinator.com/login", formData)
+ })
}
export default function ({ doc }) {
diff --git a/flyscrape.go b/flyscrape.go
index 81c9346..8e79c2a 100644
--- a/flyscrape.go
+++ b/flyscrape.go
@@ -32,7 +32,7 @@ func Run(file string) error {
scraper := NewScraper()
scraper.ScrapeFunc = exports.Scrape
- scraper.LoginFunc = exports.Login
+ scraper.SetupFunc = exports.Setup
scraper.Script = file
scraper.Client = client
scraper.Modules = LoadModules(exports.Config())
@@ -66,7 +66,7 @@ func Dev(file string) error {
scraper := NewScraper()
scraper.ScrapeFunc = exports.Scrape
- scraper.LoginFunc = exports.Login
+ scraper.SetupFunc = exports.Setup
scraper.Script = file
scraper.Client = client
scraper.Modules = LoadModules(cfg)
diff --git a/js.go b/js.go
index 5170800..a132c1c 100644
--- a/js.go
+++ b/js.go
@@ -55,8 +55,8 @@ func (e Exports) Scrape(p ScrapeParams) (any, error) {
return fn(p)
}
-func (e Exports) Login() {
- if fn, ok := e["login"].(func(goja.FunctionCall) goja.Value); ok {
+func (e Exports) Setup() {
+ if fn, ok := e["setup"].(func(goja.FunctionCall) goja.Value); ok {
fn(goja.FunctionCall{})
}
}
diff --git a/js_lib.go b/js_lib.go
index 1ba2a57..abfe07c 100644
--- a/js_lib.go
+++ b/js_lib.go
@@ -5,6 +5,8 @@
package flyscrape
import (
+ "bytes"
+ "encoding/json"
"fmt"
"io"
"net/http"
@@ -15,57 +17,104 @@ import (
func NewJSLibrary(client *http.Client) Imports {
return Imports{
"flyscrape": map[string]any{
- "fetchDocument": jsFetchDocument(client),
- "submitForm": jsSubmitForm(client),
+ "parse": jsParse(),
+ },
+ "flyscrape/http": map[string]any{
+ "get": jsHTTPGet(client),
+ "postForm": jsHTTPPostForm(client),
+ "postJSON": jsHTTPPostJSON(client),
},
}
}
-func jsFetchDocument(client *http.Client) func(url string) map[string]any {
- return func(url string) map[string]any {
- resp, err := client.Get(url)
+func jsParse() func(html string) map[string]any {
+ return func(html string) map[string]any {
+ doc, err := DocumentFromString(html)
if err != nil {
return nil
}
- defer resp.Body.Close()
-
- var b strings.Builder
- if _, err := io.Copy(&b, resp.Body); err != nil {
- return nil
- }
+ return doc
+ }
+}
- doc, err := DocumentFromString(b.String())
+func jsHTTPGet(client *http.Client) func(url string) map[string]any {
+ return func(url string) map[string]any {
+ req, err := http.NewRequest("GET", url, nil)
if err != nil {
- return nil
+ return map[string]any{"error": err.Error()}
}
-
- return doc
+ return jsFetch(client, req)
}
}
-func jsSubmitForm(client *http.Client) func(url string, data map[string]any) map[string]any {
- return func(url string, data map[string]any) map[string]any {
- form := gourl.Values{}
- for k, v := range data {
- form.Set(k, fmt.Sprintf("%v", v))
+func jsHTTPPostForm(client *http.Client) func(url string, form map[string]any) map[string]any {
+ return func(url string, form map[string]any) map[string]any {
+ vals := gourl.Values{}
+ for k, v := range form {
+ switch v := v.(type) {
+ case []any:
+ for _, v := range v {
+ vals.Add(k, fmt.Sprintf("%v", v))
+ }
+ default:
+ vals.Add(k, fmt.Sprintf("%v", v))
+ }
}
- resp, err := client.PostForm(url, form)
+ req, err := http.NewRequest("POST", url, strings.NewReader(vals.Encode()))
if err != nil {
- return nil
+ return map[string]any{"error": err.Error()}
}
- defer resp.Body.Close()
+ req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
- var b strings.Builder
- if _, err := io.Copy(&b, resp.Body); err != nil {
- return nil
- }
+ return jsFetch(client, req)
+ }
+}
+
+func jsHTTPPostJSON(client *http.Client) func(url string, data any) map[string]any {
+ return func(url string, data any) map[string]any {
+ b, _ := json.Marshal(data)
- doc, err := DocumentFromString(b.String())
+ req, err := http.NewRequest("POST", url, bytes.NewReader(b))
if err != nil {
- return nil
+ return map[string]any{"error": err.Error()}
}
+ req.Header.Set("Content-Type", "application/json")
- return doc
+ return jsFetch(client, req)
+ }
+}
+
+func jsFetch(client *http.Client, req *http.Request) (obj map[string]any) {
+ obj = map[string]any{
+ "body": "",
+ "status": 0,
+ "headers": map[string]any{},
+ "error": "",
+ }
+
+ resp, err := client.Do(req)
+ if err != nil {
+ obj["error"] = err.Error()
+ return
+ }
+ defer resp.Body.Close()
+
+ obj["status"] = resp.StatusCode
+
+ b, err := io.ReadAll(resp.Body)
+ if err != nil {
+ obj["error"] = err.Error()
+ return
}
+
+ obj["body"] = string(b)
+
+ headers := map[string]any{}
+ for name := range resp.Header {
+ headers[name] = resp.Header.Get(name)
+ }
+ obj["headers"] = headers
+
+ return
}
diff --git a/js_lib_test.go b/js_lib_test.go
index 8d58c33..3682813 100644
--- a/js_lib_test.go
+++ b/js_lib_test.go
@@ -5,24 +5,20 @@
package flyscrape_test
import (
- "fmt"
- "io"
+ "encoding/json"
"net/http"
- "net/http/cookiejar"
- "net/url"
- "strings"
"testing"
"github.com/philippta/flyscrape"
"github.com/stretchr/testify/require"
)
-func TestJSLibFetchDocument(t *testing.T) {
+func TestJSLibParse(t *testing.T) {
script := `
- import { fetchDocument } from "flyscrape"
+ import { parse } from "flyscrape"
- const doc = fetchDocument("https://example.com")
- export const headline = doc.find("h1").text()
+ const doc = parse('<div class=foo>Hello world</div>')
+ export const text = doc.find(".foo").text()
`
client := &http.Client{
@@ -32,60 +28,140 @@ func TestJSLibFetchDocument(t *testing.T) {
exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
require.NoError(t, err)
- h, ok := exports["headline"].(string)
+ h, ok := exports["text"].(string)
require.True(t, ok)
- require.Equal(t, "headline", h)
+ require.Equal(t, "Hello world", h)
}
-func TestJSLibSubmitForm(t *testing.T) {
+func TestJSLibHTTPGet(t *testing.T) {
script := `
- import { submitForm } from "flyscrape"
+ import http from "flyscrape/http"
- const doc = submitForm("https://example.com", {
- "username": "foo",
- "password": "bar",
+ const res = http.get("https://example.com")
+
+ export const body = res.body;
+ export const status = res.status;
+ export const error = res.error;
+ export const headers = res.headers;
+ `
+
+ client := &http.Client{
+ Transport: flyscrape.MockTransport(200, html),
+ }
+
+ exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+ require.NoError(t, err)
+
+ body, ok := exports["body"].(string)
+ require.True(t, ok)
+ require.Equal(t, html, body)
+
+ status, ok := exports["status"].(int64)
+ require.True(t, ok)
+ require.Equal(t, int64(200), status)
+
+ error, ok := exports["error"].(string)
+ require.True(t, ok)
+ require.Equal(t, "", error)
+
+ headers, ok := exports["headers"].(map[string]any)
+ require.True(t, ok)
+ require.NotEmpty(t, headers)
+}
+
+func TestJSLibHTTPPostForm(t *testing.T) {
+ script := `
+ import http from "flyscrape/http"
+
+ const res = http.postForm("https://example.com", {
+ username: "foo",
+ password: "bar",
+ arr: [1,2,3],
})
- export const text = doc.find("div").text()
+ export const body = res.body;
+ export const status = res.status;
+ export const error = res.error;
+ export const headers = res.headers;
`
- var username, password string
+ client := &http.Client{
+ Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
+ require.Equal(t, "POST", r.Method)
+ require.Equal(t, "application/x-www-form-urlencoded", r.Header.Get("Content-Type"))
+ require.Equal(t, "foo", r.FormValue("username"))
+ require.Equal(t, "bar", r.FormValue("password"))
+ require.Len(t, r.Form["arr"], 3)
+
+ return flyscrape.MockResponse(400, "Bad Request")
+ }),
+ }
+
+ exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+ require.NoError(t, err)
+
+ body, ok := exports["body"].(string)
+ require.True(t, ok)
+ require.Equal(t, "Bad Request", body)
+
+ status, ok := exports["status"].(int64)
+ require.True(t, ok)
+ require.Equal(t, int64(400), status)
+
+ error, ok := exports["error"].(string)
+ require.True(t, ok)
+ require.Equal(t, "", error)
+
+ headers, ok := exports["headers"].(map[string]any)
+ require.True(t, ok)
+ require.NotEmpty(t, headers)
+}
+
+func TestJSLibHTTPPostJSON(t *testing.T) {
+ script := `
+ import http from "flyscrape/http"
+
+ const res = http.postJSON("https://example.com", {
+ username: "foo",
+ password: "bar",
+ })
+
+ export const body = res.body;
+ export const status = res.status;
+ export const error = res.error;
+ export const headers = res.headers;
+ `
- jar, _ := cookiejar.New(nil)
client := &http.Client{
- Jar: jar,
Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
- username = r.FormValue("username")
- password = r.FormValue("password")
-
- resp := &http.Response{
- StatusCode: 200,
- Status: fmt.Sprintf("%d %s", 200, http.StatusText(200)),
- Body: io.NopCloser(strings.NewReader(`<div>Login successful</div>`)),
- Header: http.Header{},
- }
-
- cookie := http.Cookie{
- Name: "example",
- Value: "Hello world!",
- Path: "/",
- MaxAge: 3600,
- }
-
- resp.Header.Add("Set-Cookie", cookie.String())
- return resp, nil
+ require.Equal(t, "POST", r.Method)
+ require.Equal(t, "application/json", r.Header.Get("Content-Type"))
+
+ m := map[string]any{}
+ json.NewDecoder(r.Body).Decode(&m)
+ require.Equal(t, "foo", m["username"])
+ require.Equal(t, "bar", m["password"])
+
+ return flyscrape.MockResponse(400, "Bad Request")
}),
}
exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
require.NoError(t, err)
- text, ok := exports["text"].(string)
+ body, ok := exports["body"].(string)
+ require.True(t, ok)
+ require.Equal(t, "Bad Request", body)
+
+ status, ok := exports["status"].(int64)
require.True(t, ok)
- require.Equal(t, "Login successful", text)
- require.Equal(t, "foo", username)
- require.Equal(t, "bar", password)
+ require.Equal(t, int64(400), status)
- u, _ := url.Parse("https://example.com")
- require.NotEmpty(t, jar.Cookies(u))
+ error, ok := exports["error"].(string)
+ require.True(t, ok)
+ require.Equal(t, "", error)
+
+ headers, ok := exports["headers"].(map[string]any)
+ require.True(t, ok)
+ require.NotEmpty(t, headers)
}
diff --git a/scrape.go b/scrape.go
index 73908b8..4de36d8 100644
--- a/scrape.go
+++ b/scrape.go
@@ -53,7 +53,7 @@ func NewScraper() *Scraper {
type Scraper struct {
ScrapeFunc ScrapeFunc
- LoginFunc func()
+ SetupFunc func()
Script string
Modules []Module
Client *http.Client
@@ -97,8 +97,8 @@ func (s *Scraper) Run() {
}
}
- if s.LoginFunc != nil {
- s.LoginFunc()
+ if s.SetupFunc != nil {
+ s.SetupFunc()
}
go s.scrape()
diff --git a/utils.go b/utils.go
index 161cff8..861ee38 100644
--- a/utils.go
+++ b/utils.go
@@ -39,5 +39,6 @@ func MockResponse(statusCode int, html string) (*http.Response, error) {
StatusCode: statusCode,
Status: fmt.Sprintf("%d %s", statusCode, http.StatusText(statusCode)),
Body: io.NopCloser(strings.NewReader(html)),
+ Header: http.Header{"Content-Type": []string{"text/html"}},
}, nil
}