1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
|
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package flyscrape
import (
"encoding/json"
"errors"
"fmt"
"math/rand"
"os"
"strings"
"github.com/evanw/esbuild/pkg/api"
"github.com/philippta/flyscrape/js"
"go.kuoruan.net/v8go-polyfills/console"
v8 "rogchap.com/v8go"
)
// Config holds the scraping configuration of a compiled script as raw bytes.
// vm fills it with the output of JSON.stringify(config) evaluated in the
// script's context, so the contents are JSON text.
type Config []byte
// ScrapeParams carries the inputs handed to a ScrapeFunc for a single call.
type ScrapeParams struct {
// HTML is exposed to the script as the `html` property of the object
// passed to its default export.
HTML string
// URL is exposed to the script as the `url` property of the object
// passed to its default export.
URL string
}
// ScrapeFunc runs a compiled script against the given parameters. The
// implementation returned by vm calls the script's default export and returns
// its result after a JSON round trip (JSON.stringify in the script context,
// json.Unmarshal into an `any` on the Go side).
type ScrapeFunc func(ScrapeParams) (any, error)
// TransformError describes one error reported while transforming a script,
// together with the position it was reported at.
type TransformError struct {
	// Line and Column locate the error. build leaves both at zero when the
	// underlying message carries no location.
	Line, Column int
	// Text is the error message itself.
	Text string
}

// Error implements the error interface and renders the error in the form
// "line:column: text".
func (e TransformError) Error() string {
	position := fmt.Sprintf("%d:%d", e.Line, e.Column)
	return position + ": " + e.Text
}
// Compile turns the source of a scraping script into its configuration and a
// function that executes the scraper.
//
// The source is first transformed by build and the transformed code is then
// evaluated by vm. A failure in the transform step is returned as-is with a
// nil Config and a nil ScrapeFunc; otherwise the results of vm are returned
// unchanged.
func Compile(src string) (Config, ScrapeFunc, error) {
	bundle, err := build(src)
	if err != nil {
		return nil, nil, err
	}

	return vm(bundle)
}
// build runs src through esbuild's Transform API, targeting the browser
// platform and the IIFE output format, and returns the generated code.
//
// When esbuild reports errors, every message is converted into a
// TransformError (with line and column filled in when a location is
// available) and the joined errors are returned with an empty string.
func build(src string) (string, error) {
	opts := api.TransformOptions{
		Platform: api.PlatformBrowser,
		Format:   api.FormatIIFE,
	}
	result := api.Transform(src, opts)

	// Happy path: nothing was reported, hand back the generated code.
	if len(result.Errors) == 0 {
		return string(result.Code), nil
	}

	problems := make([]error, 0, len(result.Errors))
	for _, m := range result.Errors {
		te := TransformError{Text: m.Text}
		if loc := m.Location; loc != nil {
			te.Line, te.Column = loc.Line, loc.Column
		}
		problems = append(problems, te)
	}
	return "", errors.Join(problems...)
}
// vm evaluates the transformed script src in a new V8 context and returns the
// script's configuration together with a function that runs its scraper.
//
// Setup happens in this order, all inside the same context:
//  1. a console writing to os.Stderr is injected,
//  2. a global `module` object is created,
//  3. the bundled flyscrape library is evaluated with its IIFE wrapper
//     removed, so that its top-level declarations become globals,
//  4. a `require` function resolving to require_flyscrape() is defined,
//  5. the user script is evaluated, again without its IIFE wrapper.
//
// Afterwards the script's `config` value is serialized with JSON.stringify
// and returned as Config. Any failing step aborts with a wrapped error.
//
// The returned ScrapeFunc keeps using the same context on every call.
func vm(src string) (Config, ScrapeFunc, error) {
	ctx := v8.NewContext()

	if err := console.InjectTo(ctx, console.WithOutput(os.Stderr)); err != nil {
		return nil, nil, fmt.Errorf("injecting console: %w", err)
	}
	// Previously the result of this call was dropped; report a failure
	// instead of continuing without the `module` global.
	if _, err := ctx.RunScript("var module = {}", "main.js"); err != nil {
		return nil, nil, fmt.Errorf("creating module object: %w", err)
	}
	if _, err := ctx.RunScript(removeIIFE(js.Flyscrape), "main.js"); err != nil {
		return nil, nil, fmt.Errorf("running flyscrape bundle: %w", err)
	}
	if _, err := ctx.RunScript(`const require = () => require_flyscrape();`, "main.js"); err != nil {
		return nil, nil, fmt.Errorf("creating require function: %w", err)
	}
	if _, err := ctx.RunScript(removeIIFE(src), "main.js"); err != nil {
		return nil, nil, fmt.Errorf("running user script: %w", err)
	}

	cfg, err := ctx.RunScript("JSON.stringify(config)", "main.js")
	if err != nil {
		return nil, nil, fmt.Errorf("reading config: %w", err)
	}
	// JSON.stringify yields undefined (not a string) for values it cannot
	// serialize, e.g. when `config` is undefined.
	if !cfg.IsString() {
		return nil, nil, errors.New("config is not a string")
	}

	scrape := func(params ScrapeParams) (any, error) {
		// The inputs are passed through uniquely named globals rather than
		// being spliced into the script text, which avoids any escaping.
		suffix := randSeq(16)
		htmlKey, urlKey := "html_"+suffix, "url_"+suffix

		global := ctx.Global()
		// Drop the per-call globals once the call is over. Without this,
		// the HTML of every scraped page stays referenced by the context.
		defer global.Delete(htmlKey)
		defer global.Delete(urlKey)

		if err := global.Set(htmlKey, params.HTML); err != nil {
			return nil, fmt.Errorf("passing html to script: %w", err)
		}
		if err := global.Set(urlKey, params.URL); err != nil {
			return nil, fmt.Errorf("passing url to script: %w", err)
		}

		script := fmt.Sprintf(`JSON.stringify(stdin_default({html: %s, url: %s}))`, htmlKey, urlKey)
		data, err := ctx.RunScript(script, "main.js")
		if err != nil {
			// Returned unwrapped so callers keep receiving the original
			// script error value.
			return nil, err
		}

		var obj any
		if err := json.Unmarshal([]byte(data.String()), &obj); err != nil {
			return nil, err
		}
		return obj, nil
	}

	return Config(cfg.String()), scrape, nil
}
// randSeq returns a string of n letters (a-z, A-Z), each drawn independently
// from the math/rand default source. It returns the empty string when n is
// zero or negative.
//
// The result is not cryptographically secure; vm only uses it to derive
// unique names for temporary globals.
func randSeq(n int) string {
	// The alphabet is pure ASCII, so it can be indexed byte-wise directly
	// without converting it to a rune slice on every call.
	const letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

	// make panics on a negative length; treat it like an empty request.
	if n <= 0 {
		return ""
	}

	b := make([]byte, n)
	for i := range b {
		b[i] = letters[rand.Intn(len(letters))]
	}
	return string(b)
}
// removeIIFE strips the wrapper of an immediately-invoked arrow function
// from s: a leading "(() => {\n" and a trailing "})();\n". Each part is
// removed only if it is present exactly as written; everything else is
// returned untouched.
func removeIIFE(s string) string {
	const (
		header = "(() => {\n"
		footer = "})();\n"
	)
	return strings.TrimSuffix(strings.TrimPrefix(s, header), footer)
}
|