// path: js.go (blob e97245347ea3c261f009827b47b93bf429c71a6a)

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package flyscrape

import (
	"encoding/json"
	"errors"
	"fmt"
	"math/rand"
	"os"
	"strings"

	"github.com/evanw/esbuild/pkg/api"
	"github.com/philippta/flyscrape/js"
	"go.kuoruan.net/v8go-polyfills/console"
	v8 "rogchap.com/v8go"
)

// Config holds the script's `config` value as the JSON text produced by
// JSON.stringify inside the JavaScript VM.
type Config []byte

// ScrapeParams carries the inputs for a single ScrapeFunc call.
type ScrapeParams struct {
	// HTML is exposed to the script as the `html` property of its argument.
	HTML string
	// URL is exposed to the script as the `url` property of its argument.
	URL  string
}

// ScrapeFunc is the signature of the scrape function returned by Compile.
// That function invokes the script's default export with the given
// parameters and returns its result decoded from JSON.
type ScrapeFunc func(ScrapeParams) (any, error)

// TransformError describes one error reported while transforming the script
// source. Line and Column stay zero when the reported message carries no
// location.
type TransformError struct {
	Line, Column int
	Text         string
}

// Error renders the error as "line:column: text".
func (e TransformError) Error() string {
	const layout = "%d:%d: %s"
	return fmt.Sprintf(layout, e.Line, e.Column, e.Text)
}

// Compile transforms the given script source and evaluates the result in a
// JavaScript VM. It returns the script's configuration and a function that
// runs the script's scrape logic. A transform failure is returned as-is
// without starting the VM.
func Compile(src string) (Config, ScrapeFunc, error) {
	bundled, buildErr := build(src)
	if buildErr != nil {
		return nil, nil, buildErr
	}

	cfg, scrape, err := vm(bundled)
	return cfg, scrape, err
}

// build runs the script source through esbuild's Transform API, targeting the
// browser platform with IIFE output, and returns the generated code.
//
// If esbuild reports any errors, each one is converted to a TransformError
// (with line and column when a location is attached) and all of them are
// returned joined into a single error.
func build(src string) (string, error) {
	opts := api.TransformOptions{
		Platform: api.PlatformBrowser,
		Format:   api.FormatIIFE,
	}
	result := api.Transform(src, opts)

	// Nothing went wrong: hand back the generated code.
	if len(result.Errors) == 0 {
		return string(result.Code), nil
	}

	var failures []error
	for _, m := range result.Errors {
		te := TransformError{Text: m.Text}
		if loc := m.Location; loc != nil {
			te.Line, te.Column = loc.Line, loc.Column
		}
		failures = append(failures, te)
	}
	return "", errors.Join(failures...)
}

// vm evaluates the bundled user script inside a fresh V8 context and returns
// the script's JSON-encoded config together with a ScrapeFunc bound to that
// context.
//
// The flyscrape bundle and the user script are both run with their IIFE
// wrapper removed (see removeIIFE) — presumably so that their top-level
// declarations (require_flyscrape, config, stdin_default) remain visible to
// the scripts executed afterwards.
//
// The returned ScrapeFunc shares the single V8 context created here.
func vm(src string) (Config, ScrapeFunc, error) {
	ctx := v8.NewContext()

	if err := console.InjectTo(ctx, console.WithOutput(os.Stderr)); err != nil {
		return nil, nil, fmt.Errorf("injecting console: %w", err)
	}

	if _, err := ctx.RunScript("var module = {}", "main.js"); err != nil {
		return nil, nil, fmt.Errorf("creating module object: %w", err)
	}
	if _, err := ctx.RunScript(removeIIFE(js.Flyscrape), "main.js"); err != nil {
		return nil, nil, fmt.Errorf("running flyscrape bundle: %w", err)
	}
	if _, err := ctx.RunScript(`const require = () => require_flyscrape();`, "main.js"); err != nil {
		return nil, nil, fmt.Errorf("creating require function: %w", err)
	}
	if _, err := ctx.RunScript(removeIIFE(src), "main.js"); err != nil {
		return nil, nil, fmt.Errorf("running user script: %w", err)
	}

	cfg, err := ctx.RunScript("JSON.stringify(config)", "main.js")
	if err != nil {
		return nil, nil, fmt.Errorf("reading config: %w", err)
	}
	if !cfg.IsString() {
		return nil, nil, errors.New("config is not a string")
	}

	scrape := func(params ScrapeParams) (any, error) {
		// The inputs are handed over through uniquely named globals instead
		// of being spliced into the script text, so they need no escaping.
		suffix := randSeq(16)
		htmlKey, urlKey := "html_"+suffix, "url_"+suffix

		global := ctx.Global()
		if err := global.Set(htmlKey, params.HTML); err != nil {
			return nil, fmt.Errorf("setting %s: %w", htmlKey, err)
		}
		// Drop the temporary global once the call is done; otherwise every
		// scrape would leave another copy of the page in the context.
		defer global.Delete(htmlKey)

		if err := global.Set(urlKey, params.URL); err != nil {
			return nil, fmt.Errorf("setting %s: %w", urlKey, err)
		}
		defer global.Delete(urlKey)

		script := fmt.Sprintf(`JSON.stringify(stdin_default({html: %s, url: %s}))`, htmlKey, urlKey)
		data, err := ctx.RunScript(script, "main.js")
		if err != nil {
			// Returned unwrapped so callers see the same error value as before.
			return nil, err
		}
		// JSON.stringify yields a non-string (undefined) when the scrape
		// function returns nothing serializable; report that directly rather
		// than as a JSON syntax error on the text "undefined".
		if !data.IsString() {
			return nil, errors.New("scrape function returned no JSON-serializable value")
		}

		var obj any
		if err := json.Unmarshal([]byte(data.String()), &obj); err != nil {
			return nil, err
		}

		return obj, nil
	}

	return Config(cfg.String()), scrape, nil
}

// randSeq returns a string of n characters, each drawn from the ASCII letters
// a-z and A-Z using the math/rand package-level generator.
func randSeq(n int) string {
	const alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

	out := make([]byte, n)
	for i := 0; i < n; i++ {
		out[i] = alphabet[rand.Intn(len(alphabet))]
	}
	return string(out)
}

// removeIIFE strips the arrow-function IIFE wrapper from s: the leading
// "(() => {\n" and the trailing "})();\n". Each part is removed only when it
// matches exactly; anything else is returned untouched.
func removeIIFE(s string) string {
	const (
		head = "(() => {\n"
		tail = "})();\n"
	)
	return strings.TrimSuffix(strings.TrimPrefix(s, head), tail)
}