diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-11 18:31:20 +0200 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-11 18:31:20 +0200 |
| commit | 062b36fe5725d1267c66db2e506b4131d78ce772 (patch) | |
| tree | 998e5260feb1babac8dae512b56d67d8f20f7266 /js.go | |
| parent | 7e4cf39a0ba6ccbd5cc036700a8b1ff9358ecc3d (diff) | |
simplify project structure
Diffstat (limited to 'js.go')
| -rw-r--r-- | js.go | 107 |
1 files changed, 107 insertions, 0 deletions
@@ -0,0 +1,107 @@ +package flyscrape + +import ( + "encoding/json" + "errors" + "fmt" + "math/rand" + "strings" + "time" + + "flyscrape/js" + + "github.com/evanw/esbuild/pkg/api" + v8 "rogchap.com/v8go" +) + +func init() { + rand.Seed(time.Now().UnixNano()) +} + +func Compile(src string) (ScrapeOptions, ScrapeFunc, error) { + src, err := build(src) + if err != nil { + return ScrapeOptions{}, nil, err + } + return vm(src) +} + +func build(src string) (string, error) { + res := api.Transform(src, api.TransformOptions{ + Platform: api.PlatformBrowser, + Format: api.FormatIIFE, + }) + + var errs []error + for _, msg := range res.Errors { + errs = append(errs, fmt.Errorf("%s", msg.Text)) + } + if len(res.Errors) > 0 { + return "", errors.Join(errs...) + } + + return string(res.Code), nil +} + +func vm(src string) (ScrapeOptions, ScrapeFunc, error) { + ctx := v8.NewContext() + ctx.RunScript("var module = {}", "main.js") + + if _, err := ctx.RunScript(removeIIFE(js.Flyscrape), "main.js"); err != nil { + return ScrapeOptions{}, nil, fmt.Errorf("running flyscrape bundle: %w", err) + } + if _, err := ctx.RunScript(`const require = () => require_flyscrape();`, "main.js"); err != nil { + return ScrapeOptions{}, nil, fmt.Errorf("creating require function: %w", err) + } + if _, err := ctx.RunScript(removeIIFE(src), "main.js"); err != nil { + return ScrapeOptions{}, nil, fmt.Errorf("running user script: %w", err) + } + + var opts ScrapeOptions + + url, err := ctx.RunScript("options.url", "main.js") + if err != nil { + return ScrapeOptions{}, nil, fmt.Errorf("reading options.url: %w", err) + } + opts.URL = url.String() + + depth, err := ctx.RunScript("options.depth", "main.js") + if err != nil { + return ScrapeOptions{}, nil, fmt.Errorf("reading options.depth: %w", err) + } + opts.Depth = int(depth.Integer()) + + scrape := func(params ScrapeParams) (any, error) { + suffix := randSeq(10) + ctx.Global().Set("html_"+suffix, params.HTML) + ctx.Global().Set("url_"+suffix, params.URL) + data, err := ctx.RunScript(fmt.Sprintf(`JSON.stringify(stdin_default({html: html_%s, url: url_%s}))`, suffix, suffix), "main.js") + if err != nil { + return nil, err + } + + var obj any + if err := json.Unmarshal([]byte(data.String()), &obj); err != nil { + return nil, err + } + + return obj, nil + } + + return opts, scrape, nil +} + +func randSeq(n int) string { + letters := []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") + b := make([]rune, n) + for i := range b { + b[i] = letters[rand.Intn(len(letters))] + } + return string(b) +} + +func removeIIFE(s string) string { + s = strings.TrimPrefix(s, "(() => {\n") + s = strings.TrimSuffix(s, "})();\n") + return s +} |