summaryrefslogblamecommitdiff
path: root/js.go
blob: 649fb29965431548b42a094469bd84b8c4d6fcab (plain) (tree)
1
2
3
4
5
6
7
8



                                                                      
                 

        
                 


                       


                 
                 

                     
 



                                                           
                                          

 


                         
                  
 






                                               









                                                                       
                                                      
                              
                       
                                    
         


                      

                                                       

                                              



                                        





                                                        




                                               
                                    

 
                                                 
                        
 

                                       
 
                              
 
                                                                
                                                                           

         
                                                          
                       
                                                                      
         
 































                                                                                                                                                                                          
                               
                                       

                 
                           
                                                                                   
                                       

                 
                               
         
 
                                                

 
























                                                                                    
         
                
 





                                               
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package flyscrape

import (
	_ "embed"
	"encoding/json"
	"errors"
	"fmt"
	"log"
	"net/url"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"

	"github.com/PuerkitoBio/goquery"
	"github.com/dop251/goja"
	gojaconsole "github.com/dop251/goja_nodejs/console"
	"github.com/dop251/goja_nodejs/require"
	"github.com/evanw/esbuild/pkg/api"
)

//go:embed template.js
var ScriptTemplate []byte

type Config []byte

type ScrapeParams struct {
	HTML string
	URL  string
}

type ScrapeFunc func(ScrapeParams) (any, error)

type TransformError struct {
	Line   int
	Column int
	Text   string
}

func (err TransformError) Error() string {
	return fmt.Sprintf("%d:%d: %s", err.Line, err.Column, err.Text)
}

func Compile(src string) (Config, ScrapeFunc, error) {
	src, err := build(src)
	if err != nil {
		return nil, nil, err
	}
	return vm(src)
}

func build(src string) (string, error) {
	res := api.Transform(src, api.TransformOptions{
		Platform: api.PlatformBrowser,
		Format:   api.FormatIIFE,
	})

	var errs []error
	for _, msg := range res.Errors {
		err := TransformError{Text: msg.Text}
		if msg.Location != nil {
			err.Line = msg.Location.Line
			err.Column = msg.Location.Column
		}
		errs = append(errs, err)
	}
	if len(res.Errors) > 0 {
		return "", errors.Join(errs...)
	}

	return string(res.Code), nil
}

func vm(src string) (Config, ScrapeFunc, error) {
	vm := goja.New()

	registry := &require.Registry{}
	registry.Enable(vm)

	gojaconsole.Enable(vm)

	if _, err := vm.RunString(removeIIFE(src)); err != nil {
		return nil, nil, fmt.Errorf("running user script: %w", err)
	}

	cfg, err := vm.RunString("JSON.stringify(config)")
	if err != nil {
		return nil, nil, fmt.Errorf("reading config: %w", err)
	}

	var c atomic.Uint64
	var lock sync.Mutex

	scrape := func(p ScrapeParams) (any, error) {
		lock.Lock()
		defer lock.Unlock()

		doc, err := goquery.NewDocumentFromReader(strings.NewReader(p.HTML))
		if err != nil {
			return nil, err
		}

		baseurl, err := url.Parse(p.URL)
		if err != nil {
			log.Println(err)
			return nil, err
		}

		suffix := strconv.FormatUint(c.Add(1), 10)
		vm.Set("html_"+suffix, p.HTML)
		vm.Set("url_"+suffix, p.URL)
		vm.Set("doc_"+suffix, wrap(vm, doc.Selection))
		vm.Set("absurl_"+suffix, func(ref string) string {
			abs, err := baseurl.Parse(ref)
			if err != nil {
				log.Println(err)
				return ref
			}
			return abs.String()
		})

		data, err := vm.RunString(fmt.Sprintf(`JSON.stringify(stdin_default({html: html_%s, doc: doc_%s, url: url_%s, absoluteURL: absurl_%s}))`, suffix, suffix, suffix, suffix))
		if err != nil {
			return nil, err
		}

		var obj any
		if err := json.Unmarshal([]byte(data.String()), &obj); err != nil {
			return nil, err
		}

		return obj, nil
	}

	return Config(cfg.String()), scrape, nil
}

func wrap(vm *goja.Runtime, sel *goquery.Selection) map[string]any {
	o := map[string]any{}
	o["text"] = sel.Text
	o["html"] = func() string { h, _ := goquery.OuterHtml(sel); return h }
	o["attr"] = func(name string) string { v, _ := sel.Attr(name); return v }
	o["hasAttr"] = func(name string) bool { _, ok := sel.Attr(name); return ok }
	o["hasClass"] = sel.HasClass
	o["length"] = sel.Length()
	o["first"] = func() map[string]any { return wrap(vm, sel.First()) }
	o["last"] = func() map[string]any { return wrap(vm, sel.Last()) }
	o["get"] = func(index int) map[string]any { return wrap(vm, sel.Eq(index)) }
	o["index"] = sel.Index()
	o["find"] = func(s string) map[string]any { return wrap(vm, sel.Find(s)) }
	o["next"] = func() map[string]any { return wrap(vm, sel.Next()) }
	o["prev"] = func() map[string]any { return wrap(vm, sel.Prev()) }
	o["siblings"] = func() map[string]any { return wrap(vm, sel.Siblings()) }
	o["children"] = func() map[string]any { return wrap(vm, sel.Children()) }
	o["parent"] = func() map[string]any { return wrap(vm, sel.Parent()) }
	o["map"] = func(callback func(map[string]any, int) any) []any {
		var vals []any
		sel.Map(func(i int, s *goquery.Selection) string {
			vals = append(vals, callback(wrap(vm, s), i))
			return ""
		})
		return vals
	}
	return o
}

func removeIIFE(s string) string {
	s = strings.TrimPrefix(s, "(() => {\n")
	s = strings.TrimSuffix(s, "})();\n")
	return s
}