summaryrefslogblamecommitdiff
path: root/js.go
blob: 719c031f4e176a594d3ca7cb80b8e04f19608964 (plain) (tree)
1
2
3
4
5
6
7
8



                                                                      
                 

        
                 


                       

                 
                 
              
 

                                        
                                               
                                               
                                          

 


                         
                  
 






                                               









                                                                       











                                                      

                                                                          



                                       


                                                            
                              
                       
                               
         
                               

 

                                                       

                                             



                                        





                                                        




                                               
                                    

 
                                                       
                        
                                       
 
                           
                          
 

















                                                                                              

         
                                                
                       



                                                                 



                                   


                                                    
         
 





                                          

                           










                                                                                                                     


                                   
                                                      
                               
                                        








                                                
                                                        





                                                      

                 








                                                                                       
 



                                                                       
         
 
                                           

 
                                                      
                             
                                                                 





                                                                                    








                                                                                    


                                                                       
                                                                     


                                 
         


                                                                           
                                         






                                                       
                
 
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package flyscrape

import (
	_ "embed"
	"encoding/json"
	"errors"
	"fmt"
	"log"
	"net/url"
	"strings"
	"sync"

	"github.com/PuerkitoBio/goquery"
	"github.com/dop251/goja"
	"github.com/dop251/goja_nodejs/console"
	"github.com/dop251/goja_nodejs/require"
	"github.com/evanw/esbuild/pkg/api"
)

// ScriptTemplate holds the raw bytes of template.js, which the go:embed
// directive below compiles into the binary.
//
//go:embed template.js
var ScriptTemplate []byte

// Config is a named byte-slice type.
//
// NOTE(review): presumably carries the JSON-encoded scraper configuration
// (compare Exports.Config, which returns plain []byte) — confirm against callers.
type Config []byte

// ScrapeParams is the input handed to a ScrapeFunc for a single page.
type ScrapeParams struct {
	// HTML is the page markup that gets parsed into the document object.
	HTML string
	// URL is the page address; it is exposed to the script and used as the
	// base when resolving relative references.
	URL string
}

// ScrapeFunc runs the user's scrape logic against one page and returns the
// value it produced, or an error if the page could not be processed.
type ScrapeFunc func(ScrapeParams) (any, error)

// TransformError describes a single problem reported while transforming a
// script, together with the source position it refers to.
type TransformError struct {
	// Line and Column locate the problem; both stay zero when no location
	// was reported.
	Line   int
	Column int
	// Text is the human-readable message.
	Text string
}

// Error implements the error interface, rendering "line:column: text".
func (e TransformError) Error() string {
	position := fmt.Sprintf("%d:%d", e.Line, e.Column)
	return position + ": " + e.Text
}

// Exports holds the values a script exported, keyed by export name.
type Exports map[string]any

// Config returns the "config" export encoded as JSON. A missing entry
// encodes as null; if the value cannot be marshalled the result is nil.
func (e Exports) Config() []byte {
	encoded, err := json.Marshal(e["config"])
	if err != nil {
		return nil
	}
	return encoded
}

// Scrape invokes the scrape function stored under the "__scrape" key with p.
//
// It returns an error instead of panicking when no usable scrape function is
// present, which happens for an Exports value that was built without one.
func (e Exports) Scrape(p ScrapeParams) (any, error) {
	fn, ok := e["__scrape"].(ScrapeFunc)
	if !ok || fn == nil {
		return nil, errors.New("no scrape function defined")
	}
	return fn(p)
}

// Setup calls the script's "setup" export with no arguments. It does nothing
// when that export is absent or is not a function.
func (e Exports) Setup() {
	setupFn, isFunc := e["setup"].(func(goja.FunctionCall) goja.Value)
	if !isFunc {
		return
	}
	setupFn(goja.FunctionCall{})
}

// Imports maps a module name to the identifiers that module provides. Each
// entry is registered as a native module whose exports are the inner map's
// key/value pairs.
type Imports map[string]map[string]any

// Compile transforms the script source and then evaluates it with the given
// imports available, returning whatever the script exported. A transform
// failure is returned as-is without evaluating anything.
func Compile(src string, imports Imports) (Exports, error) {
	transformed, err := build(src)
	if err != nil {
		return nil, err
	}

	return vm(transformed, imports)
}

// build runs src through the esbuild transform API, targeting the Node
// platform with CommonJS output, and returns the generated code.
//
// When the transform reports errors, each becomes a TransformError (with a
// position when one is available) and all of them are returned joined.
func build(src string) (string, error) {
	options := api.TransformOptions{
		Platform: api.PlatformNode,
		Format:   api.FormatCommonJS,
	}
	result := api.Transform(src, options)

	if len(result.Errors) == 0 {
		return string(result.Code), nil
	}

	var failures []error
	for _, message := range result.Errors {
		failure := TransformError{Text: message.Text}
		if loc := message.Location; loc != nil {
			failure.Line = loc.Line
			failure.Column = loc.Column
		}
		failures = append(failures, failure)
	}
	return "", errors.Join(failures...)
}

// vm evaluates the already-transformed script src in a fresh goja runtime and
// collects what it exported.
//
// Every entry of imports is registered as a native module so the script can
// require it. After the script has run, each key of module.exports is copied
// into the result as a plain Go value. The "__scrape" entry is always set to
// the Go wrapper around the default export, even when the script exported
// nothing, so that Exports.Scrape reports an error instead of panicking.
func vm(src string, imports Imports) (Exports, error) {
	rt := goja.New()
	registry := &require.Registry{}

	registry.Enable(rt)
	console.Enable(rt)

	for module, pkg := range imports {
		pkg := pkg // per-iteration copy for the closure below
		registry.RegisterNativeModule(module, func(rt *goja.Runtime, o *goja.Object) {
			exports := rt.NewObject()

			for ident, val := range pkg {
				exports.Set(ident, val)
			}

			o.Set("exports", exports)
		})
	}

	// Provide the CommonJS "module" object that the transformed script assigns to.
	if _, err := rt.RunString("module = {}"); err != nil {
		return nil, fmt.Errorf("running defining module: %w", err)
	}
	if _, err := rt.RunString(src); err != nil {
		return nil, fmt.Errorf("running user script: %w", err)
	}

	v, err := rt.RunString("module.exports")
	if err != nil {
		return nil, fmt.Errorf("reading config: %w", err)
	}

	exports := Exports{}

	// Converting undefined or null to an object raises a TypeError, so only
	// enumerate keys when the script actually exported something.
	if !goja.IsUndefined(v) && !goja.IsNull(v) {
		obj := v.ToObject(rt)
		for _, key := range obj.Keys() {
			exports[key] = obj.Get(key).Export()
		}
	}

	// Assigned last so it takes precedence over a same-named script export.
	exports["__scrape"] = scrape(rt)

	return exports, nil
}

// scrape wraps the script's default export in a ScrapeFunc.
//
// If module.exports.default cannot be read, or is not callable, the returned
// function always fails with a descriptive error. Otherwise each call parses
// p.HTML into a document, builds an argument object with the fields url, doc
// and absoluteURL, and calls the script function with it.
//
// The script function is called through goja.AssertFunction so that an
// exception thrown by the script is returned as an error rather than
// escaping as a Go panic.
func scrape(vm *goja.Runtime) ScrapeFunc {
	defaultfn, err := vm.RunString("module.exports.default")
	if err != nil {
		return func(ScrapeParams) (any, error) { return nil, errors.New("no scrape function defined") }
	}

	scrapefn, ok := goja.AssertFunction(defaultfn)
	if !ok {
		return func(ScrapeParams) (any, error) { return nil, errors.New("default export is not a function") }
	}

	// A goja runtime must not be used from several goroutines at once, so
	// calls into it are serialized.
	var lock sync.Mutex

	return func(p ScrapeParams) (any, error) {
		lock.Lock()
		defer lock.Unlock()

		doc, err := DocumentFromString(p.HTML)
		if err != nil {
			log.Println(err)
			return nil, err
		}

		baseurl, err := url.Parse(p.URL)
		if err != nil {
			log.Println(err)
			return nil, err
		}

		// absoluteURL resolves ref against the page URL; an unparsable ref is
		// logged and handed back unchanged.
		absoluteURL := func(ref string) string {
			abs, err := baseurl.Parse(ref)
			if err != nil {
				log.Println(err)
				return ref
			}
			return abs.String()
		}

		o := vm.NewObject()
		o.Set("url", p.URL)
		o.Set("doc", doc)
		o.Set("absoluteURL", absoluteURL)

		ret, err := scrapefn(goja.Undefined(), o)
		if err != nil {
			err = fmt.Errorf("running scrape function: %w", err)
			log.Println(err)
			return nil, err
		}
		return ret.Export(), nil
	}
}

// DocumentFromString parses s as HTML and returns the script-facing document
// object for the parsed document's root selection. Parse errors are returned
// unchanged.
func DocumentFromString(s string) (map[string]any, error) {
	reader := strings.NewReader(s)

	parsed, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		return nil, err
	}

	root := parsed.Selection
	return Document(root), nil
}

// Document builds the script-facing object for a goquery selection.
//
// The result is a map of fields and functions: accessors (text, html, attr,
// hasAttr, hasClass, length), traversal helpers that each return a new
// Document-style map (first, last, get, find, next, prev, siblings, children,
// parent), and the iteration helpers map and filter. The WARNING entry is a
// plain hint string for when the object itself ends up in the output.
func Document(sel *goquery.Selection) map[string]any {
	// wrap turns a zero-argument traversal method into one that returns the
	// wrapped form of the resulting selection.
	wrap := func(traverse func() *goquery.Selection) func() map[string]any {
		return func() map[string]any { return Document(traverse()) }
	}

	return map[string]any{
		"WARNING": "Forgot to call text(), html() or attr()?",

		"text": sel.Text,
		"html": func() string {
			markup, _ := goquery.OuterHtml(sel)
			return markup
		},
		"attr": func(name string) string {
			value, _ := sel.Attr(name)
			return value
		},
		"hasAttr": func(name string) bool {
			_, present := sel.Attr(name)
			return present
		},
		"hasClass": sel.HasClass,
		"length":   sel.Length(),

		"first":    wrap(sel.First),
		"last":     wrap(sel.Last),
		"next":     wrap(sel.Next),
		"prev":     wrap(sel.Prev),
		"siblings": wrap(sel.Siblings),
		"children": wrap(sel.Children),
		"parent":   wrap(sel.Parent),
		"get": func(index int) map[string]any {
			return Document(sel.Eq(index))
		},
		"find": func(s string) map[string]any {
			return Document(sel.Find(s))
		},

		// map collects the callback's result for every element, in order.
		"map": func(callback func(map[string]any, int) any) []any {
			var results []any
			sel.Each(func(idx int, item *goquery.Selection) {
				results = append(results, callback(Document(item), idx))
			})
			return results
		},
		// filter keeps the wrapped elements for which the callback reports true.
		"filter": func(callback func(map[string]any, int) bool) []any {
			var kept []any
			sel.Each(func(idx int, item *goquery.Selection) {
				wrapped := Document(item)
				if callback(wrapped, idx) {
					kept = append(kept, wrapped)
				}
			})
			return kept
		},
	}
}