// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package flyscrape
import (
"bytes"
"encoding/json"
"fmt"
"io"
"log"
"mime"
"net/http"
gourl "net/url"
"os"
"path/filepath"
"strings"
"golang.org/x/sync/errgroup"
)
// NewJSLibrary assembles the built-in modules offered to scripts, keyed by
// module name ("flyscrape" and "flyscrape/http"), with every HTTP helper
// bound to client.
//
// The returned wait function blocks until all downloads started through the
// "download" helper have finished.
func NewJSLibrary(client *http.Client) (imports Imports, wait func()) {
	// At most 5 downloads run concurrently. The number is a judgement call:
	// Docker pulls 3 layers in parallel and Chrome fetches up to 6 files at
	// once, so 5 sits comfortably in between.
	group := new(errgroup.Group)
	group.SetLimit(5)

	imports = Imports{
		"flyscrape": map[string]any{
			"parse": jsParse(),
		},
		"flyscrape/http": map[string]any{
			"get":      jsHTTPGet(client),
			"postForm": jsHTTPPostForm(client),
			"postJSON": jsHTTPPostJSON(client),
			"download": jsHTTPDownload(client, group),
		},
	}
	wait = func() { group.Wait() }

	return imports, wait
}
// jsParse returns the function exposed as "parse". It hands the HTML string
// to DocumentFromString and returns the resulting document, or nil when
// DocumentFromString reports an error.
func jsParse() func(html string) map[string]any {
	parse := func(html string) map[string]any {
		if parsed, err := DocumentFromString(html); err == nil {
			return parsed
		}
		return nil
	}
	return parse
}
// jsHTTPGet returns the function exposed as "get". It issues a GET request
// for url through client and returns the response object built by jsFetch.
// If the request cannot be constructed, the result contains only an "error"
// entry.
func jsHTTPGet(client *http.Client) func(url string) map[string]any {
	get := func(url string) map[string]any {
		request, err := http.NewRequest("GET", url, nil)
		if err == nil {
			return jsFetch(client, request)
		}
		return map[string]any{"error": err.Error()}
	}
	return get
}
// jsHTTPPostForm returns the function exposed as "postForm". It sends form as
// an application/x-www-form-urlencoded POST to url and returns the response
// object built by jsFetch.
//
// Every form value is rendered with %v. A list value contributes one field
// per element under the same key. If the request cannot be constructed, the
// result contains only an "error" entry.
func jsHTTPPostForm(client *http.Client) func(url string, form map[string]any) map[string]any {
	return func(url string, form map[string]any) map[string]any {
		fields := make(gourl.Values, len(form))
		for key, raw := range form {
			list, isList := raw.([]any)
			if !isList {
				fields.Add(key, fmt.Sprintf("%v", raw))
				continue
			}
			for _, item := range list {
				fields.Add(key, fmt.Sprintf("%v", item))
			}
		}

		body := strings.NewReader(fields.Encode())
		request, err := http.NewRequest("POST", url, body)
		if err != nil {
			return map[string]any{"error": err.Error()}
		}
		request.Header.Set("Content-Type", "application/x-www-form-urlencoded")

		return jsFetch(client, request)
	}
}
// jsHTTPPostJSON returns the function exposed as "postJSON". It encodes data
// as JSON, sends it as an application/json POST to url through client and
// returns the response object built by jsFetch.
//
// If data cannot be encoded, or the request cannot be constructed, no request
// is sent and the result contains only an "error" entry.
func jsHTTPPostJSON(client *http.Client) func(url string, data any) map[string]any {
	return func(url string, data any) map[string]any {
		payload, err := json.Marshal(data)
		if err != nil {
			// Report the failure instead of silently posting an empty body.
			return map[string]any{"error": err.Error()}
		}

		req, err := http.NewRequest("POST", url, bytes.NewReader(payload))
		if err != nil {
			return map[string]any{"error": err.Error()}
		}
		req.Header.Set("Content-Type", "application/json")

		return jsFetch(client, req)
	}
}
// jsHTTPDownload returns the function exposed as "download". Each call
// schedules a job on g that fetches url through client and stores the
// response body at dst.
//
// dst is treated as a directory when it ends in "/", has no file extension,
// or already exists as a directory; the file name is then taken from the
// Content-Disposition header, falling back to the last element of url.
// Otherwise dst is used as the file path itself. An already existing target
// file is left untouched.
//
// Failures are logged rather than returned, so a failed download never makes
// the group report an error.
func jsHTTPDownload(client *http.Client, g *errgroup.Group) func(url string, dst string) {
	// fileExists reports whether name can be stat'ed.
	fileExists := func(name string) bool {
		_, err := os.Stat(name)
		return err == nil
	}

	// isDir reports whether path should be interpreted as a directory.
	isDir := func(path string) bool {
		if strings.HasSuffix(path, "/") {
			return true
		}
		if filepath.Ext(path) == "" {
			return true
		}
		s, err := os.Stat(path)
		return err == nil && s.IsDir()
	}

	// suggestedFilename picks a file name for the download: the "filename"
	// parameter of the Content-Disposition header when present and valid,
	// otherwise the last element of url. Only the base name of the header
	// value is used, so it cannot point outside the target directory.
	suggestedFilename := func(url, contentDisp string) string {
		filename := filepath.Base(url)
		if contentDisp == "" {
			return filename
		}
		_, params, err := mime.ParseMediaType(contentDisp)
		if err != nil {
			return filename
		}
		name, ok := params["filename"]
		if !ok || name == "" {
			return filename
		}
		return filepath.Base(name)
	}

	return func(url string, dst string) {
		g.Go(func() error {
			req, err := http.NewRequest("GET", url, nil)
			if err != nil {
				log.Printf("error downloading file %q: %v", url, err)
				return nil
			}
			// NOTE(review): HeaderBypassCache is defined elsewhere; presumably
			// it tells a caching layer to skip this request — confirm.
			req.Header.Add(HeaderBypassCache, "true")

			resp, err := client.Do(req)
			if err != nil {
				log.Printf("error downloading file %q: %v", url, err)
				return nil
			}
			defer resp.Body.Close()

			if resp.StatusCode < 200 || resp.StatusCode >= 300 {
				log.Printf("error downloading file %q: unexpected status code %d", url, resp.StatusCode)
				return nil
			}

			// Classify dst before making it absolute: filepath.Abs cleans the
			// path and would strip the trailing "/" that marks a directory.
			toDir := isDir(dst)

			target, err := filepath.Abs(dst)
			if err != nil {
				log.Printf("error downloading file %q: abs path failed: %v", url, err)
				return nil
			}
			if toDir {
				name := suggestedFilename(url, resp.Header.Get("Content-Disposition"))
				target = filepath.Join(target, name)
			}

			if fileExists(target) {
				return nil
			}

			if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
				log.Printf("error downloading file %q: creating directory failed: %v", url, err)
				return nil
			}

			f, err := os.Create(target)
			if err != nil {
				log.Printf("error downloading file %q: file save failed: %v", url, err)
				return nil
			}

			_, err = io.Copy(f, resp.Body)
			if closeErr := f.Close(); err == nil {
				err = closeErr
			}
			if err != nil {
				// Drop the incomplete file; otherwise the fileExists check
				// above would skip every later attempt to fetch it again.
				// Removal is best effort, the write error is what gets logged.
				_ = os.Remove(target)
				log.Printf("error downloading file %q: file save failed: %v", url, err)
			}
			return nil
		})
	}
}
// jsFetch executes req with client and converts the outcome into the object
// handed back to scripts. The object always carries four entries:
//
//   - "body":    the response body as a string
//   - "status":  the HTTP status code
//   - "headers": header names mapped to the value Header.Get returns for them
//   - "error":   the error message, or "" on success
//
// When the request itself fails only "error" is filled in. When reading the
// body fails, "status" is set as well; "body" and "headers" stay empty.
func jsFetch(client *http.Client, req *http.Request) (obj map[string]any) {
	result := map[string]any{
		"body":    "",
		"status":  0,
		"headers": map[string]any{},
		"error":   "",
	}

	resp, err := client.Do(req)
	if err != nil {
		result["error"] = err.Error()
		return result
	}
	defer resp.Body.Close()

	result["status"] = resp.StatusCode

	payload, err := io.ReadAll(resp.Body)
	if err != nil {
		result["error"] = err.Error()
		return result
	}
	result["body"] = string(payload)

	flat := make(map[string]any, len(resp.Header))
	for key := range resp.Header {
		flat[key] = resp.Header.Get(key)
	}
	result["headers"] = flat

	return result
}