From 94da9293f63e46712b0a890e1e0eab4153fdb3f9 Mon Sep 17 00:00:00 2001 From: Philipp Tanlak Date: Wed, 15 Nov 2023 16:31:50 +0100 Subject: Add file download functionality --- js_lib.go | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 2 deletions(-) (limited to 'js_lib.go') diff --git a/js_lib.go b/js_lib.go index abfe07c..ce4500a 100644 --- a/js_lib.go +++ b/js_lib.go @@ -9,13 +9,27 @@ import ( "encoding/json" "fmt" "io" + "log" + "mime" "net/http" gourl "net/url" + "os" + "path/filepath" "strings" + + "golang.org/x/sync/errgroup" ) -func NewJSLibrary(client *http.Client) Imports { - return Imports{ +func NewJSLibrary(client *http.Client) (imports Imports, wait func()) { + downloads := &errgroup.Group{} + + // Allow 5 parallel downloads. Why 5? + // Docker downloads 3 layers in parallel. + // My Chrome downloads up to 6 files in parallel. + // 5 feels like a reasonable number. + downloads.SetLimit(5) + + im := Imports{ "flyscrape": map[string]any{ "parse": jsParse(), }, @@ -23,8 +37,11 @@ func NewJSLibrary(client *http.Client) Imports { "get": jsHTTPGet(client), "postForm": jsHTTPPostForm(client), "postJSON": jsHTTPPostJSON(client), + "download": jsHTTPDownload(client, downloads), }, } + + return im, func() { downloads.Wait() } } func jsParse() func(html string) map[string]any { @@ -85,6 +102,93 @@ func jsHTTPPostJSON(client *http.Client) func(url string, data any) map[string]a } } +func jsHTTPDownload(client *http.Client, g *errgroup.Group) func(url string, dst string) { + fileExists := func(name string) bool { + _, err := os.Stat(name) + return err == nil + } + + isDir := func(path string) bool { + if strings.HasSuffix(path, "/") { + return true + } + if filepath.Ext(path) == "" { + return true + } + s, err := os.Stat(path) + return err == nil && s.IsDir() + } + + suggestedFilename := func(url, contentDisp string) string { + filename := filepath.Base(url) + + if contentDisp == "" { + return filename + } + + _, params, err := mime.ParseMediaType(contentDisp) + if err != nil { + return filename + } + + name, ok := params["filename"] + if !ok || name == "" { + return filename + } + + return filepath.Base(name) + } + + return func(url string, dst string) { + g.Go(func() error { + req, err := http.NewRequest("GET", url, nil) + if err != nil { + log.Printf("error downloading file %q: %v", url, err) + return nil + } + req.Header.Add(HeaderBypassCache, "true") + + resp, err := client.Do(req) + if err != nil { + log.Printf("error downloading file %q: %v", url, err) + return nil + } + defer resp.Body.Close() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + log.Printf("error downloading file %q: unexpected status code %d", url, resp.StatusCode) + return nil + } + + dst, err = filepath.Abs(dst) + if err != nil { + log.Printf("error downloading file %q: abs path failed: %v", url, err) + return nil + } + + if isDir(dst) { + name := suggestedFilename(url, resp.Header.Get("Content-Disposition")) + dst = filepath.Join(dst, name) + } + + if fileExists(dst) { + return nil + } + + os.MkdirAll(filepath.Dir(dst), 0o755) + f, err := os.Create(dst) + if err != nil { + log.Printf("error downloading file %q: file save failed: %v", url, err) + return nil + } + defer f.Close() + + io.Copy(f, resp.Body) + return nil + }) + } +} + func jsFetch(client *http.Client, req *http.Request) (obj map[string]any) { obj = map[string]any{ "body": "", -- cgit v1.2.3