diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-11-15 16:31:50 +0100 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-11-15 23:54:03 +0100 |
| commit | 94da9293f63e46712b0a890e1e0eab4153fdb3f9 (patch) | |
| tree | de81e6d00f7e1a5215d18557e772e7f1131d218b /js_lib.go | |
| parent | 3e01902887bdc52e743ef6cec53a5c89cb5637f0 (diff) | |
Add file download functionality
Diffstat (limited to 'js_lib.go')
| -rw-r--r-- | js_lib.go | 108 |
1 files changed, 106 insertions, 2 deletions
@@ -9,13 +9,27 @@ import ( "encoding/json" "fmt" "io" + "log" + "mime" "net/http" gourl "net/url" + "os" + "path/filepath" "strings" + + "golang.org/x/sync/errgroup" ) -func NewJSLibrary(client *http.Client) Imports { - return Imports{ +func NewJSLibrary(client *http.Client) (imports Imports, wait func()) { + downloads := &errgroup.Group{} + + // Allow 5 parallel downloads. Why 5? + // Docker downloads 3 layers in parallel. + // My Chrome downloads up to 6 files in parallel. + // 5 feels like a reasonable number. + downloads.SetLimit(5) + + im := Imports{ "flyscrape": map[string]any{ "parse": jsParse(), }, @@ -23,8 +37,11 @@ func NewJSLibrary(client *http.Client) Imports { "get": jsHTTPGet(client), "postForm": jsHTTPPostForm(client), "postJSON": jsHTTPPostJSON(client), + "download": jsHTTPDownload(client, downloads), }, } + + return im, func() { downloads.Wait() } } func jsParse() func(html string) map[string]any { @@ -85,6 +102,93 @@ func jsHTTPPostJSON(client *http.Client) func(url string, data any) map[string]a } } +func jsHTTPDownload(client *http.Client, g *errgroup.Group) func(url string, dst string) { + fileExists := func(name string) bool { + _, err := os.Stat(name) + return err == nil + } + + isDir := func(path string) bool { + if strings.HasSuffix(path, "/") { + return true + } + if filepath.Ext(path) == "" { + return true + } + s, err := os.Stat(path) + return err == nil && s.IsDir() + } + + suggestedFilename := func(url, contentDisp string) string { + filename := filepath.Base(url) + + if contentDisp == "" { + return filename + } + + _, params, err := mime.ParseMediaType(contentDisp) + if err != nil { + return filename + } + + name, ok := params["filename"] + if !ok || name == "" { + return filename + } + + return filepath.Base(name) + } + + return func(url string, dst string) { + g.Go(func() error { + req, err := http.NewRequest("GET", url, nil) + if err != nil { + log.Printf("error downloading file %q: %v", url, err) + return nil + } + req.Header.Add(HeaderBypassCache, "true") + + resp, err := client.Do(req) + if err != nil { + log.Printf("error downloading file %q: %v", url, err) + return nil + } + defer resp.Body.Close() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + log.Printf("error downloading file %q: unexpected status code %d", url, resp.StatusCode) + return nil + } + + dst, err = filepath.Abs(dst) + if err != nil { + log.Printf("error downloading file %q: abs path failed: %v", url, err) + return nil + } + + if isDir(dst) { + name := suggestedFilename(url, resp.Header.Get("Content-Disposition")) + dst = filepath.Join(dst, name) + } + + if fileExists(dst) { + return nil + } + + os.MkdirAll(filepath.Dir(dst), 0o755) + f, err := os.Create(dst) + if err != nil { + log.Printf("error downloading file %q: file save failed: %v", url, err) + return nil + } + defer f.Close() + + io.Copy(f, resp.Body) + return nil + }) + } +} + func jsFetch(client *http.Client, req *http.Request) (obj map[string]any) { obj = map[string]any{ "body": "", |