author    Philipp Tanlak <philipp.tanlak@gmail.com>    2023-11-15 16:31:50 +0100
committer Philipp Tanlak <philipp.tanlak@gmail.com>    2023-11-15 23:54:03 +0100
commit    94da9293f63e46712b0a890e1e0eab4153fdb3f9 (patch)
tree      de81e6d00f7e1a5215d18557e772e7f1131d218b
parent    3e01902887bdc52e743ef6cec53a5c89cb5637f0 (diff)

Add file download functionality
-rw-r--r--  README.md                      23
-rw-r--r--  examples/download.js           25
-rw-r--r--  flyscrape.go                   10
-rw-r--r--  go.mod                          1
-rw-r--r--  go.sum                          2
-rw-r--r--  js.go                           4
-rw-r--r--  js_lib.go                     108
-rw-r--r--  js_lib_test.go                 77
-rw-r--r--  modules/cache/cache.go         13
-rw-r--r--  modules/cache/sqlitestore.go    2
-rw-r--r--  utils.go                        2

11 files changed, 254 insertions, 13 deletions
diff --git a/README.md b/README.md
index 4d46bbd..d7c701c 100644
--- a/README.md
+++ b/README.md
@@ -119,10 +119,10 @@ Below is an example scraping script that showcases the capabilities of flyscrape
```javascript
export const config = {
url: "https://example.com/", // Specify the URL to start scraping from.
- urls: [ // Specify the URL(S) to start scraping from. If both .url and .urls
+ urls: [ // Specify the URL(s) to start scraping from. If both `url` and `urls`
"https://example.com/foo", // are provided, all of the specified URLs will be scraped.
- "https://example.com/foo",
- ]
+ "https://example.com/bar",
+ ],
depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
follow: [], // Specify the CSS selectors to follow (default = ["a[href]"])
allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
@@ -180,6 +180,8 @@ items.filter(item => item.hasClass("a")) // [<li class="a">Item 1</li>]
## Flyscrape API
+### Document Parsing
+
```javascript
import { parse } from "flyscrape";
@@ -187,6 +189,8 @@ const doc = parse(`<div class="foo">bar</div>`);
const text = doc.find(".foo").text();
```
+### Basic HTTP Requests
+
```javascript
import http from "flyscrape/http";
@@ -214,7 +218,20 @@ const response = http.postJSON("https://example.com", {
}
```
+### File Downloads
+
+```javascript
+import { download } from "flyscrape/http";
+
+download("http://example.com/image.jpg") // downloads as "image.jpg"
+download("http://example.com/image.jpg", "other.jpg") // downloads as "other.jpg"
+download("http://example.com/image.jpg", "dir/") // downloads as "dir/image.jpg"
+// If the server offers a filename via the Content-Disposition header and no
+// destination filename is provided, Flyscrape will honor the suggested filename.
+// E.g. `Content-Disposition: attachment; filename="archive.zip"`
+download("http://example.com/generate_archive.php", "dir/") // downloads as "dir/archive.zip"
+```
## Issues and Suggestions
diff --git a/examples/download.js b/examples/download.js
new file mode 100644
index 0000000..7048846
--- /dev/null
+++ b/examples/download.js
@@ -0,0 +1,25 @@
+import { download } from "flyscrape/http";
+
+export const config = {
+ url: "https://commons.wikimedia.org/wiki/London",
+};
+
+export default function ({ doc }) {
+ const symbols = doc.find("#mw-content-text .mw-gallery-traditional:first-of-type li");
+
+ return {
+ symbols: symbols.map(symbol => {
+ const name = symbol.text().trim();
+ const url = symbol.find("img").attr("src");
+ const file = `symbols/${basename(url)}`;
+
+ download(url, file);
+
+ return { name, url, file };
+ })
+ };
+}
+
+function basename(path) {
+ return path.split("/").slice(-1)[0];
+}
diff --git a/flyscrape.go b/flyscrape.go
index 8e79c2a..a96d37a 100644
--- a/flyscrape.go
+++ b/flyscrape.go
@@ -25,7 +25,10 @@ func Run(file string) error {
client := &http.Client{}
- exports, err := Compile(string(src), NewJSLibrary(client))
+ imports, wait := NewJSLibrary(client)
+ defer wait()
+
+ exports, err := Compile(string(src), imports)
if err != nil {
return fmt.Errorf("failed to compile script: %w", err)
}
@@ -54,7 +57,10 @@ func Dev(file string) error {
fn := func(s string) error {
client := &http.Client{}
- exports, err := Compile(s, NewJSLibrary(client))
+ imports, wait := NewJSLibrary(client)
+ defer wait()
+
+ exports, err := Compile(s, imports)
if err != nil {
printCompileErr(file, err)
return nil
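
Note: `NewJSLibrary` now returns the imports together with a `wait` function, and `Run`/`Dev` defer it so the process does not exit while downloads are still in flight. Below is a minimal, self-contained sketch of that pattern; the names are hypothetical, only the errgroup usage mirrors the diff.

```go
package main

import (
	"fmt"
	"time"

	"golang.org/x/sync/errgroup"
)

// newLibrary returns a scheduling function plus a wait function, echoing the
// (imports, wait) pair that NewJSLibrary now returns. Callers defer wait() so
// background jobs finish before the program exits.
func newLibrary() (schedule func(id int), wait func()) {
	g := &errgroup.Group{}
	g.SetLimit(5) // cap concurrency, like the download limit in js_lib.go

	schedule = func(id int) {
		g.Go(func() error {
			time.Sleep(10 * time.Millisecond) // stand-in for an actual download
			fmt.Println("job done:", id)
			return nil
		})
	}
	return schedule, func() { g.Wait() }
}

func main() {
	schedule, wait := newLibrary()
	defer wait() // mirrors `defer wait()` in Run and Dev

	for i := 0; i < 10; i++ {
		schedule(i)
	}
}
```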
diff --git a/go.mod b/go.mod
index f0580cc..563e26d 100644
--- a/go.mod
+++ b/go.mod
@@ -27,6 +27,7 @@ require (
github.com/rogpeppe/go-internal v1.10.0 // indirect
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519 // indirect
golang.org/x/net v0.10.0 // indirect
+ golang.org/x/sync v0.5.0 // indirect
golang.org/x/sys v0.8.0 // indirect
golang.org/x/term v0.8.0 // indirect
golang.org/x/text v0.9.0 // indirect
diff --git a/go.sum b/go.sum
index 51beac7..06e3703 100644
--- a/go.sum
+++ b/go.sum
@@ -73,6 +73,8 @@ golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
+golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
diff --git a/js.go b/js.go
index a132c1c..719c031 100644
--- a/js.go
+++ b/js.go
@@ -126,6 +126,10 @@ func vm(src string, imports Imports) (Exports, error) {
}
exports := Exports{}
+ if goja.IsUndefined(v) {
+ return exports, nil
+ }
+
obj := v.ToObject(vm)
for _, key := range obj.Keys() {
exports[key] = obj.Get(key).Export()
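
Note: the new `IsUndefined` guard lets scripts without any `export` statements compile cleanly. A rough standalone sketch of the export extraction follows, assuming the goja API as used in `js.go`; the real code runs the bundled script, this only shows why the guard is needed.

```go
package main

import (
	"fmt"

	"github.com/dop251/goja"
)

func main() {
	vm := goja.New()

	// A script that exports nothing evaluates to undefined.
	v, err := vm.RunString(`var x = 1 + 1;`)
	if err != nil {
		panic(err)
	}

	exports := map[string]any{}
	if goja.IsUndefined(v) {
		// Without this check, ToObject on an undefined value would fail,
		// which is presumably what the guard in vm() protects against.
		fmt.Println("no exports:", exports)
		return
	}

	obj := v.ToObject(vm)
	for _, key := range obj.Keys() {
		exports[key] = obj.Get(key).Export()
	}
	fmt.Println("exports:", exports)
}
```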
diff --git a/js_lib.go b/js_lib.go
index abfe07c..ce4500a 100644
--- a/js_lib.go
+++ b/js_lib.go
@@ -9,13 +9,27 @@ import (
"encoding/json"
"fmt"
"io"
+ "log"
+ "mime"
"net/http"
gourl "net/url"
+ "os"
+ "path/filepath"
"strings"
+
+ "golang.org/x/sync/errgroup"
)
-func NewJSLibrary(client *http.Client) Imports {
- return Imports{
+func NewJSLibrary(client *http.Client) (imports Imports, wait func()) {
+ downloads := &errgroup.Group{}
+
+ // Allow 5 parallel downloads. Why 5?
+ // Docker downloads 3 layers in parallel.
+ // My Chrome downloads up to 6 files in parallel.
+ // 5 feels like a reasonable number.
+ downloads.SetLimit(5)
+
+ im := Imports{
"flyscrape": map[string]any{
"parse": jsParse(),
},
@@ -23,8 +37,11 @@ func NewJSLibrary(client *http.Client) Imports {
"get": jsHTTPGet(client),
"postForm": jsHTTPPostForm(client),
"postJSON": jsHTTPPostJSON(client),
+ "download": jsHTTPDownload(client, downloads),
},
}
+
+ return im, func() { downloads.Wait() }
}
func jsParse() func(html string) map[string]any {
@@ -85,6 +102,93 @@ func jsHTTPPostJSON(client *http.Client) func(url string, data any) map[string]a
}
}
+func jsHTTPDownload(client *http.Client, g *errgroup.Group) func(url string, dst string) {
+ fileExists := func(name string) bool {
+ _, err := os.Stat(name)
+ return err == nil
+ }
+
+ isDir := func(path string) bool {
+ if strings.HasSuffix(path, "/") {
+ return true
+ }
+ if filepath.Ext(path) == "" {
+ return true
+ }
+ s, err := os.Stat(path)
+ return err == nil && s.IsDir()
+ }
+
+ suggestedFilename := func(url, contentDisp string) string {
+ filename := filepath.Base(url)
+
+ if contentDisp == "" {
+ return filename
+ }
+
+ _, params, err := mime.ParseMediaType(contentDisp)
+ if err != nil {
+ return filename
+ }
+
+ name, ok := params["filename"]
+ if !ok || name == "" {
+ return filename
+ }
+
+ return filepath.Base(name)
+ }
+
+ return func(url string, dst string) {
+ g.Go(func() error {
+ req, err := http.NewRequest("GET", url, nil)
+ if err != nil {
+ log.Printf("error downloading file %q: %v", url, err)
+ return nil
+ }
+ req.Header.Add(HeaderBypassCache, "true")
+
+ resp, err := client.Do(req)
+ if err != nil {
+ log.Printf("error downloading file %q: %v", url, err)
+ return nil
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+ log.Printf("error downloading file %q: unexpected status code %d", url, resp.StatusCode)
+ return nil
+ }
+
+ dst, err = filepath.Abs(dst)
+ if err != nil {
+ log.Printf("error downloading file %q: abs path failed: %v", url, err)
+ return nil
+ }
+
+ if isDir(dst) {
+ name := suggestedFilename(url, resp.Header.Get("Content-Disposition"))
+ dst = filepath.Join(dst, name)
+ }
+
+ if fileExists(dst) {
+ return nil
+ }
+
+ os.MkdirAll(filepath.Dir(dst), 0o755)
+ f, err := os.Create(dst)
+ if err != nil {
+ log.Printf("error downloading file %q: file save failed: %v", url, err)
+ return nil
+ }
+ defer f.Close()
+
+ io.Copy(f, resp.Body)
+ return nil
+ })
+ }
+}
+
func jsFetch(client *http.Client, req *http.Request) (obj map[string]any) {
obj = map[string]any{
"body": "",
diff --git a/js_lib_test.go b/js_lib_test.go
index 3682813..e375308 100644
--- a/js_lib_test.go
+++ b/js_lib_test.go
@@ -7,6 +7,7 @@ package flyscrape_test
import (
"encoding/json"
"net/http"
+ "os"
"testing"
"github.com/philippta/flyscrape"
@@ -25,7 +26,8 @@ func TestJSLibParse(t *testing.T) {
Transport: flyscrape.MockTransport(200, html),
}
- exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+ imports, _ := flyscrape.NewJSLibrary(client)
+ exports, err := flyscrape.Compile(script, imports)
require.NoError(t, err)
h, ok := exports["text"].(string)
@@ -49,7 +51,8 @@ func TestJSLibHTTPGet(t *testing.T) {
Transport: flyscrape.MockTransport(200, html),
}
- exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+ imports, _ := flyscrape.NewJSLibrary(client)
+ exports, err := flyscrape.Compile(script, imports)
require.NoError(t, err)
body, ok := exports["body"].(string)
@@ -97,7 +100,8 @@ func TestJSLibHTTPPostForm(t *testing.T) {
}),
}
- exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+ imports, _ := flyscrape.NewJSLibrary(client)
+ exports, err := flyscrape.Compile(script, imports)
require.NoError(t, err)
body, ok := exports["body"].(string)
@@ -146,7 +150,8 @@ func TestJSLibHTTPPostJSON(t *testing.T) {
}),
}
- exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+ imports, _ := flyscrape.NewJSLibrary(client)
+ exports, err := flyscrape.Compile(script, imports)
require.NoError(t, err)
body, ok := exports["body"].(string)
@@ -165,3 +170,67 @@ func TestJSLibHTTPPostJSON(t *testing.T) {
require.True(t, ok)
require.NotEmpty(t, headers)
}
+
+func TestJSLibHTTPDownload(t *testing.T) {
+ cwd, err := os.Getwd()
+ require.NoError(t, err)
+
+ tmpdir, err := os.MkdirTemp("", "http-download")
+ require.NoError(t, err)
+
+ defer os.RemoveAll(tmpdir)
+ defer os.Chdir(cwd)
+ os.Chdir(tmpdir)
+
+ script := `
+ import http from "flyscrape/http";
+
+ http.download("https://example.com/foo.txt", "foo.txt");
+ http.download("https://example.com/foo.txt", "dir/my-foo.txt");
+ http.download("https://example.com/bar.txt", "dir/");
+ http.download("https://example.com/baz.txt", "dir");
+ http.download("https://example.com/content-disposition", ".");
+ http.download("https://example.com/hack.txt", ".");
+ http.download("https://example.com/no-dest.txt");
+ http.download("https://example.com/404.txt");
+ `
+
+ nreqs := 0
+ client := &http.Client{
+ Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
+ nreqs++
+
+ if r.URL.Path == "/content-disposition" {
+ resp, err := flyscrape.MockResponse(200, "hello world")
+ resp.Header.Set("Content-Disposition", `attachment; filename="qux.txt"`)
+ return resp, err
+ }
+ if r.URL.Path == "/hack.txt" {
+ resp, err := flyscrape.MockResponse(200, "hello world")
+ resp.Header.Set("Content-Disposition", `attachment; filename="../../hack.txt"`)
+ return resp, err
+ }
+ if r.URL.Path == "/404.txt" {
+ resp, err := flyscrape.MockResponse(404, "hello world")
+ return resp, err
+ }
+
+ return flyscrape.MockResponse(200, "hello world")
+ }),
+ }
+
+ imports, wait := flyscrape.NewJSLibrary(client)
+ _, err = flyscrape.Compile(script, imports)
+ require.NoError(t, err)
+
+ wait()
+
+ require.Equal(t, 8, nreqs)
+ require.FileExists(t, "foo.txt")
+ require.FileExists(t, "dir/my-foo.txt")
+ require.FileExists(t, "dir/bar.txt")
+ require.FileExists(t, "qux.txt")
+ require.FileExists(t, "hack.txt")
+ require.FileExists(t, "no-dest.txt")
+ require.NoFileExists(t, "404.txt")
+}
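
Note: the test never touches the network; it swaps the client's transport for the project's `RoundTripFunc`/`MockResponse` helpers. A minimal stdlib-only sketch of the same idea, with hypothetical names standing in for those helpers:

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

// roundTripFunc lets a plain function act as an http.RoundTripper, which is
// the trick behind the flyscrape.RoundTripFunc helper used in the test.
type roundTripFunc func(*http.Request) (*http.Response, error)

func (f roundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) { return f(r) }

func main() {
	client := &http.Client{
		Transport: roundTripFunc(func(r *http.Request) (*http.Response, error) {
			// Every request gets a canned response; nothing leaves the process.
			return &http.Response{
				StatusCode: 200,
				Header:     http.Header{"Content-Type": []string{"text/plain"}},
				Body:       io.NopCloser(strings.NewReader("hello world")),
				Request:    r,
			}, nil
		}),
	}

	resp, err := client.Get("https://example.com/foo.txt")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.StatusCode, string(body)) // 200 hello world
}
```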
diff --git a/modules/cache/cache.go b/modules/cache/cache.go
index 4750e55..401aa49 100644
--- a/modules/cache/cache.go
+++ b/modules/cache/cache.go
@@ -51,8 +51,11 @@ func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
return t
}
return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
- key := r.Method + " " + r.URL.String()
+ if nocache(r) {
+ return t.RoundTrip(r)
+ }
+ key := r.Method + " " + r.URL.String()
if b, ok := m.store.Get(key); ok {
if resp, err := http.ReadResponse(bufio.NewReader(bytes.NewReader(b)), r); err == nil {
return resp, nil
@@ -86,6 +89,14 @@ func (m *Module) Finalize() {
}
}
+func nocache(r *http.Request) bool {
+ if r.Header.Get(flyscrape.HeaderBypassCache) != "" {
+ r.Header.Del(flyscrape.HeaderBypassCache)
+ return true
+ }
+ return false
+}
+
func replaceExt(filePath string, newExt string) string {
ext := filepath.Ext(filePath)
if ext != "" {
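
Note: downloads always bypass the cache module. `jsHTTPDownload` sets `X-Flyscrape-Bypass-Cache` on the request, and `nocache` strips the header and skips the cache lookup. A toy sketch of that header-driven bypass around a caching transport (not the module's actual store):

```go
package main

import (
	"bufio"
	"bytes"
	"net/http"
	"net/http/httputil"
)

const headerBypassCache = "X-Flyscrape-Bypass-Cache" // same value as flyscrape.HeaderBypassCache

// cachingTransport is a toy stand-in for the cache module's AdaptTransport:
// responses are stored per "METHOD URL", unless the request carries the
// bypass header, in which case the marker is stripped and the request goes
// straight to the underlying transport.
type cachingTransport struct {
	next  http.RoundTripper
	store map[string][]byte
}

func (c *cachingTransport) RoundTrip(r *http.Request) (*http.Response, error) {
	if r.Header.Get(headerBypassCache) != "" {
		r.Header.Del(headerBypassCache) // never leak the internal marker upstream
		return c.next.RoundTrip(r)
	}

	key := r.Method + " " + r.URL.String()
	if b, ok := c.store[key]; ok {
		return http.ReadResponse(bufio.NewReader(bytes.NewReader(b)), r)
	}

	resp, err := c.next.RoundTrip(r)
	if err != nil {
		return nil, err
	}
	if b, err := httputil.DumpResponse(resp, true); err == nil {
		c.store[key] = b
	}
	return resp, nil
}

func main() {
	client := &http.Client{Transport: &cachingTransport{
		next:  http.DefaultTransport,
		store: map[string][]byte{},
	}}

	req, _ := http.NewRequest("GET", "https://example.com/file.bin", nil)
	req.Header.Add(headerBypassCache, "true") // what jsHTTPDownload adds per request
	if resp, err := client.Do(req); err == nil {
		resp.Body.Close()
	}
}
```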
diff --git a/modules/cache/sqlitestore.go b/modules/cache/sqlitestore.go
index 50c8007..778699b 100644
--- a/modules/cache/sqlitestore.go
+++ b/modules/cache/sqlitestore.go
@@ -40,7 +40,7 @@ func (s *SQLiteStore) Get(key string) ([]byte, bool) {
func (s *SQLiteStore) Set(key string, value []byte) {
if _, err := s.db.Exec(`INSERT INTO cache (key, value) VALUES (?, ?)`, key, value); err != nil {
- log.Printf("cache: failed to insert cache key %q: %v\n", key, value)
+ log.Printf("cache: failed to insert cache key %q: %v\n", key, err)
}
}
diff --git a/utils.go b/utils.go
index 861ee38..f26dc6c 100644
--- a/utils.go
+++ b/utils.go
@@ -13,6 +13,8 @@ import (
"strings"
)
+const HeaderBypassCache = "X-Flyscrape-Bypass-Cache"
+
func Prettify(v any, prefix string) string {
var buf bytes.Buffer
enc := json.NewEncoder(&buf)