author    Philipp Tanlak <philipp.tanlak@gmail.com>    2023-11-15 16:31:50 +0100
committer Philipp Tanlak <philipp.tanlak@gmail.com>    2023-11-15 23:54:03 +0100
commit    94da9293f63e46712b0a890e1e0eab4153fdb3f9 (patch)
tree      de81e6d00f7e1a5215d18557e772e7f1131d218b
parent    3e01902887bdc52e743ef6cec53a5c89cb5637f0 (diff)

Add file download functionality
-rw-r--r--  README.md                      23
-rw-r--r--  examples/download.js           25
-rw-r--r--  flyscrape.go                   10
-rw-r--r--  go.mod                          1
-rw-r--r--  go.sum                          2
-rw-r--r--  js.go                           4
-rw-r--r--  js_lib.go                     108
-rw-r--r--  js_lib_test.go                 77
-rw-r--r--  modules/cache/cache.go         13
-rw-r--r--  modules/cache/sqlitestore.go    2
-rw-r--r--  utils.go                        2

11 files changed, 254 insertions, 13 deletions
diff --git a/README.md b/README.md
index 4d46bbd..d7c701c 100644
--- a/README.md
+++ b/README.md
@@ -119,10 +119,10 @@ Below is an example scraping script that showcases the capabilities of flyscrape
```javascript
export const config = {
url: "https://example.com/", // Specify the URL to start scraping from.
- urls: [ // Specify the URL(S) to start scraping from. If both .url and .urls
+ urls: [ // Specify the URL(s) to start scraping from. If both `url` and `urls`
"https://example.com/foo", // are provided, all of the specified URLs will be scraped.
- "https://example.com/foo",
- ]
+ "https://example.com/bar",
+ ],
depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
follow: [], // Specify the CSS selectors to follow (default = ["a[href]"])
allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
@@ -180,6 +180,8 @@ items.filter(item => item.hasClass("a")) // [<li class="a">Item 1</li>]
## Flyscrape API
+### Document Parsing
+
```javascript
import { parse } from "flyscrape";
@@ -187,6 +189,8 @@ const doc = parse(`<div class="foo">bar</div>`);
const text = doc.find(".foo").text();
```
+### Basic HTTP Requests
+
```javascript
import http from "flyscrape/http";
@@ -214,7 +218,20 @@ const response = http.postJSON("https://example.com", {
}
```
+### File Downloads
+
+```javascript
+import { download } from "flyscrape/http";
+
+download("http://example.com/image.jpg") // downloads as "image.jpg"
+download("http://example.com/image.jpg", "other.jpg") // downloads as "other.jpg"
+download("http://example.com/image.jpg", "dir/") // downloads as "dir/image.jpg"
+// If the server offers a filename via the Content-Disposition header and no
+// destination filename is provided, Flyscrape will honor the suggested filename.
+// E.g. `Content-Disposition: attachment; filename="archive.zip"`
+download("http://example.com/generate_archive.php", "dir/") // downloads as "dir/archive.zip"
+```
## Issues and Suggestions
diff --git a/examples/download.js b/examples/download.js
new file mode 100644
index 0000000..7048846
--- /dev/null
+++ b/examples/download.js
@@ -0,0 +1,25 @@
+import { download } from "flyscrape/http";
+
+export const config = {
+ url: "https://commons.wikimedia.org/wiki/London",
+};
+
+export default function ({ doc }) {
+ const symbols = doc.find("#mw-content-text .mw-gallery-traditional:first-of-type li");
+
+ return {
+ symbols: symbols.map(symbol => {
+ const name = symbol.text().trim();
+ const url = symbol.find("img").attr("src");
+ const file = `symbols/${basename(url)}`;
+
+ download(url, file);
+
+ return { name, url, file };
+ })
+ };
+}
+
+function basename(path) {
+ return path.split("/").slice(-1)[0];
+}
diff --git a/flyscrape.go b/flyscrape.go
index 8e79c2a..a96d37a 100644
--- a/flyscrape.go
+++ b/flyscrape.go
@@ -25,7 +25,10 @@ func Run(file string) error {
client := &http.Client{}
- exports, err := Compile(string(src), NewJSLibrary(client))
+ imports, wait := NewJSLibrary(client)
+ defer wait()
+
+ exports, err := Compile(string(src), imports)
if err != nil {
return fmt.Errorf("failed to compile script: %w", err)
}
@@ -54,7 +57,10 @@ func Dev(file string) error {
fn := func(s string) error {
client := &http.Client{}
- exports, err := Compile(s, NewJSLibrary(client))
+ imports, wait := NewJSLibrary(client)
+ defer wait()
+
+ exports, err := Compile(s, imports)
if err != nil {
printCompileErr(file, err)
return nil
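
Note: `NewJSLibrary` now returns the imports together with a `wait` function, and `Run`/`Dev` defer it so the process does not exit while downloads are still in flight. Below is a minimal, self-contained sketch of that pattern; the names are hypothetical, only the errgroup usage mirrors the diff.

```go
package main

import (
	"fmt"
	"time"

	"golang.org/x/sync/errgroup"
)

// newLibrary returns a scheduling function plus a wait function, echoing the
// (imports, wait) pair that NewJSLibrary now returns. Callers defer wait() so
// background jobs finish before the program exits.
func newLibrary() (schedule func(id int), wait func()) {
	g := &errgroup.Group{}
	g.SetLimit(5) // cap concurrency, like the download limit in js_lib.go

	schedule = func(id int) {
		g.Go(func() error {
			time.Sleep(10 * time.Millisecond) // stand-in for an actual download
			fmt.Println("job done:", id)
			return nil
		})
	}
	return schedule, func() { g.Wait() }
}

func main() {
	schedule, wait := newLibrary()
	defer wait() // mirrors `defer wait()` in Run and Dev

	for i := 0; i < 10; i++ {
		schedule(i)
	}
}
```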
diff --git a/go.mod b/go.mod
index f0580cc..563e26d 100644
--- a/go.mod
+++ b/go.mod
@@ -27,6 +27,7 @@ require (
github.com/rogpeppe/go-internal v1.10.0 // indirect
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519 // indirect
golang.org/x/net v0.10.0 // indirect
+ golang.org/x/sync v0.5.0 // indirect
golang.org/x/sys v0.8.0 // indirect
golang.org/x/term v0.8.0 // indirect
golang.org/x/text v0.9.0 // indirect
diff --git a/go.sum b/go.sum
index 51beac7..06e3703 100644
--- a/go.sum
+++ b/go.sum
@@ -73,6 +73,8 @@ golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
+golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
diff --git a/js.go b/js.go
index a132c1c..719c031 100644
--- a/js.go
+++ b/js.go
@@ -126,6 +126,10 @@ func vm(src string, imports Imports) (Exports, error) {
}
exports := Exports{}
+ if goja.IsUndefined(v) {
+ return exports, nil
+ }
+
obj := v.ToObject(vm)
for _, key := range obj.Keys() {
exports[key] = obj.Get(key).Export()
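
Note: the new `IsUndefined` guard lets scripts without any `export` statements compile cleanly. A rough standalone sketch of the export extraction follows, assuming the goja API as used in `js.go`; the real code runs the bundled script, this only shows why the guard is needed.

```go
package main

import (
	"fmt"

	"github.com/dop251/goja"
)

func main() {
	vm := goja.New()

	// A script that exports nothing evaluates to undefined.
	v, err := vm.RunString(`var x = 1 + 1;`)
	if err != nil {
		panic(err)
	}

	exports := map[string]any{}
	if goja.IsUndefined(v) {
		// Without this check, ToObject on an undefined value would fail,
		// which is presumably what the guard in vm() protects against.
		fmt.Println("no exports:", exports)
		return
	}

	obj := v.ToObject(vm)
	for _, key := range obj.Keys() {
		exports[key] = obj.Get(key).Export()
	}
	fmt.Println("exports:", exports)
}
```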
diff --git a/js_lib.go b/js_lib.go
index abfe07c..ce4500a 100644
--- a/js_lib.go
+++ b/js_lib.go
@@ -9,13 +9,27 @@ import (
"encoding/json"
"fmt"
"io"
+ "log"
+ "mime"
"net/http"
gourl "net/url"
+ "os"
+ "path/filepath"
"strings"
+
+ "golang.org/x/sync/errgroup"
)
-func NewJSLibrary(client *http.Client) Imports {
- return Imports{
+func NewJSLibrary(client *http.Client) (imports Imports, wait func()) {
+ downloads := &errgroup.Group{}
+
+ // Allow 5 parallel downloads. Why 5?
+ // Docker downloads 3 layers in parallel.
+ // My Chrome downloads up to 6 files in parallel.
+ // 5 feels like a reasonable number.
+ downloads.SetLimit(5)
+
+ im := Imports{
"flyscrape": map[string]any{
"parse": jsParse(),
},
@@ -23,8 +37,11 @@ func NewJSLibrary(client *http.Client) Imports {
"get": jsHTTPGet(client),
"postForm": jsHTTPPostForm(client),
"postJSON": jsHTTPPostJSON(client),
+ "download": jsHTTPDownload(client, downloads),
},
}
+
+ return im, func() { downloads.Wait() }
}
func jsParse() func(html string) map[string]any {
@@ -85,6 +102,93 @@ func jsHTTPPostJSON(client *http.Client) func(url string, data any) map[string]a
}
}
+func jsHTTPDownload(client *http.Client, g *errgroup.Group) func(url string, dst string) {
+ fileExists := func(name string) bool {
+ _, err := os.Stat(name)
+ return err == nil
+ }
+
+ isDir := func(path string) bool {
+ if strings.HasSuffix(path, "/") {
+ return true
+ }
+ if filepath.Ext(path) == "" {
+ return true
+ }
+ s, err := os.Stat(path)
+ return err == nil && s.IsDir()
+ }
+
+ suggestedFilename := func(url, contentDisp string) string {
+ filename := filepath.Base(url)
+
+ if contentDisp == "" {
+ return filename
+ }
+
+ _, params, err := mime.ParseMediaType(contentDisp)
+ if err != nil {
+ return filename
+ }
+
+ name, ok := params["filename"]
+ if !ok || name == "" {
+ return filename
+ }
+
+ return filepath.Base(name)
+ }
+
+ return func(url string, dst string) {
+ g.Go(func() error {
+ req, err := http.NewRequest("GET", url, nil)
+ if err != nil {
+ log.Printf("error downloading file %q: %v", url, err)
+ return nil
+ }
+ req.Header.Add(HeaderBypassCache, "true")
+
+ resp, err := client.Do(req)
+ if err != nil {
+ log.Printf("error downloading file %q: %v", url, err)
+ return nil
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+ log.Printf("error downloading file %q: unexpected status code %d", url, resp.StatusCode)
+ return nil
+ }
+
+ dst, err = filepath.Abs(dst)
+ if err != nil {
+ log.Printf("error downloading file %q: abs path failed: %v", url, err)
+ return nil
+ }
+
+ if isDir(dst) {
+ name := suggestedFilename(url, resp.Header.Get("Content-Disposition"))
+ dst = filepath.Join(dst, name)
+ }
+
+ if fileExists(dst) {
+ return nil
+ }
+
+ os.MkdirAll(filepath.Dir(dst), 0o755)
+ f, err := os.Create(dst)
+ if err != nil {
+ log.Printf("error downloading file %q: file save failed: %v", url, err)
+ return nil
+ }
+ defer f.Close()
+
+ io.Copy(f, resp.Body)
+ return nil
+ })
+ }
+}
+
func jsFetch(client *http.Client, req *http.Request) (obj map[string]any) {
obj = map[string]any{
"body": "",
diff --git a/js_lib_test.go b/js_lib_test.go
index 3682813..e375308 100644
--- a/js_lib_test.go
+++ b/js_lib_test.go
@@ -7,6 +7,7 @@ package flyscrape_test
import (
"encoding/json"
"net/http"
+ "os"
"testing"
"github.com/philippta/flyscrape"
@@ -25,7 +26,8 @@ func TestJSLibParse(t *testing.T) {
Transport: flyscrape.MockTransport(200, html),
}
- exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+ imports, _ := flyscrape.NewJSLibrary(client)
+ exports, err := flyscrape.Compile(script, imports)
require.NoError(t, err)
h, ok := exports["text"].(string)
@@ -49,7 +51,8 @@ func TestJSLibHTTPGet(t *testing.T) {
Transport: flyscrape.MockTransport(200, html),
}
- exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+ imports, _ := flyscrape.NewJSLibrary(client)
+ exports, err := flyscrape.Compile(script, imports)
require.NoError(t, err)
body, ok := exports["body"].(string)
@@ -97,7 +100,8 @@ func TestJSLibHTTPPostForm(t *testing.T) {
}),
}
- exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+ imports, _ := flyscrape.NewJSLibrary(client)
+ exports, err := flyscrape.Compile(script, imports)
require.NoError(t, err)
body, ok := exports["body"].(string)
@@ -146,7 +150,8 @@ func TestJSLibHTTPPostJSON(t *testing.T) {
}),
}
- exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+ imports, _ := flyscrape.NewJSLibrary(client)
+ exports, err := flyscrape.Compile(script, imports)
require.NoError(t, err)
body, ok := exports["body"].(string)
@@ -165,3 +170,67 @@ func TestJSLibHTTPPostJSON(t *testing.T) {
require.True(t, ok)
require.NotEmpty(t, headers)
}
+
+func TestJSLibHTTPDownload(t *testing.T) {
+ cwd, err := os.Getwd()
+ require.NoError(t, err)
+
+ tmpdir, err := os.MkdirTemp("", "http-download")
+ require.NoError(t, err)
+
+ defer os.RemoveAll(tmpdir)
+ defer os.Chdir(cwd)
+ os.Chdir(tmpdir)
+
+ script := `
+ import http from "flyscrape/http";
+
+ http.download("https://example.com/foo.txt", "foo.txt");
+ http.download("https://example.com/foo.txt", "dir/my-foo.txt");
+ http.download("https://example.com/bar.txt", "dir/");
+ http.download("https://example.com/baz.txt", "dir");
+ http.download("https://example.com/content-disposition", ".");
+ http.download("https://example.com/hack.txt", ".");
+ http.download("https://example.com/no-dest.txt");
+ http.download("https://example.com/404.txt");
+ `
+
+ nreqs := 0
+ client := &http.Client{
+ Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
+ nreqs++
+
+ if r.URL.Path == "/content-disposition" {
+ resp, err := flyscrape.MockResponse(200, "hello world")
+ resp.Header.Set("Content-Disposition", `attachment; filename="qux.txt"`)
+ return resp, err
+ }
+ if r.URL.Path == "/hack.txt" {
+ resp, err := flyscrape.MockResponse(200, "hello world")
+ resp.Header.Set("Content-Disposition", `attachment; filename="../../hack.txt"`)
+ return resp, err
+ }
+ if r.URL.Path == "/404.txt" {
+ resp, err := flyscrape.MockResponse(404, "hello world")
+ return resp, err
+ }
+
+ return flyscrape.MockResponse(200, "hello world")
+ }),
+ }
+
+ imports, wait := flyscrape.NewJSLibrary(client)
+ _, err = flyscrape.Compile(script, imports)
+ require.NoError(t, err)
+
+ wait()
+
+ require.Equal(t, 8, nreqs)
+ require.FileExists(t, "foo.txt")
+ require.FileExists(t, "dir/my-foo.txt")
+ require.FileExists(t, "dir/bar.txt")
+ require.FileExists(t, "qux.txt")
+ require.FileExists(t, "hack.txt")
+ require.FileExists(t, "no-dest.txt")
+ require.NoFileExists(t, "404.txt")
+}
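
Note: the test never touches the network; it swaps the client's transport for the project's `RoundTripFunc`/`MockResponse` helpers. A minimal stdlib-only sketch of the same idea, with hypothetical names standing in for those helpers:

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

// roundTripFunc lets a plain function act as an http.RoundTripper, which is
// the trick behind the flyscrape.RoundTripFunc helper used in the test.
type roundTripFunc func(*http.Request) (*http.Response, error)

func (f roundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) { return f(r) }

func main() {
	client := &http.Client{
		Transport: roundTripFunc(func(r *http.Request) (*http.Response, error) {
			// Every request gets a canned response; nothing leaves the process.
			return &http.Response{
				StatusCode: 200,
				Header:     http.Header{"Content-Type": []string{"text/plain"}},
				Body:       io.NopCloser(strings.NewReader("hello world")),
				Request:    r,
			}, nil
		}),
	}

	resp, err := client.Get("https://example.com/foo.txt")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.StatusCode, string(body)) // 200 hello world
}
```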
diff --git a/modules/cache/cache.go b/modules/cache/cache.go
index 4750e55..401aa49 100644
--- a/modules/cache/cache.go
+++ b/modules/cache/cache.go
@@ -51,8 +51,11 @@ func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
return t
}
return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
- key := r.Method + " " + r.URL.String()
+ if nocache(r) {
+ return t.RoundTrip(r)
+ }
+ key := r.Method + " " + r.URL.String()
if b, ok := m.store.Get(key); ok {
if resp, err := http.ReadResponse(bufio.NewReader(bytes.NewReader(b)), r); err == nil {
return resp, nil
@@ -86,6 +89,14 @@ func (m *Module) Finalize() {
}
}
+func nocache(r *http.Request) bool {
+ if r.Header.Get(flyscrape.HeaderBypassCache) != "" {
+ r.Header.Del(flyscrape.HeaderBypassCache)
+ return true
+ }
+ return false
+}
+
func replaceExt(filePath string, newExt string) string {
ext := filepath.Ext(filePath)
if ext != "" {
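
Note: downloads always bypass the cache module. `jsHTTPDownload` sets `X-Flyscrape-Bypass-Cache` on the request, and `nocache` strips the header and skips the cache lookup. A toy sketch of that header-driven bypass around a caching transport (not the module's actual store):

```go
package main

import (
	"bufio"
	"bytes"
	"net/http"
	"net/http/httputil"
)

const headerBypassCache = "X-Flyscrape-Bypass-Cache" // same value as flyscrape.HeaderBypassCache

// cachingTransport is a toy stand-in for the cache module's AdaptTransport:
// responses are stored per "METHOD URL", unless the request carries the
// bypass header, in which case the marker is stripped and the request goes
// straight to the underlying transport.
type cachingTransport struct {
	next  http.RoundTripper
	store map[string][]byte
}

func (c *cachingTransport) RoundTrip(r *http.Request) (*http.Response, error) {
	if r.Header.Get(headerBypassCache) != "" {
		r.Header.Del(headerBypassCache) // never leak the internal marker upstream
		return c.next.RoundTrip(r)
	}

	key := r.Method + " " + r.URL.String()
	if b, ok := c.store[key]; ok {
		return http.ReadResponse(bufio.NewReader(bytes.NewReader(b)), r)
	}

	resp, err := c.next.RoundTrip(r)
	if err != nil {
		return nil, err
	}
	if b, err := httputil.DumpResponse(resp, true); err == nil {
		c.store[key] = b
	}
	return resp, nil
}

func main() {
	client := &http.Client{Transport: &cachingTransport{
		next:  http.DefaultTransport,
		store: map[string][]byte{},
	}}

	req, _ := http.NewRequest("GET", "https://example.com/file.bin", nil)
	req.Header.Add(headerBypassCache, "true") // what jsHTTPDownload adds per request
	if resp, err := client.Do(req); err == nil {
		resp.Body.Close()
	}
}
```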
diff --git a/modules/cache/sqlitestore.go b/modules/cache/sqlitestore.go
index 50c8007..778699b 100644
--- a/modules/cache/sqlitestore.go
+++ b/modules/cache/sqlitestore.go
@@ -40,7 +40,7 @@ func (s *SQLiteStore) Get(key string) ([]byte, bool) {
func (s *SQLiteStore) Set(key string, value []byte) {
if _, err := s.db.Exec(`INSERT INTO cache (key, value) VALUES (?, ?)`, key, value); err != nil {
- log.Printf("cache: failed to insert cache key %q: %v\n", key, value)
+ log.Printf("cache: failed to insert cache key %q: %v\n", key, err)
}
}
diff --git a/utils.go b/utils.go
index 861ee38..f26dc6c 100644
--- a/utils.go
+++ b/utils.go
@@ -13,6 +13,8 @@ import (
"strings"
)
+const HeaderBypassCache = "X-Flyscrape-Bypass-Cache"
+
func Prettify(v any, prefix string) string {
var buf bytes.Buffer
enc := json.NewEncoder(&buf)