| Mode | File | Lines changed |
|------------|------------------------------|-----|
| -rw-r--r-- | README.md                    | 23  |
| -rw-r--r-- | examples/download.js         | 25  |
| -rw-r--r-- | flyscrape.go                 | 10  |
| -rw-r--r-- | go.mod                       | 1   |
| -rw-r--r-- | go.sum                       | 2   |
| -rw-r--r-- | js.go                        | 4   |
| -rw-r--r-- | js_lib.go                    | 108 |
| -rw-r--r-- | js_lib_test.go               | 77  |
| -rw-r--r-- | modules/cache/cache.go       | 13  |
| -rw-r--r-- | modules/cache/sqlitestore.go | 2   |
| -rw-r--r-- | utils.go                     | 2   |
11 files changed, 254 insertions, 13 deletions
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -119,10 +119,10 @@ Below is an example scraping script that showcases the capabilities of flyscrape
 ```javascript
 export const config = {
     url: "https://example.com/",     // Specify the URL to start scraping from.
-    urls: [                          // Specify the URL(S) to start scraping from. If both .url and .urls
+    urls: [                          // Specify the URL(s) to start scraping from. If both `url` and `urls`
         "https://example.com/foo",   // are provided, all of the specified URLs will be scraped.
-        "https://example.com/foo",
-    ]
+        "https://example.com/bar",
+    ],
     depth: 0,                        // Specify how deep links should be followed. (default = 0, no follow)
     follow: [],                      // Speficy the css selectors to follow (default = ["a[href]"])
     allowedDomains: [],              // Specify the allowed domains. ['*'] for all. (default = domain from url)
@@ -180,6 +180,8 @@ items.filter(item => item.hasClass("a")) // [<li class="a">Item 1</li>]
 
 ## Flyscrape API
 
+### Document Parsing
+
 ```javascript
 import { parse } from "flyscrape";
 
@@ -187,6 +189,8 @@ const doc = parse(`<div class="foo">bar</div>`);
 const text = doc.find(".foo").text();
 ```
 
+### Basic HTTP Requests
+
 ```javascript
 import http from "flyscrape/http";
 
@@ -214,7 +218,20 @@ const response = http.postJSON("https://example.com", {
 }
 ```
 
+### File Downloads
+
+```javascript
+import { download } from "flyscrape/http";
+
+download("http://example.com/image.jpg")                    // downloads as "image.jpg"
+download("http://example.com/image.jpg", "other.jpg")       // downloads as "other.jpg"
+download("http://example.com/image.jpg", "dir/")            // downloads as "dir/image.jpg"
+// If the server offers a filename via the Content-Disposition header and no
+// destination filename is provided, Flyscrape will honor the suggested filename.
+// E.g. `Content-Disposition: attachment; filename="archive.zip"`
+download("http://example.com/generate_archive.php", "dir/") // downloads as "dir/archive.zip"
+```
 
 ## Issues and Suggestions
 
diff --git a/examples/download.js b/examples/download.js
new file mode 100644
index 0000000..7048846
--- /dev/null
+++ b/examples/download.js
@@ -0,0 +1,25 @@
+import { download } from "flyscrape/http";
+
+export const config = {
+    url: "https://commons.wikimedia.org/wiki/London",
+};
+
+export default function ({ doc }) {
+    const symbols = doc.find("#mw-content-text .mw-gallery-traditional:first-of-type li");
+
+    return {
+        symbols: symbols.map(symbol => {
+            const name = symbol.text().trim();
+            const url = symbol.find("img").attr("src");
+            const file = `symbols/${basename(url)}`;
+
+            download(url, file);
+
+            return { name, url, file };
+        })
+    };
+}
+
+function basename(path) {
+    return path.split("/").slice(-1)[0];
+}
diff --git a/flyscrape.go b/flyscrape.go
index 8e79c2a..a96d37a 100644
--- a/flyscrape.go
+++ b/flyscrape.go
@@ -25,7 +25,10 @@ func Run(file string) error {
 
 	client := &http.Client{}
 
-	exports, err := Compile(string(src), NewJSLibrary(client))
+	imports, wait := NewJSLibrary(client)
+	defer wait()
+
+	exports, err := Compile(string(src), imports)
 	if err != nil {
 		return fmt.Errorf("failed to compile script: %w", err)
 	}
@@ -54,7 +57,10 @@ func Dev(file string) error {
 	fn := func(s string) error {
 		client := &http.Client{}
 
-		exports, err := Compile(s, NewJSLibrary(client))
+		imports, wait := NewJSLibrary(client)
+		defer wait()
+
+		exports, err := Compile(s, imports)
 		if err != nil {
 			printCompileErr(file, err)
 			return nil
diff --git a/go.mod b/go.mod
--- a/go.mod
+++ b/go.mod
@@ -27,6 +27,7 @@ require (
 	github.com/rogpeppe/go-internal v1.10.0 // indirect
 	golang.org/x/crypto v0.0.0-20210921155107-089bfa567519 // indirect
 	golang.org/x/net v0.10.0 // indirect
+	golang.org/x/sync v0.5.0 // indirect
 	golang.org/x/sys v0.8.0 // indirect
 	golang.org/x/term v0.8.0 // indirect
 	golang.org/x/text v0.9.0 // indirect
diff --git a/go.sum b/go.sum
--- a/go.sum
+++ b/go.sum
@@ -73,6 +73,8 @@ golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
+golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
diff --git a/js.go b/js.go
--- a/js.go
+++ b/js.go
@@ -126,6 +126,10 @@ func vm(src string, imports Imports) (Exports, error) {
 	}
 
 	exports := Exports{}
+	if goja.IsUndefined(v) {
+		return exports, nil
+	}
+
 	obj := v.ToObject(vm)
 	for _, key := range obj.Keys() {
 		exports[key] = obj.Get(key).Export()
diff --git a/js_lib.go b/js_lib.go
--- a/js_lib.go
+++ b/js_lib.go
@@ -9,13 +9,27 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
+	"log"
+	"mime"
 	"net/http"
 	gourl "net/url"
+	"os"
+	"path/filepath"
 	"strings"
+
+	"golang.org/x/sync/errgroup"
 )
 
-func NewJSLibrary(client *http.Client) Imports {
-	return Imports{
+func NewJSLibrary(client *http.Client) (imports Imports, wait func()) {
+	downloads := &errgroup.Group{}
+
+	// Allow 5 parallel downloads. Why 5?
+	// Docker downloads 3 layers in parallel.
+	// My Chrome downloads up to 6 files in parallel.
+	// 5 feels like a reasonable number.
+	downloads.SetLimit(5)
+
+	im := Imports{
 		"flyscrape": map[string]any{
 			"parse": jsParse(),
 		},
@@ -23,8 +37,11 @@ func NewJSLibrary(client *http.Client) Imports {
 			"get":      jsHTTPGet(client),
 			"postForm": jsHTTPPostForm(client),
 			"postJSON": jsHTTPPostJSON(client),
+			"download": jsHTTPDownload(client, downloads),
 		},
 	}
+
+	return im, func() { downloads.Wait() }
 }
 
 func jsParse() func(html string) map[string]any {
@@ -85,6 +102,93 @@ func jsHTTPPostJSON(client *http.Client) func(url string, data any) map[string]a
 	}
 }
 
+func jsHTTPDownload(client *http.Client, g *errgroup.Group) func(url string, dst string) {
+	fileExists := func(name string) bool {
+		_, err := os.Stat(name)
+		return err == nil
+	}
+
+	isDir := func(path string) bool {
+		if strings.HasSuffix(path, "/") {
+			return true
+		}
+		if filepath.Ext(path) == "" {
+			return true
+		}
+		s, err := os.Stat(path)
+		return err == nil && s.IsDir()
+	}
+
+	suggestedFilename := func(url, contentDisp string) string {
+		filename := filepath.Base(url)
+
+		if contentDisp == "" {
+			return filename
+		}
+
+		_, params, err := mime.ParseMediaType(contentDisp)
+		if err != nil {
+			return filename
+		}
+
+		name, ok := params["filename"]
+		if !ok || name == "" {
+			return filename
+		}
+
+		return filepath.Base(name)
+	}
+
+	return func(url string, dst string) {
+		g.Go(func() error {
+			req, err := http.NewRequest("GET", url, nil)
+			if err != nil {
+				log.Printf("error downloading file %q: %v", url, err)
+				return nil
+			}
+			req.Header.Add(HeaderBypassCache, "true")
+
+			resp, err := client.Do(req)
+			if err != nil {
+				log.Printf("error downloading file %q: %v", url, err)
+				return nil
+			}
+			defer resp.Body.Close()
+
+			if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+				log.Printf("error downloading file %q: unexpected status code %d", url, resp.StatusCode)
+				return nil
+			}
+
+			dst, err = filepath.Abs(dst)
+			if err != nil {
+				log.Printf("error downloading file %q: abs path failed: %v", url, err)
+				return nil
+			}
+
+			if isDir(dst) {
+				name := suggestedFilename(url, resp.Header.Get("Content-Disposition"))
+				dst = filepath.Join(dst, name)
+			}
+
+			if fileExists(dst) {
+				return nil
+			}
+
+			os.MkdirAll(filepath.Dir(dst), 0o755)
+			f, err := os.Create(dst)
+			if err != nil {
+				log.Printf("error downloading file %q: file save failed: %v", url, err)
+				return nil
+			}
+			defer f.Close()
+
+			io.Copy(f, resp.Body)
+			return nil
+		})
+	}
+}
+
 func jsFetch(client *http.Client, req *http.Request) (obj map[string]any) {
 	obj = map[string]any{
 		"body": "",
diff --git a/js_lib_test.go b/js_lib_test.go
index 3682813..e375308 100644
--- a/js_lib_test.go
+++ b/js_lib_test.go
@@ -7,6 +7,7 @@ package flyscrape_test
 import (
 	"encoding/json"
 	"net/http"
+	"os"
 	"testing"
 
 	"github.com/philippta/flyscrape"
@@ -25,7 +26,8 @@ func TestJSLibParse(t *testing.T) {
 		Transport: flyscrape.MockTransport(200, html),
 	}
 
-	exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+	imports, _ := flyscrape.NewJSLibrary(client)
+	exports, err := flyscrape.Compile(script, imports)
 	require.NoError(t, err)
 
 	h, ok := exports["text"].(string)
@@ -49,7 +51,8 @@ func TestJSLibHTTPGet(t *testing.T) {
 		Transport: flyscrape.MockTransport(200, html),
 	}
 
-	exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+	imports, _ := flyscrape.NewJSLibrary(client)
+	exports, err := flyscrape.Compile(script, imports)
 	require.NoError(t, err)
 
 	body, ok := exports["body"].(string)
@@ -97,7 +100,8 @@ func TestJSLibHTTPPostForm(t *testing.T) {
 		}),
 	}
 
-	exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+	imports, _ := flyscrape.NewJSLibrary(client)
+	exports, err := flyscrape.Compile(script, imports)
 	require.NoError(t, err)
 
 	body, ok := exports["body"].(string)
@@ -146,7 +150,8 @@ func TestJSLibHTTPPostJSON(t *testing.T) {
 		}),
 	}
 
-	exports, err := flyscrape.Compile(script, flyscrape.NewJSLibrary(client))
+	imports, _ := flyscrape.NewJSLibrary(client)
+	exports, err := flyscrape.Compile(script, imports)
 	require.NoError(t, err)
 
 	body, ok := exports["body"].(string)
@@ -165,3 +170,67 @@
 	require.True(t, ok)
 	require.NotEmpty(t, headers)
 }
+
+func TestJSLibHTTPDownload(t *testing.T) {
+	cwd, err := os.Getwd()
+	require.NoError(t, err)
+
+	tmpdir, err := os.MkdirTemp("", "http-download")
+	require.NoError(t, err)
+
+	defer os.RemoveAll(tmpdir)
+	defer os.Chdir(cwd)
+	os.Chdir(tmpdir)
+
+	script := `
+	import http from "flyscrape/http";
+
+	http.download("https://example.com/foo.txt", "foo.txt");
+	http.download("https://example.com/foo.txt", "dir/my-foo.txt");
+	http.download("https://example.com/bar.txt", "dir/");
+	http.download("https://example.com/baz.txt", "dir");
+	http.download("https://example.com/content-disposition", ".");
+	http.download("https://example.com/hack.txt", ".");
+	http.download("https://example.com/no-dest.txt");
+	http.download("https://example.com/404.txt");
+	`
+
+	nreqs := 0
+	client := &http.Client{
+		Transport: flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
+			nreqs++
+
+			if r.URL.Path == "/content-disposition" {
+				resp, err := flyscrape.MockResponse(200, "hello world")
+				resp.Header.Set("Content-Disposition", `attachment; filename="qux.txt"`)
+				return resp, err
+			}
+			if r.URL.Path == "/hack.txt" {
+				resp, err := flyscrape.MockResponse(200, "hello world")
+				resp.Header.Set("Content-Disposition", `attachment; filename="../../hack.txt"`)
+				return resp, err
+			}
+			if r.URL.Path == "/404.txt" {
+				resp, err := flyscrape.MockResponse(404, "hello world")
+				return resp, err
+			}
+
+			return flyscrape.MockResponse(200, "hello world")
+		}),
+	}
+
+	imports, wait := flyscrape.NewJSLibrary(client)
+	_, err = flyscrape.Compile(script, imports)
+	require.NoError(t, err)
+
+	wait()
+
+	require.Equal(t, nreqs, 8)
+	require.FileExists(t, "foo.txt")
+	require.FileExists(t, "dir/my-foo.txt")
+	require.FileExists(t, "dir/bar.txt")
+	require.FileExists(t, "qux.txt")
+	require.FileExists(t, "hack.txt")
+	require.FileExists(t, "no-dest.txt")
+	require.NoFileExists(t, "404.txt")
+}
diff --git a/modules/cache/cache.go b/modules/cache/cache.go
index 4750e55..401aa49 100644
--- a/modules/cache/cache.go
+++ b/modules/cache/cache.go
@@ -51,8 +51,11 @@ func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
 		return t
 	}
 	return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
-		key := r.Method + " " + r.URL.String()
+		if nocache(r) {
+			return t.RoundTrip(r)
+		}
 
+		key := r.Method + " " + r.URL.String()
 		if b, ok := m.store.Get(key); ok {
 			if resp, err := http.ReadResponse(bufio.NewReader(bytes.NewReader(b)), r); err == nil {
 				return resp, nil
@@ -86,6 +89,14 @@ func (m *Module) Finalize() {
 	}
 }
 
+func nocache(r *http.Request) bool {
+	if r.Header.Get(flyscrape.HeaderBypassCache) != "" {
+		r.Header.Del(flyscrape.HeaderBypassCache)
+		return true
+	}
+	return false
+}
+
 func replaceExt(filePath string, newExt string) string {
 	ext := filepath.Ext(filePath)
 	if ext != "" {
diff --git a/modules/cache/sqlitestore.go b/modules/cache/sqlitestore.go
index 50c8007..778699b 100644
--- a/modules/cache/sqlitestore.go
+++ b/modules/cache/sqlitestore.go
@@ -40,7 +40,7 @@ func (s *SQLiteStore) Get(key string) ([]byte, bool) {
 
 func (s *SQLiteStore) Set(key string, value []byte) {
 	if _, err := s.db.Exec(`INSERT INTO cache (key, value) VALUES (?, ?)`, key, value); err != nil {
-		log.Printf("cache: failed to insert cache key %q: %v\n", key, value)
+		log.Printf("cache: failed to insert cache key %q: %v\n", key, err)
 	}
 }
 
diff --git a/utils.go b/utils.go
--- a/utils.go
+++ b/utils.go
@@ -13,6 +13,8 @@ import (
 	"strings"
 )
 
+const HeaderBypassCache = "X-Flyscrape-Bypass-Cache"
+
 func Prettify(v any, prefix string) string {
 	var buf bytes.Buffer
 	enc := json.NewEncoder(&buf)
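The commit above bounds download concurrency with `errgroup.SetLimit(5)` and, when the destination is a directory, takes the filename from the `Content-Disposition` header before falling back to the URL's last path segment. The standalone Go sketch below illustrates that same pattern outside of flyscrape; the URL list, the `downloads/` directory, and the `filenameFor` helper are illustrative assumptions, not code from this commit.

```go
// Standalone sketch (not part of this commit): bounded parallel downloads via
// errgroup.SetLimit plus Content-Disposition filename detection. The URLs and
// the "downloads" directory are hypothetical.
package main

import (
	"fmt"
	"io"
	"mime"
	"net/http"
	"os"
	"path/filepath"

	"golang.org/x/sync/errgroup"
)

// filenameFor prefers the server-suggested filename from Content-Disposition
// and falls back to the last path segment of the URL.
func filenameFor(url, contentDisp string) string {
	if contentDisp != "" {
		if _, params, err := mime.ParseMediaType(contentDisp); err == nil {
			if name := params["filename"]; name != "" {
				return filepath.Base(name) // Base strips "../" traversal attempts
			}
		}
	}
	return filepath.Base(url)
}

func main() {
	urls := []string{ // hypothetical inputs
		"https://example.com/a.jpg",
		"https://example.com/b.jpg",
		"https://example.com/generate_archive.php",
	}

	g := &errgroup.Group{}
	g.SetLimit(5) // at most 5 downloads in flight, mirroring the limit chosen above

	for _, url := range urls {
		url := url // capture the loop variable for the goroutine (pre-Go 1.22 semantics)
		g.Go(func() error {
			resp, err := http.Get(url)
			if err != nil {
				return fmt.Errorf("get %s: %w", url, err)
			}
			defer resp.Body.Close()

			name := filenameFor(url, resp.Header.Get("Content-Disposition"))
			dst := filepath.Join("downloads", name)
			if err := os.MkdirAll(filepath.Dir(dst), 0o755); err != nil {
				return err
			}

			f, err := os.Create(dst)
			if err != nil {
				return err
			}
			defer f.Close()

			_, err = io.Copy(f, resp.Body)
			return err
		})
	}

	if err := g.Wait(); err != nil {
		fmt.Println("download failed:", err)
	}
}
```

One difference worth noting: the sketch propagates errors through `g.Wait()`, whereas `jsHTTPDownload` in the commit logs failures and returns `nil`, so a single broken download does not affect the others; the `wait` function returned by `NewJSLibrary` only ensures that all queued downloads have finished before the scrape exits.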