summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--cmd/flyscrape/dev.go53
-rw-r--r--cmd/flyscrape/run.go1
-rw-r--r--go.mod3
-rw-r--r--go.sum2
-rw-r--r--module.go5
-rw-r--r--modules/cache/cache.go52
-rw-r--r--modules/cache/cache_test.go38
-rw-r--r--modules/cache/memstore.go13
-rw-r--r--modules/cache/sqlitestore.go60
-rw-r--r--modules/cache/sqlitestore_test.go32
-rw-r--r--modules/ratelimit/ratelimit.go16
-rw-r--r--modules/ratelimit/ratelimit_test.go8
-rw-r--r--scrape.go8
13 files changed, 221 insertions, 70 deletions
diff --git a/cmd/flyscrape/dev.go b/cmd/flyscrape/dev.go
index fba3fba..9ddb3bf 100644
--- a/cmd/flyscrape/dev.go
+++ b/cmd/flyscrape/dev.go
@@ -5,9 +5,14 @@
package main
import (
+ "encoding/json"
"flag"
"fmt"
"log"
+ "os"
+ "os/signal"
+ "path/filepath"
+ "syscall"
"github.com/inancgumus/screen"
"github.com/philippta/flyscrape"
@@ -28,19 +33,28 @@ func (c *DevCommand) Run(args []string) error {
}
script := fs.Arg(0)
+ cachefile, err := newCacheFile()
+ if err != nil {
+ return fmt.Errorf("failed to create cache file: %w", err)
+ }
+
+ trapsignal(func() { os.RemoveAll(cachefile) })
- err := flyscrape.Watch(script, func(s string) error {
+ err = flyscrape.Watch(script, func(s string) error {
cfg, scrape, err := flyscrape.Compile(s)
if err != nil {
printCompileErr(script, err)
return nil
}
+ cfg = updateCfg(cfg, "depth", 0)
+ cfg = updateCfg(cfg, "cache", "file:"+cachefile)
+
scraper := flyscrape.NewScraper()
scraper.ScrapeFunc = scrape
+ scraper.Script = script
flyscrape.LoadModules(scraper, cfg)
- scraper.DisableModule("followlinks")
screen.Clear()
screen.MoveTopLeft()
@@ -84,3 +98,38 @@ func printCompileErr(script string, err error) {
log.Println(err)
}
}
+
+func updateCfg(cfg flyscrape.Config, key string, value any) flyscrape.Config {
+ var m map[string]any
+ if err := json.Unmarshal(cfg, &m); err != nil {
+ return cfg
+ }
+
+ m[key] = value
+
+ b, err := json.Marshal(m)
+ if err != nil {
+ return cfg
+ }
+
+ return b
+}
+
+func newCacheFile() (string, error) {
+ cachedir, err := os.MkdirTemp("", "flyscrape-cache")
+ if err != nil {
+ return "", err
+ }
+ return filepath.Join(cachedir, "dev.cache"), nil
+}
+
+func trapsignal(f func()) {
+ sig := make(chan os.Signal, 2)
+ signal.Notify(sig, os.Interrupt, syscall.SIGTERM)
+
+ go func() {
+ <-sig
+ f()
+ os.Exit(0)
+ }()
+}
diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go
index b467abe..039574b 100644
--- a/cmd/flyscrape/run.go
+++ b/cmd/flyscrape/run.go
@@ -42,6 +42,7 @@ func (c *RunCommand) Run(args []string) error {
scraper := flyscrape.NewScraper()
scraper.ScrapeFunc = scrape
+ scraper.Script = script
flyscrape.LoadModules(scraper, cfg)
diff --git a/go.mod b/go.mod
index 5e7d975..6f76e03 100644
--- a/go.mod
+++ b/go.mod
@@ -8,8 +8,10 @@ require (
github.com/evanw/esbuild v0.18.14
github.com/fsnotify/fsnotify v1.6.0
github.com/inancgumus/screen v0.0.0-20190314163918-06e984b86ed3
+ github.com/mattn/go-sqlite3 v1.14.17
github.com/nlnwa/whatwg-url v0.4.0
github.com/stretchr/testify v1.8.4
+ go.kuoruan.net/v8go-polyfills v0.5.0
rogchap.com/v8go v0.9.0
)
@@ -20,7 +22,6 @@ require (
github.com/kr/pretty v0.3.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rogpeppe/go-internal v1.10.0 // indirect
- go.kuoruan.net/v8go-polyfills v0.5.0 // indirect
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519 // indirect
golang.org/x/net v0.10.0 // indirect
golang.org/x/sys v0.8.0 // indirect
diff --git a/go.sum b/go.sum
index 21fcdaf..376e5cc 100644
--- a/go.sum
+++ b/go.sum
@@ -22,6 +22,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/mattn/go-sqlite3 v1.14.17 h1:mCRHCLDUBXgpKAqIKsaAaAsrAlbkeomtRFKXh2L6YIM=
+github.com/mattn/go-sqlite3 v1.14.17/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg=
github.com/nlnwa/whatwg-url v0.4.0 h1:B3kFb5EL7KILeBkhrlQvFi41Ex0p4ropVA9brt5ungI=
github.com/nlnwa/whatwg-url v0.4.0/go.mod h1:pLzpJjFPtA+n7RCLvp0GBxvDHa/2ckNCBK9mfEeNOMQ=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
diff --git a/module.go b/module.go
index 0465808..da81ffb 100644
--- a/module.go
+++ b/module.go
@@ -89,13 +89,16 @@ var (
modulesMu sync.RWMutex
moduleOrder = []string{
+ // Transport Adapters
+ "ratelimit",
"cache",
+
+ // Rest
"starturl",
"followlinks",
"depth",
"domainfilter",
"urlfilter",
- "ratelimit",
"jsonprint",
}
)
diff --git a/modules/cache/cache.go b/modules/cache/cache.go
index 1a321be..10762f9 100644
--- a/modules/cache/cache.go
+++ b/modules/cache/cache.go
@@ -9,8 +9,9 @@ import (
"bytes"
"net/http"
"net/http/httputil"
+ "path/filepath"
+ "strings"
- "github.com/cornelk/hashmap"
"github.com/philippta/flyscrape"
)
@@ -21,7 +22,7 @@ func init() {
type Module struct {
Cache string `json:"cache"`
- cache *hashmap.Map[string, []byte]
+ store Store
}
func (Module) ModuleInfo() flyscrape.ModuleInfo {
@@ -31,24 +32,28 @@ func (Module) ModuleInfo() flyscrape.ModuleInfo {
}
}
-func (m *Module) Provision(flyscrape.Context) {
- if m.disabled() {
- return
- }
- if m.cache == nil {
- m.cache = hashmap.New[string, []byte]()
+func (m *Module) Provision(ctx flyscrape.Context) {
+ switch {
+ case m.Cache == "memory":
+ m.store = NewMemStore()
+
+ case m.Cache == "file":
+ file := replaceExt(ctx.ScriptName(), ".cache")
+ m.store = NewSQLiteStore(file)
+
+ case strings.HasPrefix(m.Cache, "file:"):
+ m.store = NewSQLiteStore(strings.TrimPrefix(m.Cache, "file:"))
}
}
func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
- if m.disabled() {
+ if m.store == nil {
return t
}
-
return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
- key := cacheKey(r)
+ key := r.Method + " " + r.URL.String()
- if b, ok := m.cache.Get(key); ok {
+ if b, ok := m.store.Get(key); ok {
if resp, err := http.ReadResponse(bufio.NewReader(bytes.NewReader(b)), r); err == nil {
return resp, nil
}
@@ -64,15 +69,28 @@ func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
return resp, err
}
- m.cache.Set(key, encoded)
+ m.store.Set(key, encoded)
return resp, nil
})
}
-func (m *Module) disabled() bool {
- return m.Cache == ""
+func (m *Module) Finalize() {
+ if v, ok := m.store.(interface{ Close() }); ok {
+ v.Close()
+ }
+}
+
+func replaceExt(filePath string, newExt string) string {
+ ext := filepath.Ext(filePath)
+ if ext != "" {
+ fileNameWithoutExt := filePath[:len(filePath)-len(ext)]
+ newFilePath := fileNameWithoutExt + newExt
+ return newFilePath
+ }
+ return filePath + newExt
}
-func cacheKey(r *http.Request) string {
- return r.Method + " " + r.URL.String()
+type Store interface {
+ Get(key string) ([]byte, bool)
+ Set(key string, value []byte)
}
diff --git a/modules/cache/cache_test.go b/modules/cache/cache_test.go
deleted file mode 100644
index 4565e00..0000000
--- a/modules/cache/cache_test.go
+++ /dev/null
@@ -1,38 +0,0 @@
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License, v. 2.0. If a copy of the MPL was not distributed with this
-// file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-package cache_test
-
-import (
- "net/http"
- "testing"
-
- "github.com/philippta/flyscrape"
- "github.com/philippta/flyscrape/modules/cache"
- "github.com/philippta/flyscrape/modules/hook"
- "github.com/philippta/flyscrape/modules/starturl"
- "github.com/stretchr/testify/require"
-)
-
-func TestCache(t *testing.T) {
- cachemod := &cache.Module{Cache: "memory"}
- calls := 0
-
- for i := 0; i < 2; i++ {
- scraper := flyscrape.NewScraper()
- scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
- scraper.LoadModule(hook.Module{
- AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
- return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
- calls++
- return flyscrape.MockResponse(200, "foo")
- })
- },
- })
- scraper.LoadModule(cachemod)
- scraper.Run()
- }
-
- require.Equal(t, 1, calls)
-}
diff --git a/modules/cache/memstore.go b/modules/cache/memstore.go
new file mode 100644
index 0000000..0f4d9e2
--- /dev/null
+++ b/modules/cache/memstore.go
@@ -0,0 +1,13 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package cache
+
+import (
+ "github.com/cornelk/hashmap"
+)
+
+func NewMemStore() *hashmap.Map[string, []byte] {
+ return hashmap.New[string, []byte]()
+}
diff --git a/modules/cache/sqlitestore.go b/modules/cache/sqlitestore.go
new file mode 100644
index 0000000..50c8007
--- /dev/null
+++ b/modules/cache/sqlitestore.go
@@ -0,0 +1,60 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package cache
+
+import (
+ "database/sql"
+ "fmt"
+ "log"
+ "os"
+
+ _ "github.com/mattn/go-sqlite3"
+)
+
+func NewSQLiteStore(file string) *SQLiteStore {
+ db, err := sql.Open("sqlite3", fmt.Sprintf("file:%s?_timeout=5000&_journal=WAL", file))
+ if err != nil {
+ log.Printf("cache: failed to create database file %q: %v\n", file, err)
+ os.Exit(1)
+ }
+
+ c := &SQLiteStore{db: db}
+ c.migrate()
+
+ return c
+}
+
+type SQLiteStore struct {
+ db *sql.DB
+}
+
+func (s *SQLiteStore) Get(key string) ([]byte, bool) {
+ var value []byte
+ if err := s.db.QueryRow(`SELECT value FROM cache WHERE key = ? LIMIT 1`, key).Scan(&value); err != nil {
+ return nil, false
+ }
+ return value, true
+}
+
+func (s *SQLiteStore) Set(key string, value []byte) {
+ if _, err := s.db.Exec(`INSERT INTO cache (key, value) VALUES (?, ?)`, key, value); err != nil {
+ log.Printf("cache: failed to insert cache key %q: %v\n", key, value)
+ }
+}
+
+func (s *SQLiteStore) Close() {
+ s.db.Close()
+}
+
+func (s *SQLiteStore) migrate() {
+ if _, err := s.db.Exec(`CREATE TABLE IF NOT EXISTS cache (key TEXT, value BLOB)`); err != nil {
+ log.Printf("cache: failed to create cache table: %v\n", err)
+ os.Exit(1)
+ }
+ if _, err := s.db.Exec(`CREATE UNIQUE INDEX IF NOT EXISTS cache_key_idx ON cache(key)`); err != nil {
+ log.Printf("cache: failed to create cache index: %v\n", err)
+ os.Exit(1)
+ }
+}
diff --git a/modules/cache/sqlitestore_test.go b/modules/cache/sqlitestore_test.go
new file mode 100644
index 0000000..769a69f
--- /dev/null
+++ b/modules/cache/sqlitestore_test.go
@@ -0,0 +1,32 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package cache_test
+
+import (
+ "os"
+ "testing"
+
+ "github.com/philippta/flyscrape/modules/cache"
+ "github.com/stretchr/testify/require"
+)
+
+func TestSQLiteStore(t *testing.T) {
+ dir, err := os.MkdirTemp("", "sqlitestore")
+ require.NoError(t, err)
+ defer os.RemoveAll(dir)
+
+ store := cache.NewSQLiteStore(dir + "/test.db")
+
+ v, ok := store.Get("foo")
+ require.Nil(t, v)
+ require.False(t, ok)
+
+ store.Set("foo", []byte("bar"))
+
+ v, ok = store.Get("foo")
+ require.NotNil(t, v)
+ require.True(t, ok)
+ require.Equal(t, []byte("bar"), v)
+}
diff --git a/modules/ratelimit/ratelimit.go b/modules/ratelimit/ratelimit.go
index 9588db3..b23cd7a 100644
--- a/modules/ratelimit/ratelimit.go
+++ b/modules/ratelimit/ratelimit.go
@@ -5,6 +5,7 @@
package ratelimit
import (
+ "net/http"
"time"
"github.com/philippta/flyscrape"
@@ -45,11 +46,14 @@ func (m *Module) Provision(v flyscrape.Context) {
}()
}
-func (m *Module) BuildRequest(_ *flyscrape.Request) {
+func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper {
if m.disabled() {
- return
+ return t
}
- <-m.semaphore
+ return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) {
+ <-m.semaphore
+ return t.RoundTrip(r)
+ })
}
func (m *Module) Finalize() {
@@ -64,7 +68,7 @@ func (m *Module) disabled() bool {
}
var (
- _ flyscrape.RequestBuilder = (*Module)(nil)
- _ flyscrape.Provisioner = (*Module)(nil)
- _ flyscrape.Finalizer = (*Module)(nil)
+ _ flyscrape.TransportAdapter = (*Module)(nil)
+ _ flyscrape.Provisioner = (*Module)(nil)
+ _ flyscrape.Finalizer = (*Module)(nil)
)
diff --git a/modules/ratelimit/ratelimit_test.go b/modules/ratelimit/ratelimit_test.go
index ffd061c..1fe22b1 100644
--- a/modules/ratelimit/ratelimit_test.go
+++ b/modules/ratelimit/ratelimit_test.go
@@ -23,17 +23,17 @@ func TestRatelimit(t *testing.T) {
scraper := flyscrape.NewScraper()
scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"})
scraper.LoadModule(&followlinks.Module{})
- scraper.LoadModule(&ratelimit.Module{
- Rate: 100,
- })
scraper.LoadModule(hook.Module{
AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
return flyscrape.MockTransport(200, `<a href="foo">foo</a>`)
},
- BuildRequestFn: func(r *flyscrape.Request) {
+ ReceiveResponseFn: func(r *flyscrape.Response) {
times = append(times, time.Now())
},
})
+ scraper.LoadModule(&ratelimit.Module{
+ Rate: 100,
+ })
start := time.Now()
scraper.Run()
diff --git a/scrape.go b/scrape.go
index c1257a9..00a74bf 100644
--- a/scrape.go
+++ b/scrape.go
@@ -18,6 +18,7 @@ import (
type FetchFunc func(url string) (string, error)
type Context interface {
+ ScriptName() string
Visit(url string)
MarkVisited(url string)
MarkUnvisited(url string)
@@ -57,6 +58,7 @@ func NewScraper() *Scraper {
type Scraper struct {
ScrapeFunc ScrapeFunc
+ Script string
wg sync.WaitGroup
jobs chan target
@@ -95,6 +97,10 @@ func (s *Scraper) MarkUnvisited(url string) {
s.visited.Del(url)
}
+func (s *Scraper) ScriptName() string {
+ return s.Script
+}
+
func (s *Scraper) Run() {
for _, mod := range s.modules {
if v, ok := mod.(Provisioner); ok {
@@ -116,7 +122,7 @@ func (s *Scraper) Run() {
func (s *Scraper) initClient() {
jar, _ := cookiejar.New(nil)
- s.client = &http.Client{Jar: jar}
+ s.client = &http.Client{Jar: jar, Transport: http.DefaultTransport}
for _, mod := range s.modules {
if v, ok := mod.(TransportAdapter); ok {