diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-10-11 20:20:30 +0200 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-10-11 20:20:30 +0200 |
| commit | fb84ca746e92e371161f1e1de3b01a048a9ae979 (patch) | |
| tree | 5bb8fbb7fd654b241b389697cc46bad00ce2f8b7 | |
| parent | cd40ab75f44e9f6ac86beca576a934fd790fc9fb (diff) | |
Implement file based caching
| -rw-r--r-- | cmd/flyscrape/dev.go | 53 | ||||
| -rw-r--r-- | cmd/flyscrape/run.go | 1 | ||||
| -rw-r--r-- | go.mod | 3 | ||||
| -rw-r--r-- | go.sum | 2 | ||||
| -rw-r--r-- | module.go | 5 | ||||
| -rw-r--r-- | modules/cache/cache.go | 52 | ||||
| -rw-r--r-- | modules/cache/cache_test.go | 38 | ||||
| -rw-r--r-- | modules/cache/memstore.go | 13 | ||||
| -rw-r--r-- | modules/cache/sqlitestore.go | 60 | ||||
| -rw-r--r-- | modules/cache/sqlitestore_test.go | 32 | ||||
| -rw-r--r-- | modules/ratelimit/ratelimit.go | 16 | ||||
| -rw-r--r-- | modules/ratelimit/ratelimit_test.go | 8 | ||||
| -rw-r--r-- | scrape.go | 8 |
13 files changed, 221 insertions, 70 deletions
diff --git a/cmd/flyscrape/dev.go b/cmd/flyscrape/dev.go index fba3fba..9ddb3bf 100644 --- a/cmd/flyscrape/dev.go +++ b/cmd/flyscrape/dev.go @@ -5,9 +5,14 @@ package main import ( + "encoding/json" "flag" "fmt" "log" + "os" + "os/signal" + "path/filepath" + "syscall" "github.com/inancgumus/screen" "github.com/philippta/flyscrape" @@ -28,19 +33,28 @@ func (c *DevCommand) Run(args []string) error { } script := fs.Arg(0) + cachefile, err := newCacheFile() + if err != nil { + return fmt.Errorf("failed to create cache file: %w", err) + } + + trapsignal(func() { os.RemoveAll(cachefile) }) - err := flyscrape.Watch(script, func(s string) error { + err = flyscrape.Watch(script, func(s string) error { cfg, scrape, err := flyscrape.Compile(s) if err != nil { printCompileErr(script, err) return nil } + cfg = updateCfg(cfg, "depth", 0) + cfg = updateCfg(cfg, "cache", "file:"+cachefile) + scraper := flyscrape.NewScraper() scraper.ScrapeFunc = scrape + scraper.Script = script flyscrape.LoadModules(scraper, cfg) - scraper.DisableModule("followlinks") screen.Clear() screen.MoveTopLeft() @@ -84,3 +98,38 @@ func printCompileErr(script string, err error) { log.Println(err) } } + +func updateCfg(cfg flyscrape.Config, key string, value any) flyscrape.Config { + var m map[string]any + if err := json.Unmarshal(cfg, &m); err != nil { + return cfg + } + + m[key] = value + + b, err := json.Marshal(m) + if err != nil { + return cfg + } + + return b +} + +func newCacheFile() (string, error) { + cachedir, err := os.MkdirTemp("", "flyscrape-cache") + if err != nil { + return "", err + } + return filepath.Join(cachedir, "dev.cache"), nil +} + +func trapsignal(f func()) { + sig := make(chan os.Signal, 2) + signal.Notify(sig, os.Interrupt, syscall.SIGTERM) + + go func() { + <-sig + f() + os.Exit(0) + }() +} diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go index b467abe..039574b 100644 --- a/cmd/flyscrape/run.go +++ b/cmd/flyscrape/run.go @@ -42,6 +42,7 @@ func (c *RunCommand) Run(args []string) error { scraper := flyscrape.NewScraper() scraper.ScrapeFunc = scrape + scraper.Script = script flyscrape.LoadModules(scraper, cfg) @@ -8,8 +8,10 @@ require ( github.com/evanw/esbuild v0.18.14 github.com/fsnotify/fsnotify v1.6.0 github.com/inancgumus/screen v0.0.0-20190314163918-06e984b86ed3 + github.com/mattn/go-sqlite3 v1.14.17 github.com/nlnwa/whatwg-url v0.4.0 github.com/stretchr/testify v1.8.4 + go.kuoruan.net/v8go-polyfills v0.5.0 rogchap.com/v8go v0.9.0 ) @@ -20,7 +22,6 @@ require ( github.com/kr/pretty v0.3.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/rogpeppe/go-internal v1.10.0 // indirect - go.kuoruan.net/v8go-polyfills v0.5.0 // indirect golang.org/x/crypto v0.0.0-20210921155107-089bfa567519 // indirect golang.org/x/net v0.10.0 // indirect golang.org/x/sys v0.8.0 // indirect @@ -22,6 +22,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mattn/go-sqlite3 v1.14.17 h1:mCRHCLDUBXgpKAqIKsaAaAsrAlbkeomtRFKXh2L6YIM= +github.com/mattn/go-sqlite3 v1.14.17/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg= github.com/nlnwa/whatwg-url v0.4.0 h1:B3kFb5EL7KILeBkhrlQvFi41Ex0p4ropVA9brt5ungI= github.com/nlnwa/whatwg-url v0.4.0/go.mod h1:pLzpJjFPtA+n7RCLvp0GBxvDHa/2ckNCBK9mfEeNOMQ= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= @@ -89,13 +89,16 @@ var ( modulesMu sync.RWMutex moduleOrder = []string{ + // Transport Adapters + "ratelimit", "cache", + + // Rest "starturl", "followlinks", "depth", "domainfilter", "urlfilter", - "ratelimit", "jsonprint", } ) diff --git a/modules/cache/cache.go b/modules/cache/cache.go index 1a321be..10762f9 100644 --- a/modules/cache/cache.go +++ b/modules/cache/cache.go @@ -9,8 +9,9 @@ import ( "bytes" "net/http" "net/http/httputil" + "path/filepath" + "strings" - "github.com/cornelk/hashmap" "github.com/philippta/flyscrape" ) @@ -21,7 +22,7 @@ func init() { type Module struct { Cache string `json:"cache"` - cache *hashmap.Map[string, []byte] + store Store } func (Module) ModuleInfo() flyscrape.ModuleInfo { @@ -31,24 +32,28 @@ func (Module) ModuleInfo() flyscrape.ModuleInfo { } } -func (m *Module) Provision(flyscrape.Context) { - if m.disabled() { - return - } - if m.cache == nil { - m.cache = hashmap.New[string, []byte]() +func (m *Module) Provision(ctx flyscrape.Context) { + switch { + case m.Cache == "memory": + m.store = NewMemStore() + + case m.Cache == "file": + file := replaceExt(ctx.ScriptName(), ".cache") + m.store = NewSQLiteStore(file) + + case strings.HasPrefix(m.Cache, "file:"): + m.store = NewSQLiteStore(strings.TrimPrefix(m.Cache, "file:")) } } func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { - if m.disabled() { + if m.store == nil { return t } - return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { - key := cacheKey(r) + key := r.Method + " " + r.URL.String() - if b, ok := m.cache.Get(key); ok { + if b, ok := m.store.Get(key); ok { if resp, err := http.ReadResponse(bufio.NewReader(bytes.NewReader(b)), r); err == nil { return resp, nil } @@ -64,15 +69,28 @@ func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { return resp, err } - m.cache.Set(key, encoded) + m.store.Set(key, encoded) return resp, nil }) } -func (m *Module) disabled() bool { - return m.Cache == "" +func (m *Module) Finalize() { + if v, ok := m.store.(interface{ Close() }); ok { + v.Close() + } +} + +func replaceExt(filePath string, newExt string) string { + ext := filepath.Ext(filePath) + if ext != "" { + fileNameWithoutExt := filePath[:len(filePath)-len(ext)] + newFilePath := fileNameWithoutExt + newExt + return newFilePath + } + return filePath + newExt } -func cacheKey(r *http.Request) string { - return r.Method + " " + r.URL.String() +type Store interface { + Get(key string) ([]byte, bool) + Set(key string, value []byte) } diff --git a/modules/cache/cache_test.go b/modules/cache/cache_test.go deleted file mode 100644 index 4565e00..0000000 --- a/modules/cache/cache_test.go +++ /dev/null @@ -1,38 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at http://mozilla.org/MPL/2.0/. - -package cache_test - -import ( - "net/http" - "testing" - - "github.com/philippta/flyscrape" - "github.com/philippta/flyscrape/modules/cache" - "github.com/philippta/flyscrape/modules/hook" - "github.com/philippta/flyscrape/modules/starturl" - "github.com/stretchr/testify/require" -) - -func TestCache(t *testing.T) { - cachemod := &cache.Module{Cache: "memory"} - calls := 0 - - for i := 0; i < 2; i++ { - scraper := flyscrape.NewScraper() - scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"}) - scraper.LoadModule(hook.Module{ - AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { - return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { - calls++ - return flyscrape.MockResponse(200, "foo") - }) - }, - }) - scraper.LoadModule(cachemod) - scraper.Run() - } - - require.Equal(t, 1, calls) -} diff --git a/modules/cache/memstore.go b/modules/cache/memstore.go new file mode 100644 index 0000000..0f4d9e2 --- /dev/null +++ b/modules/cache/memstore.go @@ -0,0 +1,13 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package cache + +import ( + "github.com/cornelk/hashmap" +) + +func NewMemStore() *hashmap.Map[string, []byte] { + return hashmap.New[string, []byte]() +} diff --git a/modules/cache/sqlitestore.go b/modules/cache/sqlitestore.go new file mode 100644 index 0000000..50c8007 --- /dev/null +++ b/modules/cache/sqlitestore.go @@ -0,0 +1,60 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package cache + +import ( + "database/sql" + "fmt" + "log" + "os" + + _ "github.com/mattn/go-sqlite3" +) + +func NewSQLiteStore(file string) *SQLiteStore { + db, err := sql.Open("sqlite3", fmt.Sprintf("file:%s?_timeout=5000&_journal=WAL", file)) + if err != nil { + log.Printf("cache: failed to create database file %q: %v\n", file, err) + os.Exit(1) + } + + c := &SQLiteStore{db: db} + c.migrate() + + return c +} + +type SQLiteStore struct { + db *sql.DB +} + +func (s *SQLiteStore) Get(key string) ([]byte, bool) { + var value []byte + if err := s.db.QueryRow(`SELECT value FROM cache WHERE key = ? LIMIT 1`, key).Scan(&value); err != nil { + return nil, false + } + return value, true +} + +func (s *SQLiteStore) Set(key string, value []byte) { + if _, err := s.db.Exec(`INSERT INTO cache (key, value) VALUES (?, ?)`, key, value); err != nil { + log.Printf("cache: failed to insert cache key %q: %v\n", key, value) + } +} + +func (s *SQLiteStore) Close() { + s.db.Close() +} + +func (s *SQLiteStore) migrate() { + if _, err := s.db.Exec(`CREATE TABLE IF NOT EXISTS cache (key TEXT, value BLOB)`); err != nil { + log.Printf("cache: failed to create cache table: %v\n", err) + os.Exit(1) + } + if _, err := s.db.Exec(`CREATE UNIQUE INDEX IF NOT EXISTS cache_key_idx ON cache(key)`); err != nil { + log.Printf("cache: failed to create cache index: %v\n", err) + os.Exit(1) + } +} diff --git a/modules/cache/sqlitestore_test.go b/modules/cache/sqlitestore_test.go new file mode 100644 index 0000000..769a69f --- /dev/null +++ b/modules/cache/sqlitestore_test.go @@ -0,0 +1,32 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package cache_test + +import ( + "os" + "testing" + + "github.com/philippta/flyscrape/modules/cache" + "github.com/stretchr/testify/require" +) + +func TestSQLiteStore(t *testing.T) { + dir, err := os.MkdirTemp("", "sqlitestore") + require.NoError(t, err) + defer os.RemoveAll(dir) + + store := cache.NewSQLiteStore(dir + "/test.db") + + v, ok := store.Get("foo") + require.Nil(t, v) + require.False(t, ok) + + store.Set("foo", []byte("bar")) + + v, ok = store.Get("foo") + require.NotNil(t, v) + require.True(t, ok) + require.Equal(t, []byte("bar"), v) +} diff --git a/modules/ratelimit/ratelimit.go b/modules/ratelimit/ratelimit.go index 9588db3..b23cd7a 100644 --- a/modules/ratelimit/ratelimit.go +++ b/modules/ratelimit/ratelimit.go @@ -5,6 +5,7 @@ package ratelimit import ( + "net/http" "time" "github.com/philippta/flyscrape" @@ -45,11 +46,14 @@ func (m *Module) Provision(v flyscrape.Context) { }() } -func (m *Module) BuildRequest(_ *flyscrape.Request) { +func (m *Module) AdaptTransport(t http.RoundTripper) http.RoundTripper { if m.disabled() { - return + return t } - <-m.semaphore + return flyscrape.RoundTripFunc(func(r *http.Request) (*http.Response, error) { + <-m.semaphore + return t.RoundTrip(r) + }) } func (m *Module) Finalize() { @@ -64,7 +68,7 @@ func (m *Module) disabled() bool { } var ( - _ flyscrape.RequestBuilder = (*Module)(nil) - _ flyscrape.Provisioner = (*Module)(nil) - _ flyscrape.Finalizer = (*Module)(nil) + _ flyscrape.TransportAdapter = (*Module)(nil) + _ flyscrape.Provisioner = (*Module)(nil) + _ flyscrape.Finalizer = (*Module)(nil) ) diff --git a/modules/ratelimit/ratelimit_test.go b/modules/ratelimit/ratelimit_test.go index ffd061c..1fe22b1 100644 --- a/modules/ratelimit/ratelimit_test.go +++ b/modules/ratelimit/ratelimit_test.go @@ -23,17 +23,17 @@ func TestRatelimit(t *testing.T) { scraper := flyscrape.NewScraper() scraper.LoadModule(&starturl.Module{URL: "http://www.example.com"}) scraper.LoadModule(&followlinks.Module{}) - scraper.LoadModule(&ratelimit.Module{ - Rate: 100, - }) scraper.LoadModule(hook.Module{ AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper { return flyscrape.MockTransport(200, `<a href="foo">foo</a>`) }, - BuildRequestFn: func(r *flyscrape.Request) { + ReceiveResponseFn: func(r *flyscrape.Response) { times = append(times, time.Now()) }, }) + scraper.LoadModule(&ratelimit.Module{ + Rate: 100, + }) start := time.Now() scraper.Run() @@ -18,6 +18,7 @@ import ( type FetchFunc func(url string) (string, error) type Context interface { + ScriptName() string Visit(url string) MarkVisited(url string) MarkUnvisited(url string) @@ -57,6 +58,7 @@ func NewScraper() *Scraper { type Scraper struct { ScrapeFunc ScrapeFunc + Script string wg sync.WaitGroup jobs chan target @@ -95,6 +97,10 @@ func (s *Scraper) MarkUnvisited(url string) { s.visited.Del(url) } +func (s *Scraper) ScriptName() string { + return s.Script +} + func (s *Scraper) Run() { for _, mod := range s.modules { if v, ok := mod.(Provisioner); ok { @@ -116,7 +122,7 @@ func (s *Scraper) Run() { func (s *Scraper) initClient() { jar, _ := cookiejar.New(nil) - s.client = &http.Client{Jar: jar} + s.client = &http.Client{Jar: jar, Transport: http.DefaultTransport} for _, mod := range s.modules { if v, ok := mod.(TransportAdapter); ok { |