diff options
| -rw-r--r-- | README.md | 30 | ||||
| -rw-r--r-- | cmd/args.go | 97 | ||||
| -rw-r--r-- | cmd/args_test.go | 75 | ||||
| -rw-r--r-- | cmd/dev.go (renamed from cmd/flyscrape/dev.go) | 19 | ||||
| -rw-r--r-- | cmd/flyscrape/main.go | 52 | ||||
| -rw-r--r-- | cmd/main.go | 76 | ||||
| -rw-r--r-- | cmd/new.go (renamed from cmd/flyscrape/new.go) | 2 | ||||
| -rw-r--r-- | cmd/run.go (renamed from cmd/flyscrape/run.go) | 22 | ||||
| -rw-r--r-- | flyscrape.go | 38 | ||||
| -rw-r--r-- | go.mod | 6 | ||||
| -rw-r--r-- | go.sum | 10 | ||||
| -rw-r--r-- | watch.go | 4 |
12 files changed, 349 insertions, 82 deletions
@@ -37,6 +37,16 @@ ```javascript export const config = { url: "https://news.ycombinator.com/", + // urls: [] // Specify additional URLs to start from. (default = none) + // depth: 0, // Specify how deep links should be followed. (default = 0, no follow) + // follow: [], // Specify the css selectors to follow (default = ["a[href]"]) + // allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url) + // blockedDomains: [], // Specify the blocked domains. (default = none) + // allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed) + // blockedURLs: [], // Specify the blocked URLs as regex. (default = none) + // rate: 100, // Specify the rate in requests per second. (default = no rate limit) + // proxies: [], // Specify the HTTP(S) proxy URLs. (default = no proxy) + // cache: "file", // Enable file-based request caching. (default = no cache) } export default function ({ doc, absoluteURL }) { @@ -99,17 +109,23 @@ To compile flyscrape from source, follow these steps: ## Usage ``` -flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites. - Usage: - flyscrape <command> [arguments] + flyscrape run SCRIPT [config flags] + +Examples: + + # Run the script. + $ flyscrape run example.js + + # Set the URL as argument. + $ flyscrape run example.js --url "http://other.com" -Commands: + # Enable proxy support. + $ flyscrape run example.js --proxies "http://someproxy:8043" - new creates a sample scraping script - run runs a scraping script - dev watches and re-runs a scraping script + # Follow paginated links. + $ flyscrape run example.js --depth 5 --follow ".next-button > a" ``` ## Configuration diff --git a/cmd/args.go b/cmd/args.go new file mode 100644 index 0000000..e11308b --- /dev/null +++ b/cmd/args.go @@ -0,0 +1,97 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package cmd + +import ( + "fmt" + "slices" + "strconv" + "strings" +) + +var arrayFields = []string{ + "urls", + "follow", + "allowedDomains", + "blockedDomains", + "allowedURLs", + "blockedURLs", + "proxies", +} + +func parseConfigArgs(args []string) (map[string]any, error) { + updates := map[string]any{} + + flag := "" + for _, arg := range normalizeArgs(args) { + if flag == "" && !isFlag(arg) { + return nil, fmt.Errorf("expected flag, got %q instead", arg) + } + + if flag != "" && isFlag(arg) { + updates[flag[2:]] = true + flag = "" + continue + } + + if flag != "" { + if v, ok := updates[flag[2:]]; ok { + if vv, ok := v.([]any); ok { + updates[flag[2:]] = append(vv, parseArg(arg)) + } else { + updates[flag[2:]] = []any{v, parseArg(arg)} + } + } else { + if slices.Contains(arrayFields, flag[2:]) { + updates[flag[2:]] = []any{parseArg(arg)} + } else { + updates[flag[2:]] = parseArg(arg) + } + } + flag = "" + continue + } + + flag = arg + } + + if flag != "" { + updates[flag[2:]] = true + flag = "" + } + + return updates, nil +} + +func normalizeArgs(args []string) []string { + var norm []string + + for _, arg := range args { + if !strings.HasPrefix(arg, "--") { + norm = append(norm, arg) + } else { + norm = append(norm, strings.SplitN(arg, "=", 2)...) + } + } + + return norm +} + +func parseArg(arg string) any { + if arg == "true" { + return true + } + if arg == "false" { + return false + } + if num, err := strconv.Atoi(arg); err == nil { + return num + } + return arg +} + +func isFlag(arg string) bool { + return strings.HasPrefix(arg, "--") +} diff --git a/cmd/args_test.go b/cmd/args_test.go new file mode 100644 index 0000000..3153fd8 --- /dev/null +++ b/cmd/args_test.go @@ -0,0 +1,75 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package cmd + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestParseConfigUpdates(t *testing.T) { + tests := []struct { + flags string + err bool + updates map[string]any + }{ + { + flags: `--foo bar`, + updates: map[string]any{"foo": "bar"}, + }, + { + flags: `--foo=bar`, + updates: map[string]any{"foo": "bar"}, + }, + { + flags: `--foo`, + updates: map[string]any{"foo": true}, + }, + { + flags: `--foo false`, + updates: map[string]any{"foo": false}, + }, + { + flags: `--foo a --foo b`, + updates: map[string]any{"foo": []any{"a", "b"}}, + }, + { + flags: `--foo a --foo=b`, + updates: map[string]any{"foo": []any{"a", "b"}}, + }, + { + flags: `--foo 69`, + updates: map[string]any{"foo": 69}, + }, + { + flags: `--foo.bar a`, + updates: map[string]any{"foo.bar": "a"}, + }, + { + flags: `foo`, + err: true, + }, + { + flags: `--foo a b`, + err: true, + }, + } + for _, test := range tests { + t.Run(test.flags, func(t *testing.T) { + args, err := parseConfigArgs(strings.Fields(test.flags)) + + if test.err { + require.Error(t, err) + require.Empty(t, args) + return + } + + require.NoError(t, err) + require.Equal(t, test.updates, args) + }) + } +} diff --git a/cmd/flyscrape/dev.go b/cmd/dev.go index 84a436b..e7edbf8 100644 --- a/cmd/flyscrape/dev.go +++ b/cmd/dev.go @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-package main +package cmd import ( "flag" @@ -22,11 +22,14 @@ func (c *DevCommand) Run(args []string) error { } else if fs.NArg() == 0 || fs.Arg(0) == "" { c.Usage() return flag.ErrHelp - } else if fs.NArg() > 1 { - return fmt.Errorf("too many arguments") } - return flyscrape.Dev(fs.Arg(0)) + cfg, err := parseConfigArgs(fs.Args()[1:]) + if err != nil { + return fmt.Errorf("Error parsing config flags: %w", err) + } + + return flyscrape.Dev(fs.Arg(0), cfg) } func (c *DevCommand) Usage() { @@ -36,11 +39,17 @@ Recursive scraping is disabled in this mode, only the initial URL will be scrape Usage: - flyscrape dev SCRIPT + flyscrape dev SCRIPT [config flags] Examples: # Run and watch script. $ flyscrape dev example.js + + # Set the URL as argument. + $ flyscrape dev example.js --url "http://other.com" + + # Enable proxy support. + $ flyscrape dev example.js --proxies "http://someproxy:8043" `[1:]) } diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go index 5ea140a..8b9d430 100644 --- a/cmd/flyscrape/main.go +++ b/cmd/flyscrape/main.go @@ -7,11 +7,10 @@ package main import ( _ "embed" "flag" - "fmt" "log" "os" - "strings" + "github.com/philippta/flyscrape/cmd" _ "github.com/philippta/flyscrape/modules/cache" _ "github.com/philippta/flyscrape/modules/depth" _ "github.com/philippta/flyscrape/modules/domainfilter" @@ -26,51 +25,10 @@ import ( func main() { log.SetFlags(0) - m := &Main{} - if err := m.Run(os.Args[1:]); err == flag.ErrHelp { - os.Exit(1) - } else if err != nil { - log.Println(err) - os.Exit(1) - } -} - -type Main struct{} - -func (m *Main) Run(args []string) error { - var cmd string - if len(args) > 0 { - cmd, args = args[0], args[1:] - } - - switch cmd { - case "new": - return (&NewCommand{}).Run(args) - case "run": - return (&RunCommand{}).Run(args) - case "dev": - return (&DevCommand{}).Run(args) - default: - if cmd == "" || cmd == "help" || strings.HasPrefix(cmd, "-") { - m.Usage() - return flag.ErrHelp + if err := (&cmd.Main{}).Run(os.Args[1:]); 
err != nil { + if err != flag.ErrHelp { + log.Println(err) } - return fmt.Errorf("flyscrape %s: unknown command", cmd) + os.Exit(1) } } - -func (m *Main) Usage() { - fmt.Println(` -flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites. - -Usage: - - flyscrape <command> [arguments] - -Commands: - - new creates a sample scraping script - run runs a scraping script - dev watches and re-runs a scraping script -`[1:]) -} diff --git a/cmd/main.go b/cmd/main.go new file mode 100644 index 0000000..7c49dbf --- /dev/null +++ b/cmd/main.go @@ -0,0 +1,76 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package cmd + +import ( + _ "embed" + "flag" + "fmt" + "log" + "os" + "strings" + + _ "github.com/philippta/flyscrape/modules/cache" + _ "github.com/philippta/flyscrape/modules/depth" + _ "github.com/philippta/flyscrape/modules/domainfilter" + _ "github.com/philippta/flyscrape/modules/followlinks" + _ "github.com/philippta/flyscrape/modules/jsonprint" + _ "github.com/philippta/flyscrape/modules/proxy" + _ "github.com/philippta/flyscrape/modules/ratelimit" + _ "github.com/philippta/flyscrape/modules/starturl" + _ "github.com/philippta/flyscrape/modules/urlfilter" +) + +func main() { + log.SetFlags(0) + + m := &Main{} + if err := m.Run(os.Args[1:]); err == flag.ErrHelp { + os.Exit(1) + } else if err != nil { + log.Println(err) + os.Exit(1) + } +} + +type Main struct{} + +func (m *Main) Run(args []string) error { + var cmd string + if len(args) > 0 { + cmd, args = args[0], args[1:] + } + + switch cmd { + case "new": + return (&NewCommand{}).Run(args) + case "run": + return (&RunCommand{}).Run(args) + case "dev": + return (&DevCommand{}).Run(args) + default: + if cmd == "" || cmd == "help" || strings.HasPrefix(cmd, "-") { + m.Usage() + return flag.ErrHelp + } + return 
fmt.Errorf("flyscrape %s: unknown command", cmd) + } +} + +func (m *Main) Usage() { + fmt.Println(` +flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites. + +Usage: + + flyscrape <command> [arguments] + +Commands: + + new creates a sample scraping script + run runs a scraping script + dev watches and re-runs a scraping script +`[1:]) +} diff --git a/cmd/flyscrape/new.go b/cmd/new.go index 4ab248e..1da962d 100644 --- a/cmd/flyscrape/new.go +++ b/cmd/new.go @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -package main +package cmd import ( "flag" diff --git a/cmd/flyscrape/run.go b/cmd/run.go index 7a8930a..8c1a39d 100644 --- a/cmd/flyscrape/run.go +++ b/cmd/run.go @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -package main +package cmd import ( "flag" @@ -22,11 +22,14 @@ func (c *RunCommand) Run(args []string) error { } else if fs.NArg() == 0 || fs.Arg(0) == "" { c.Usage() return flag.ErrHelp - } else if fs.NArg() > 1 { - return fmt.Errorf("too many arguments") } - return flyscrape.Run(fs.Arg(0)) + cfg, err := parseConfigArgs(fs.Args()[1:]) + if err != nil { + return fmt.Errorf("Error parsing config flags: %w", err) + } + + return flyscrape.Run(fs.Arg(0), cfg) } func (c *RunCommand) Usage() { @@ -35,11 +38,20 @@ The run command runs the scraping script. Usage: - flyscrape run SCRIPT + flyscrape run SCRIPT [config flags] Examples: # Run the script. $ flyscrape run example.js + + # Set the URL as argument. + $ flyscrape run example.js --url "http://other.com" + + # Enable proxy support. + $ flyscrape run example.js --proxies "http://someproxy:8043" + + # Follow paginated links. 
+ $ flyscrape run example.js --depth 5 --follow ".next-button > a" `[1:]) } diff --git a/flyscrape.go b/flyscrape.go index a96d37a..797d4c7 100644 --- a/flyscrape.go +++ b/flyscrape.go @@ -5,7 +5,6 @@ package flyscrape import ( - "encoding/json" "fmt" "log" "net/http" @@ -15,9 +14,10 @@ import ( "syscall" "github.com/inancgumus/screen" + "github.com/tidwall/sjson" ) -func Run(file string) error { +func Run(file string, overrides map[string]any) error { src, err := os.ReadFile(file) if err != nil { return fmt.Errorf("failed to read script %q: %w", file, err) @@ -33,18 +33,21 @@ func Run(file string) error { return fmt.Errorf("failed to compile script: %w", err) } + cfg := exports.Config() + cfg = updateCfgMultiple(cfg, overrides) + scraper := NewScraper() scraper.ScrapeFunc = exports.Scrape scraper.SetupFunc = exports.Setup scraper.Script = file scraper.Client = client - scraper.Modules = LoadModules(exports.Config()) + scraper.Modules = LoadModules(cfg) scraper.Run() return nil } -func Dev(file string) error { +func Dev(file string, overrides map[string]any) error { cachefile, err := newCacheFile() if err != nil { return fmt.Errorf("failed to create cache file: %w", err) @@ -67,6 +70,7 @@ func Dev(file string) error { } cfg := exports.Config() + cfg = updateCfgMultiple(cfg, overrides) cfg = updateCfg(cfg, "depth", 0) cfg = updateCfg(cfg, "cache", "file:"+cachefile) @@ -104,19 +108,11 @@ func printCompileErr(script string, err error) { } func updateCfg(cfg Config, key string, value any) Config { - var m map[string]any - if err := json.Unmarshal(cfg, &m); err != nil { - return cfg - } - - m[key] = value - - b, err := json.Marshal(m) + newcfg, err := sjson.Set(string(cfg), key, value) if err != nil { return cfg } - - return b + return Config(newcfg) } func newCacheFile() (string, error) { @@ -137,3 +133,17 @@ func trapsignal(f func()) { os.Exit(0) }() } + +func updateCfgMultiple(cfg Config, updates map[string]any) Config { + c := string(cfg) + + for k, v := range 
updates { + nc, err := sjson.Set(c, k, v) + if err != nil { + continue + } + c = nc + } + + return []byte(c) +} @@ -13,6 +13,8 @@ require ( github.com/mattn/go-sqlite3 v1.14.17 github.com/nlnwa/whatwg-url v0.4.0 github.com/stretchr/testify v1.8.4 + github.com/tidwall/sjson v1.2.5 + golang.org/x/sync v0.5.0 ) require ( @@ -25,9 +27,11 @@ require ( github.com/kr/pretty v0.3.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/rogpeppe/go-internal v1.10.0 // indirect + github.com/tidwall/gjson v1.17.0 // indirect + github.com/tidwall/match v1.1.1 // indirect + github.com/tidwall/pretty v1.2.1 // indirect golang.org/x/crypto v0.0.0-20210921155107-089bfa567519 // indirect golang.org/x/net v0.10.0 // indirect - golang.org/x/sync v0.5.0 // indirect golang.org/x/sys v0.8.0 // indirect golang.org/x/term v0.8.0 // indirect golang.org/x/text v0.9.0 // indirect @@ -56,6 +56,16 @@ github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjR github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/gjson v1.17.0 h1:/Jocvlh98kcTfpN2+JzGQWQcqrPQwDrVEMApx/M5ZwM= +github.com/tidwall/gjson v1.17.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 
h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519 h1:7I4JAnoQBe7ZtJcBaYHi5UtiO8tQHbUSXxL+pnGRANg= @@ -58,14 +58,14 @@ func Watch(path string, fn func(string) error) error { if errors.Is(err, StopWatch) { return nil } - return err + return nil } case err, ok := <-watcher.Errors: if !ok { return nil } if err != nil { - return err + return fmt.Errorf("watcher: %w", err) } } } |