diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-11-18 22:49:26 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-11-18 22:49:26 +0100 |
| commit | 6aa52bdbe2cefdbc9219abfb4399afa0d492913d (patch) | |
| tree | 02c743b7d9393dbf024e14adada73c6594bdd34a /cmd | |
| parent | 94da9293f63e46712b0a890e1e0eab4153fdb3f9 (diff) | |
Support passing config options as CLI arguments (#15)
Diffstat (limited to 'cmd')
| -rw-r--r-- | cmd/args.go | 97 | ||||
| -rw-r--r-- | cmd/args_test.go | 75 | ||||
| -rw-r--r-- | cmd/dev.go (renamed from cmd/flyscrape/dev.go) | 19 | ||||
| -rw-r--r-- | cmd/flyscrape/main.go | 52 | ||||
| -rw-r--r-- | cmd/main.go | 76 | ||||
| -rw-r--r-- | cmd/new.go (renamed from cmd/flyscrape/new.go) | 2 | ||||
| -rw-r--r-- | cmd/run.go (renamed from cmd/flyscrape/run.go) | 22 |
7 files changed, 285 insertions, 58 deletions
diff --git a/cmd/args.go b/cmd/args.go new file mode 100644 index 0000000..e11308b --- /dev/null +++ b/cmd/args.go @@ -0,0 +1,97 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package cmd + +import ( + "fmt" + "slices" + "strconv" + "strings" +) + +var arrayFields = []string{ + "urls", + "follow", + "allowedDomains", + "blockedDomains", + "allowedURLs", + "blockedURLs", + "proxies", +} + +func parseConfigArgs(args []string) (map[string]any, error) { + updates := map[string]any{} + + flag := "" + for _, arg := range normalizeArgs(args) { + if flag == "" && !isFlag(arg) { + return nil, fmt.Errorf("expected flag, got %q instead", arg) + } + + if flag != "" && isFlag(arg) { + updates[flag[2:]] = true + flag = "" + continue + } + + if flag != "" { + if v, ok := updates[flag[2:]]; ok { + if vv, ok := v.([]any); ok { + updates[flag[2:]] = append(vv, parseArg(arg)) + } else { + updates[flag[2:]] = []any{v, parseArg(arg)} + } + } else { + if slices.Contains(arrayFields, flag[2:]) { + updates[flag[2:]] = []any{parseArg(arg)} + } else { + updates[flag[2:]] = parseArg(arg) + } + } + flag = "" + continue + } + + flag = arg + } + + if flag != "" { + updates[flag[2:]] = true + flag = "" + } + + return updates, nil +} + +func normalizeArgs(args []string) []string { + var norm []string + + for _, arg := range args { + if !strings.HasPrefix(arg, "--") { + norm = append(norm, arg) + } else { + norm = append(norm, strings.SplitN(arg, "=", 2)...) + } + } + + return norm +} + +func parseArg(arg string) any { + if arg == "true" { + return true + } + if arg == "false" { + return false + } + if num, err := strconv.Atoi(arg); err == nil { + return num + } + return arg +} + +func isFlag(arg string) bool { + return strings.HasPrefix(arg, "--") +} diff --git a/cmd/args_test.go b/cmd/args_test.go new file mode 100644 index 0000000..3153fd8 --- /dev/null +++ b/cmd/args_test.go @@ -0,0 +1,75 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package cmd + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestParseConfigUpdates(t *testing.T) { + tests := []struct { + flags string + err bool + updates map[string]any + }{ + { + flags: `--foo bar`, + updates: map[string]any{"foo": "bar"}, + }, + { + flags: `--foo=bar`, + updates: map[string]any{"foo": "bar"}, + }, + { + flags: `--foo`, + updates: map[string]any{"foo": true}, + }, + { + flags: `--foo false`, + updates: map[string]any{"foo": false}, + }, + { + flags: `--foo a --foo b`, + updates: map[string]any{"foo": []any{"a", "b"}}, + }, + { + flags: `--foo a --foo=b`, + updates: map[string]any{"foo": []any{"a", "b"}}, + }, + { + flags: `--foo 69`, + updates: map[string]any{"foo": 69}, + }, + { + flags: `--foo.bar a`, + updates: map[string]any{"foo.bar": "a"}, + }, + { + flags: `foo`, + err: true, + }, + { + flags: `--foo a b`, + err: true, + }, + } + for _, test := range tests { + t.Run(test.flags, func(t *testing.T) { + args, err := parseConfigArgs(strings.Fields(test.flags)) + + if test.err { + require.Error(t, err) + require.Empty(t, args) + return + } + + require.NoError(t, err) + require.Equal(t, test.updates, args) + }) + } +} diff --git a/cmd/flyscrape/dev.go b/cmd/dev.go index 84a436b..e7edbf8 100644 --- a/cmd/flyscrape/dev.go +++ b/cmd/dev.go @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -package main +package cmd import ( "flag" @@ -22,11 +22,14 @@ func (c *DevCommand) Run(args []string) error { } else if fs.NArg() == 0 || fs.Arg(0) == "" { c.Usage() return flag.ErrHelp - } else if fs.NArg() > 1 { - return fmt.Errorf("too many arguments") } - return flyscrape.Dev(fs.Arg(0)) + cfg, err := parseConfigArgs(fs.Args()[1:]) + if err != nil { + return fmt.Errorf("Error parsing config flags: %w", err) + } + + return flyscrape.Dev(fs.Arg(0), cfg) } func (c *DevCommand) Usage() { @@ -36,11 +39,17 @@ Recursive scraping is disabled in this mode, only the initial URL will be scrape Usage: - flyscrape dev SCRIPT + flyscrape dev SCRIPT [config flags] Examples: # Run and watch script. $ flyscrape dev example.js + + # Set the URL as argument. + $ flyscrape dev example.js --url "http://other.com" + + # Enable proxy support. + $ flyscrape dev example.js --proxies "http://someproxy:8043" `[1:]) } diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go index 5ea140a..8b9d430 100644 --- a/cmd/flyscrape/main.go +++ b/cmd/flyscrape/main.go @@ -7,11 +7,10 @@ package main import ( _ "embed" "flag" - "fmt" "log" "os" - "strings" + "github.com/philippta/flyscrape/cmd" _ "github.com/philippta/flyscrape/modules/cache" _ "github.com/philippta/flyscrape/modules/depth" _ "github.com/philippta/flyscrape/modules/domainfilter" @@ -26,51 +25,10 @@ import ( func main() { log.SetFlags(0) - m := &Main{} - if err := m.Run(os.Args[1:]); err == flag.ErrHelp { - os.Exit(1) - } else if err != nil { - log.Println(err) - os.Exit(1) - } -} - -type Main struct{} - -func (m *Main) Run(args []string) error { - var cmd string - if len(args) > 0 { - cmd, args = args[0], args[1:] - } - - switch cmd { - case "new": - return (&NewCommand{}).Run(args) - case "run": - return (&RunCommand{}).Run(args) - case "dev": - return (&DevCommand{}).Run(args) - default: - if cmd == "" || cmd == "help" || strings.HasPrefix(cmd, "-") { - m.Usage() - return flag.ErrHelp + if err := (&cmd.Main{}).Run(os.Args[1:]); err != nil { + if err != flag.ErrHelp { + log.Println(err) } - return fmt.Errorf("flyscrape %s: unknown command", cmd) + os.Exit(1) } } - -func (m *Main) Usage() { - fmt.Println(` -flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites. - -Usage: - - flyscrape <command> [arguments] - -Commands: - - new creates a sample scraping script - run runs a scraping script - dev watches and re-runs a scraping script -`[1:]) -} diff --git a/cmd/main.go b/cmd/main.go new file mode 100644 index 0000000..7c49dbf --- /dev/null +++ b/cmd/main.go @@ -0,0 +1,76 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package cmd + +import ( + _ "embed" + "flag" + "fmt" + "log" + "os" + "strings" + + _ "github.com/philippta/flyscrape/modules/cache" + _ "github.com/philippta/flyscrape/modules/depth" + _ "github.com/philippta/flyscrape/modules/domainfilter" + _ "github.com/philippta/flyscrape/modules/followlinks" + _ "github.com/philippta/flyscrape/modules/jsonprint" + _ "github.com/philippta/flyscrape/modules/proxy" + _ "github.com/philippta/flyscrape/modules/ratelimit" + _ "github.com/philippta/flyscrape/modules/starturl" + _ "github.com/philippta/flyscrape/modules/urlfilter" +) + +func main() { + log.SetFlags(0) + + m := &Main{} + if err := m.Run(os.Args[1:]); err == flag.ErrHelp { + os.Exit(1) + } else if err != nil { + log.Println(err) + os.Exit(1) + } +} + +type Main struct{} + +func (m *Main) Run(args []string) error { + var cmd string + if len(args) > 0 { + cmd, args = args[0], args[1:] + } + + switch cmd { + case "new": + return (&NewCommand{}).Run(args) + case "run": + return (&RunCommand{}).Run(args) + case "dev": + return (&DevCommand{}).Run(args) + default: + if cmd == "" || cmd == "help" || strings.HasPrefix(cmd, "-") { + m.Usage() + return flag.ErrHelp + } + return fmt.Errorf("flyscrape %s: unknown command", cmd) + } +} + +func (m *Main) Usage() { + fmt.Println(` +flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites. + +Usage: + + flyscrape <command> [arguments] + +Commands: + + new creates a sample scraping script + run runs a scraping script + dev watches and re-runs a scraping script +`[1:]) +} diff --git a/cmd/flyscrape/new.go b/cmd/new.go index 4ab248e..1da962d 100644 --- a/cmd/flyscrape/new.go +++ b/cmd/new.go @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -package main +package cmd import ( "flag" diff --git a/cmd/flyscrape/run.go b/cmd/run.go index 7a8930a..8c1a39d 100644 --- a/cmd/flyscrape/run.go +++ b/cmd/run.go @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -package main +package cmd import ( "flag" @@ -22,11 +22,14 @@ func (c *RunCommand) Run(args []string) error { } else if fs.NArg() == 0 || fs.Arg(0) == "" { c.Usage() return flag.ErrHelp - } else if fs.NArg() > 1 { - return fmt.Errorf("too many arguments") } - return flyscrape.Run(fs.Arg(0)) + cfg, err := parseConfigArgs(fs.Args()[1:]) + if err != nil { + return fmt.Errorf("Error parsing config flags: %w", err) + } + + return flyscrape.Run(fs.Arg(0), cfg) } func (c *RunCommand) Usage() { @@ -35,11 +38,20 @@ The run command runs the scraping script. Usage: - flyscrape run SCRIPT + flyscrape run SCRIPT [config flags] Examples: # Run the script. $ flyscrape run example.js + + # Set the URL as argument. + $ flyscrape run example.js --url "http://other.com" + + # Enable proxy support. + $ flyscrape run example.js --proxies "http://someproxy:8043" + + # Follow paginated links. + $ flyscrape run example.js --depth 5 --follow ".next-button > a" `[1:]) } |