From 6aa52bdbe2cefdbc9219abfb4399afa0d492913d Mon Sep 17 00:00:00 2001 From: Philipp Tanlak Date: Sat, 18 Nov 2023 22:49:26 +0100 Subject: Support passing config options as CLI arguments (#15) --- cmd/args.go | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++ cmd/args_test.go | 75 +++++++++++++++++++++++++++++++++++++++ cmd/dev.go | 55 +++++++++++++++++++++++++++++ cmd/flyscrape/dev.go | 46 ------------------------ cmd/flyscrape/main.go | 52 +++------------------------ cmd/flyscrape/new.go | 56 ----------------------------- cmd/flyscrape/run.go | 45 ------------------------ cmd/main.go | 76 ++++++++++++++++++++++++++++++++++++++++ cmd/new.go | 56 +++++++++++++++++++++++++++++ cmd/run.go | 57 ++++++++++++++++++++++++++++++ 10 files changed, 421 insertions(+), 194 deletions(-) create mode 100644 cmd/args.go create mode 100644 cmd/args_test.go create mode 100644 cmd/dev.go delete mode 100644 cmd/flyscrape/dev.go delete mode 100644 cmd/flyscrape/new.go delete mode 100644 cmd/flyscrape/run.go create mode 100644 cmd/main.go create mode 100644 cmd/new.go create mode 100644 cmd/run.go (limited to 'cmd') diff --git a/cmd/args.go b/cmd/args.go new file mode 100644 index 0000000..e11308b --- /dev/null +++ b/cmd/args.go @@ -0,0 +1,97 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. 
// arrayFields lists the config options that are always stored as a list,
// so that even a single occurrence on the command line yields a one-element
// array instead of a scalar.
var arrayFields = []string{
	"urls",
	"follow",
	"allowedDomains",
	"blockedDomains",
	"allowedURLs",
	"blockedURLs",
	"proxies",
}

// parseConfigArgs turns command line arguments of the form "--name value",
// "--name=value" or "--name" into a map of config updates.
//
// Rules:
//   - A flag without a value is recorded as the boolean true.
//   - A flag given more than once collects its values in a []any.
//   - A flag named in arrayFields is always stored as a []any.
//   - Values are converted by parseArg (bool, int or string).
//
// An error is returned when a value appears without a preceding flag or when
// a flag has an empty name (a bare "--").
func parseConfigArgs(args []string) (map[string]any, error) {
	updates := map[string]any{}

	// pending is the name of the flag that is still waiting for its value.
	// It is empty whenever the next argument has to be a flag.
	pending := ""
	for _, arg := range normalizeArgs(args) {
		if isFlag(arg) {
			name := arg[2:]
			if name == "" {
				return nil, fmt.Errorf("invalid flag %q", arg)
			}
			if pending != "" {
				// The previous flag was not followed by a value, so it is
				// a boolean switch.
				updates[pending] = true
			}
			// Keep tracking the new flag. Resetting to "" here would drop
			// every flag that directly follows a boolean switch.
			pending = name
			continue
		}

		if pending == "" {
			return nil, fmt.Errorf("expected flag, got %q instead", arg)
		}

		addConfigValue(updates, pending, parseArg(arg))
		pending = ""
	}

	// A trailing flag without a value is a boolean switch as well.
	if pending != "" {
		updates[pending] = true
	}

	return updates, nil
}

// addConfigValue stores value under name. A repeated name is promoted to a
// []any holding all values seen so far; names listed in arrayFields start
// out as a []any right away.
func addConfigValue(updates map[string]any, name string, value any) {
	prev, seen := updates[name]
	if !seen {
		if slices.Contains(arrayFields, name) {
			updates[name] = []any{value}
		} else {
			updates[name] = value
		}
		return
	}

	if list, ok := prev.([]any); ok {
		updates[name] = append(list, value)
		return
	}
	updates[name] = []any{prev, value}
}

// normalizeArgs splits every "--name=value" argument into the two arguments
// "--name" and "value". Arguments that do not start with "--" are passed
// through unchanged.
func normalizeArgs(args []string) []string {
	var norm []string

	for _, arg := range args {
		if isFlag(arg) {
			norm = append(norm, strings.SplitN(arg, "=", 2)...)
			continue
		}
		norm = append(norm, arg)
	}

	return norm
}

// parseArg converts a raw value into a typed one: "true" and "false" become
// booleans, anything strconv.Atoi accepts becomes an int, and everything
// else is returned as the original string.
func parseArg(arg string) any {
	switch arg {
	case "true":
		return true
	case "false":
		return false
	}
	if num, err := strconv.Atoi(arg); err == nil {
		return num
	}
	return arg
}

// isFlag reports whether arg starts with the "--" flag prefix.
func isFlag(arg string) bool {
	return strings.HasPrefix(arg, "--")
}
+ +package cmd + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestParseConfigUpdates(t *testing.T) { + tests := []struct { + flags string + err bool + updates map[string]any + }{ + { + flags: `--foo bar`, + updates: map[string]any{"foo": "bar"}, + }, + { + flags: `--foo=bar`, + updates: map[string]any{"foo": "bar"}, + }, + { + flags: `--foo`, + updates: map[string]any{"foo": true}, + }, + { + flags: `--foo false`, + updates: map[string]any{"foo": false}, + }, + { + flags: `--foo a --foo b`, + updates: map[string]any{"foo": []any{"a", "b"}}, + }, + { + flags: `--foo a --foo=b`, + updates: map[string]any{"foo": []any{"a", "b"}}, + }, + { + flags: `--foo 69`, + updates: map[string]any{"foo": 69}, + }, + { + flags: `--foo.bar a`, + updates: map[string]any{"foo.bar": "a"}, + }, + { + flags: `foo`, + err: true, + }, + { + flags: `--foo a b`, + err: true, + }, + } + for _, test := range tests { + t.Run(test.flags, func(t *testing.T) { + args, err := parseConfigArgs(strings.Fields(test.flags)) + + if test.err { + require.Error(t, err) + require.Empty(t, args) + return + } + + require.NoError(t, err) + require.Equal(t, test.updates, args) + }) + } +} diff --git a/cmd/dev.go b/cmd/dev.go new file mode 100644 index 0000000..e7edbf8 --- /dev/null +++ b/cmd/dev.go @@ -0,0 +1,55 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +package cmd + +import ( + "flag" + "fmt" + + "github.com/philippta/flyscrape" +) + +type DevCommand struct{} + +func (c *DevCommand) Run(args []string) error { + fs := flag.NewFlagSet("flyscrape-dev", flag.ContinueOnError) + fs.Usage = c.Usage + + if err := fs.Parse(args); err != nil { + return err + } else if fs.NArg() == 0 || fs.Arg(0) == "" { + c.Usage() + return flag.ErrHelp + } + + cfg, err := parseConfigArgs(fs.Args()[1:]) + if err != nil { + return fmt.Errorf("Error parsing config flags: %w", err) + } + + return flyscrape.Dev(fs.Arg(0), cfg) +} + +func (c *DevCommand) Usage() { + fmt.Println(` +The dev command watches the scraping script and re-runs it on any change. +Recursive scraping is disabled in this mode, only the initial URL will be scraped. + +Usage: + + flyscrape dev SCRIPT [config flags] + +Examples: + + # Run and watch script. + $ flyscrape dev example.js + + # Set the URL as argument. + $ flyscrape dev example.js --url "http://other.com" + + # Enable proxy support. + $ flyscrape dev example.js --proxies "http://someproxy:8043" +`[1:]) +} diff --git a/cmd/flyscrape/dev.go b/cmd/flyscrape/dev.go deleted file mode 100644 index 84a436b..0000000 --- a/cmd/flyscrape/dev.go +++ /dev/null @@ -1,46 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -package main - -import ( - "flag" - "fmt" - - "github.com/philippta/flyscrape" -) - -type DevCommand struct{} - -func (c *DevCommand) Run(args []string) error { - fs := flag.NewFlagSet("flyscrape-dev", flag.ContinueOnError) - fs.Usage = c.Usage - - if err := fs.Parse(args); err != nil { - return err - } else if fs.NArg() == 0 || fs.Arg(0) == "" { - c.Usage() - return flag.ErrHelp - } else if fs.NArg() > 1 { - return fmt.Errorf("too many arguments") - } - - return flyscrape.Dev(fs.Arg(0)) -} - -func (c *DevCommand) Usage() { - fmt.Println(` -The dev command watches the scraping script and re-runs it on any change. -Recursive scraping is disabled in this mode, only the initial URL will be scraped. - -Usage: - - flyscrape dev SCRIPT - -Examples: - - # Run and watch script. - $ flyscrape dev example.js -`[1:]) -} diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go index 5ea140a..8b9d430 100644 --- a/cmd/flyscrape/main.go +++ b/cmd/flyscrape/main.go @@ -7,11 +7,10 @@ package main import ( _ "embed" "flag" - "fmt" "log" "os" - "strings" + "github.com/philippta/flyscrape/cmd" _ "github.com/philippta/flyscrape/modules/cache" _ "github.com/philippta/flyscrape/modules/depth" _ "github.com/philippta/flyscrape/modules/domainfilter" @@ -26,51 +25,10 @@ import ( func main() { log.SetFlags(0) - m := &Main{} - if err := m.Run(os.Args[1:]); err == flag.ErrHelp { - os.Exit(1) - } else if err != nil { - log.Println(err) - os.Exit(1) - } -} - -type Main struct{} - -func (m *Main) Run(args []string) error { - var cmd string - if len(args) > 0 { - cmd, args = args[0], args[1:] - } - - switch cmd { - case "new": - return (&NewCommand{}).Run(args) - case "run": - return (&RunCommand{}).Run(args) - case "dev": - return (&DevCommand{}).Run(args) - default: - if cmd == "" || cmd == "help" || strings.HasPrefix(cmd, "-") { - m.Usage() - return flag.ErrHelp + if err := (&cmd.Main{}).Run(os.Args[1:]); err != nil { + if err != flag.ErrHelp { + log.Println(err) } - return 
fmt.Errorf("flyscrape %s: unknown command", cmd) + os.Exit(1) } } - -func (m *Main) Usage() { - fmt.Println(` -flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites. - -Usage: - - flyscrape [arguments] - -Commands: - - new creates a sample scraping script - run runs a scraping script - dev watches and re-runs a scraping script -`[1:]) -} diff --git a/cmd/flyscrape/new.go b/cmd/flyscrape/new.go deleted file mode 100644 index 4ab248e..0000000 --- a/cmd/flyscrape/new.go +++ /dev/null @@ -1,56 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at http://mozilla.org/MPL/2.0/. - -package main - -import ( - "flag" - "fmt" - "os" - - "github.com/philippta/flyscrape" -) - -type NewCommand struct{} - -func (c *NewCommand) Run(args []string) error { - fs := flag.NewFlagSet("flyscrape-new", flag.ContinueOnError) - fs.Usage = c.Usage - - if err := fs.Parse(args); err != nil { - return err - } else if fs.NArg() == 0 || fs.Arg(0) == "" { - c.Usage() - return flag.ErrHelp - } else if fs.NArg() > 1 { - return fmt.Errorf("too many arguments") - } - - script := fs.Arg(0) - if _, err := os.Stat(script); err == nil { - return fmt.Errorf("script already exists") - } - - if err := os.WriteFile(script, flyscrape.ScriptTemplate, 0o644); err != nil { - return fmt.Errorf("failed to create script %q: %w", script, err) - } - - fmt.Printf("Scraping script %v created.\n", script) - return nil -} - -func (c *NewCommand) Usage() { - fmt.Println(` -The new command creates a new scraping script. - -Usage: - - flyscrape new SCRIPT - -Examples: - - # Create a new scraping script. 
- $ flyscrape new example.js -`[1:]) -} diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go deleted file mode 100644 index 7a8930a..0000000 --- a/cmd/flyscrape/run.go +++ /dev/null @@ -1,45 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at http://mozilla.org/MPL/2.0/. - -package main - -import ( - "flag" - "fmt" - - "github.com/philippta/flyscrape" -) - -type RunCommand struct{} - -func (c *RunCommand) Run(args []string) error { - fs := flag.NewFlagSet("flyscrape-run", flag.ContinueOnError) - fs.Usage = c.Usage - - if err := fs.Parse(args); err != nil { - return err - } else if fs.NArg() == 0 || fs.Arg(0) == "" { - c.Usage() - return flag.ErrHelp - } else if fs.NArg() > 1 { - return fmt.Errorf("too many arguments") - } - - return flyscrape.Run(fs.Arg(0)) -} - -func (c *RunCommand) Usage() { - fmt.Println(` -The run command runs the scraping script. - -Usage: - - flyscrape run SCRIPT - -Examples: - - # Run the script. - $ flyscrape run example.js -`[1:]) -} diff --git a/cmd/main.go b/cmd/main.go new file mode 100644 index 0000000..7c49dbf --- /dev/null +++ b/cmd/main.go @@ -0,0 +1,76 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +package cmd + +import ( + _ "embed" + "flag" + "fmt" + "log" + "os" + "strings" + + _ "github.com/philippta/flyscrape/modules/cache" + _ "github.com/philippta/flyscrape/modules/depth" + _ "github.com/philippta/flyscrape/modules/domainfilter" + _ "github.com/philippta/flyscrape/modules/followlinks" + _ "github.com/philippta/flyscrape/modules/jsonprint" + _ "github.com/philippta/flyscrape/modules/proxy" + _ "github.com/philippta/flyscrape/modules/ratelimit" + _ "github.com/philippta/flyscrape/modules/starturl" + _ "github.com/philippta/flyscrape/modules/urlfilter" +) + +func main() { + log.SetFlags(0) + + m := &Main{} + if err := m.Run(os.Args[1:]); err == flag.ErrHelp { + os.Exit(1) + } else if err != nil { + log.Println(err) + os.Exit(1) + } +} + +type Main struct{} + +func (m *Main) Run(args []string) error { + var cmd string + if len(args) > 0 { + cmd, args = args[0], args[1:] + } + + switch cmd { + case "new": + return (&NewCommand{}).Run(args) + case "run": + return (&RunCommand{}).Run(args) + case "dev": + return (&DevCommand{}).Run(args) + default: + if cmd == "" || cmd == "help" || strings.HasPrefix(cmd, "-") { + m.Usage() + return flag.ErrHelp + } + return fmt.Errorf("flyscrape %s: unknown command", cmd) + } +} + +func (m *Main) Usage() { + fmt.Println(` +flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites. + +Usage: + + flyscrape [arguments] + +Commands: + + new creates a sample scraping script + run runs a scraping script + dev watches and re-runs a scraping script +`[1:]) +} diff --git a/cmd/new.go b/cmd/new.go new file mode 100644 index 0000000..1da962d --- /dev/null +++ b/cmd/new.go @@ -0,0 +1,56 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +package cmd + +import ( + "flag" + "fmt" + "os" + + "github.com/philippta/flyscrape" +) + +type NewCommand struct{} + +func (c *NewCommand) Run(args []string) error { + fs := flag.NewFlagSet("flyscrape-new", flag.ContinueOnError) + fs.Usage = c.Usage + + if err := fs.Parse(args); err != nil { + return err + } else if fs.NArg() == 0 || fs.Arg(0) == "" { + c.Usage() + return flag.ErrHelp + } else if fs.NArg() > 1 { + return fmt.Errorf("too many arguments") + } + + script := fs.Arg(0) + if _, err := os.Stat(script); err == nil { + return fmt.Errorf("script already exists") + } + + if err := os.WriteFile(script, flyscrape.ScriptTemplate, 0o644); err != nil { + return fmt.Errorf("failed to create script %q: %w", script, err) + } + + fmt.Printf("Scraping script %v created.\n", script) + return nil +} + +func (c *NewCommand) Usage() { + fmt.Println(` +The new command creates a new scraping script. + +Usage: + + flyscrape new SCRIPT + +Examples: + + # Create a new scraping script. + $ flyscrape new example.js +`[1:]) +} diff --git a/cmd/run.go b/cmd/run.go new file mode 100644 index 0000000..8c1a39d --- /dev/null +++ b/cmd/run.go @@ -0,0 +1,57 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +package cmd + +import ( + "flag" + "fmt" + + "github.com/philippta/flyscrape" +) + +type RunCommand struct{} + +func (c *RunCommand) Run(args []string) error { + fs := flag.NewFlagSet("flyscrape-run", flag.ContinueOnError) + fs.Usage = c.Usage + + if err := fs.Parse(args); err != nil { + return err + } else if fs.NArg() == 0 || fs.Arg(0) == "" { + c.Usage() + return flag.ErrHelp + } + + cfg, err := parseConfigArgs(fs.Args()[1:]) + if err != nil { + return fmt.Errorf("Error parsing config flags: %w", err) + } + + return flyscrape.Run(fs.Arg(0), cfg) +} + +func (c *RunCommand) Usage() { + fmt.Println(` +The run command runs the scraping script. + +Usage: + + flyscrape run SCRIPT [config flags] + +Examples: + + # Run the script. + $ flyscrape run example.js + + # Set the URL as argument. + $ flyscrape run example.js --url "http://other.com" + + # Enable proxy support. + $ flyscrape run example.js --proxies "http://someproxy:8043" + + # Follow paginated links. + $ flyscrape run example.js --depth 5 --follow ".next-button > a" +`[1:]) +} -- cgit v1.2.3