summary | refs | log | tree | commit | diff
path: root/cmd
diff options
context:
space:
mode:
author Philipp Tanlak <philipp.tanlak@gmail.com> 2023-11-18 22:49:26 +0100
committer GitHub <noreply@github.com> 2023-11-18 22:49:26 +0100
commit 6aa52bdbe2cefdbc9219abfb4399afa0d492913d (patch)
tree 02c743b7d9393dbf024e14adada73c6594bdd34a /cmd
parent 94da9293f63e46712b0a890e1e0eab4153fdb3f9 (diff)
Support passing config options as CLI arguments (#15)
Diffstat (limited to 'cmd')
-rw-r--r-- cmd/args.go | 97
-rw-r--r-- cmd/args_test.go | 75
-rw-r--r-- cmd/dev.go (renamed from cmd/flyscrape/dev.go) | 19
-rw-r--r-- cmd/flyscrape/main.go | 52
-rw-r--r-- cmd/main.go | 76
-rw-r--r-- cmd/new.go (renamed from cmd/flyscrape/new.go) | 2
-rw-r--r-- cmd/run.go (renamed from cmd/flyscrape/run.go) | 22
7 files changed, 285 insertions, 58 deletions
diff --git a/cmd/args.go b/cmd/args.go
new file mode 100644
index 0000000..e11308b
--- /dev/null
+++ b/cmd/args.go
@@ -0,0 +1,97 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package cmd
+
+import (
+ "fmt"
+ "slices"
+ "strconv"
+ "strings"
+)
+
+// arrayFields lists the config keys whose value parseConfigArgs stores as a
+// []any even when the corresponding flag is passed only once.
+var arrayFields = []string{
+ "urls",
+ "follow",
+ "allowedDomains",
+ "blockedDomains",
+ "allowedURLs",
+ "blockedURLs",
+ "proxies",
+}
+
+// parseConfigArgs converts command-line arguments of the form "--key value",
+// "--key=value" or "--key" into a map of config updates keyed by the flag
+// name without its leading "--".
+//
+// Rules:
+//   - A flag followed by a value stores that value as converted by parseArg.
+//   - A flag without a value (followed by another flag, or last in args)
+//     stores true.
+//   - A flag that receives several values collects them, in order, in a []any.
+//   - Keys listed in arrayFields are stored as a []any even for a single value.
+//
+// It returns a nil map and an error when a value appears where a flag is
+// expected, e.g. a leading bare word or two consecutive values.
+func parseConfigArgs(args []string) (map[string]any, error) {
+	updates := map[string]any{}
+
+	// flag is the most recent flag (including its "--" prefix) that has not
+	// received a value yet; it is empty when no flag is pending.
+	flag := ""
+	for _, arg := range normalizeArgs(args) {
+		if isFlag(arg) {
+			if flag != "" {
+				// The pending flag got no value, so it is a boolean switch.
+				updates[flag[2:]] = true
+			}
+			// Keep the new flag pending. Resetting to "" here would silently
+			// drop it, so "--a --b" would lose "b".
+			flag = arg
+			continue
+		}
+
+		if flag == "" {
+			return nil, fmt.Errorf("expected flag, got %q instead", arg)
+		}
+
+		key := flag[2:]
+		val := parseArg(arg)
+
+		prev, seen := updates[key]
+		list, isList := prev.([]any)
+		switch {
+		case isList:
+			updates[key] = append(list, val)
+		case seen:
+			// Second value for a scalar key: promote it to a list.
+			updates[key] = []any{prev, val}
+		case slices.Contains(arrayFields, key):
+			updates[key] = []any{val}
+		default:
+			updates[key] = val
+		}
+		flag = ""
+	}
+
+	// A trailing flag without a value is a boolean switch as well.
+	if flag != "" {
+		updates[flag[2:]] = true
+	}
+
+	return updates, nil
+}
+
+// normalizeArgs rewrites every "--key=value" argument into the two separate
+// arguments "--key" and "value". Only the first "=" splits, and arguments
+// that do not start with "--" are passed through untouched. The result is nil
+// when args is empty.
+func normalizeArgs(args []string) []string {
+	var out []string
+
+	for _, a := range args {
+		if strings.HasPrefix(a, "--") {
+			if name, value, found := strings.Cut(a, "="); found {
+				out = append(out, name, value)
+				continue
+			}
+		}
+		out = append(out, a)
+	}
+
+	return out
+}
+
+// parseArg converts a raw argument value into a typed value: the literals
+// "true" and "false" become booleans, anything strconv.Atoi accepts becomes an
+// int, and every other input is returned unchanged as a string.
+func parseArg(arg string) any {
+	switch arg {
+	case "true":
+		return true
+	case "false":
+		return false
+	}
+
+	n, err := strconv.Atoi(arg)
+	if err != nil {
+		return arg
+	}
+	return n
+}
+
+// isFlag reports whether arg begins with the "--" flag prefix.
+func isFlag(arg string) bool {
+	_, ok := strings.CutPrefix(arg, "--")
+	return ok
+}
diff --git a/cmd/args_test.go b/cmd/args_test.go
new file mode 100644
index 0000000..3153fd8
--- /dev/null
+++ b/cmd/args_test.go
@@ -0,0 +1,75 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package cmd
+
+import (
+ "strings"
+ "testing"
+
+ "github.com/stretchr/testify/require"
+)
+
+// TestParseConfigUpdates runs parseConfigArgs over whitespace-separated flag
+// strings and checks either the resulting update map or that parsing fails
+// with an empty result.
+func TestParseConfigUpdates(t *testing.T) {
+	type testCase struct {
+		flags   string
+		err     bool
+		updates map[string]any
+	}
+
+	cases := []testCase{
+		{flags: `--foo bar`, updates: map[string]any{"foo": "bar"}},
+		{flags: `--foo=bar`, updates: map[string]any{"foo": "bar"}},
+		{flags: `--foo`, updates: map[string]any{"foo": true}},
+		{flags: `--foo false`, updates: map[string]any{"foo": false}},
+		{flags: `--foo a --foo b`, updates: map[string]any{"foo": []any{"a", "b"}}},
+		{flags: `--foo a --foo=b`, updates: map[string]any{"foo": []any{"a", "b"}}},
+		{flags: `--foo 69`, updates: map[string]any{"foo": 69}},
+		{flags: `--foo.bar a`, updates: map[string]any{"foo.bar": "a"}},
+		{flags: `foo`, err: true},
+		{flags: `--foo a b`, err: true},
+	}
+
+	for _, tc := range cases {
+		// The flag string doubles as the subtest name.
+		t.Run(tc.flags, func(t *testing.T) {
+			got, err := parseConfigArgs(strings.Fields(tc.flags))
+
+			if !tc.err {
+				require.NoError(t, err)
+				require.Equal(t, tc.updates, got)
+				return
+			}
+
+			require.Error(t, err)
+			require.Empty(t, got)
+		})
+	}
+}
diff --git a/cmd/flyscrape/dev.go b/cmd/dev.go
index 84a436b..e7edbf8 100644
--- a/cmd/flyscrape/dev.go
+++ b/cmd/dev.go
@@ -2,7 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
-package main
+package cmd
import (
"flag"
@@ -22,11 +22,14 @@ func (c *DevCommand) Run(args []string) error {
} else if fs.NArg() == 0 || fs.Arg(0) == "" {
c.Usage()
return flag.ErrHelp
- } else if fs.NArg() > 1 {
- return fmt.Errorf("too many arguments")
}
- return flyscrape.Dev(fs.Arg(0))
+ cfg, err := parseConfigArgs(fs.Args()[1:])
+ if err != nil {
+ return fmt.Errorf("Error parsing config flags: %w", err)
+ }
+
+ return flyscrape.Dev(fs.Arg(0), cfg)
}
func (c *DevCommand) Usage() {
@@ -36,11 +39,17 @@ Recursive scraping is disabled in this mode, only the initial URL will be scrape
Usage:
- flyscrape dev SCRIPT
+ flyscrape dev SCRIPT [config flags]
Examples:
# Run and watch script.
$ flyscrape dev example.js
+
+ # Set the URL as argument.
+ $ flyscrape dev example.js --url "http://other.com"
+
+ # Enable proxy support.
+ $ flyscrape dev example.js --proxies "http://someproxy:8043"
`[1:])
}
diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go
index 5ea140a..8b9d430 100644
--- a/cmd/flyscrape/main.go
+++ b/cmd/flyscrape/main.go
@@ -7,11 +7,10 @@ package main
import (
_ "embed"
"flag"
- "fmt"
"log"
"os"
- "strings"
+ "github.com/philippta/flyscrape/cmd"
_ "github.com/philippta/flyscrape/modules/cache"
_ "github.com/philippta/flyscrape/modules/depth"
_ "github.com/philippta/flyscrape/modules/domainfilter"
@@ -26,51 +25,10 @@ import (
func main() {
log.SetFlags(0)
- m := &Main{}
- if err := m.Run(os.Args[1:]); err == flag.ErrHelp {
- os.Exit(1)
- } else if err != nil {
- log.Println(err)
- os.Exit(1)
- }
-}
-
-type Main struct{}
-
-func (m *Main) Run(args []string) error {
- var cmd string
- if len(args) > 0 {
- cmd, args = args[0], args[1:]
- }
-
- switch cmd {
- case "new":
- return (&NewCommand{}).Run(args)
- case "run":
- return (&RunCommand{}).Run(args)
- case "dev":
- return (&DevCommand{}).Run(args)
- default:
- if cmd == "" || cmd == "help" || strings.HasPrefix(cmd, "-") {
- m.Usage()
- return flag.ErrHelp
+ if err := (&cmd.Main{}).Run(os.Args[1:]); err != nil {
+ if err != flag.ErrHelp {
+ log.Println(err)
}
- return fmt.Errorf("flyscrape %s: unknown command", cmd)
+ os.Exit(1)
}
}
-
-func (m *Main) Usage() {
- fmt.Println(`
-flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites.
-
-Usage:
-
- flyscrape <command> [arguments]
-
-Commands:
-
- new creates a sample scraping script
- run runs a scraping script
- dev watches and re-runs a scraping script
-`[1:])
-}
diff --git a/cmd/main.go b/cmd/main.go
new file mode 100644
index 0000000..7c49dbf
--- /dev/null
+++ b/cmd/main.go
@@ -0,0 +1,76 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package cmd
+
+import (
+ _ "embed"
+ "flag"
+ "fmt"
+ "log"
+ "os"
+ "strings"
+
+ _ "github.com/philippta/flyscrape/modules/cache"
+ _ "github.com/philippta/flyscrape/modules/depth"
+ _ "github.com/philippta/flyscrape/modules/domainfilter"
+ _ "github.com/philippta/flyscrape/modules/followlinks"
+ _ "github.com/philippta/flyscrape/modules/jsonprint"
+ _ "github.com/philippta/flyscrape/modules/proxy"
+ _ "github.com/philippta/flyscrape/modules/ratelimit"
+ _ "github.com/philippta/flyscrape/modules/starturl"
+ _ "github.com/philippta/flyscrape/modules/urlfilter"
+)
+
+// main disables log prefixes, runs Main with the process arguments and exits
+// with status 1 when Run returns an error. flag.ErrHelp is not logged.
+//
+// NOTE(review): this file declares package cmd, so the Go toolchain does not
+// use this function as the program entry point; confirm it is still needed.
+func main() {
+	log.SetFlags(0)
+
+	err := (&Main{}).Run(os.Args[1:])
+	if err == nil {
+		return
+	}
+	if err != flag.ErrHelp {
+		log.Println(err)
+	}
+	os.Exit(1)
+}
+
+// Main is the top-level command; it dispatches to the new, run and dev
+// subcommands.
+type Main struct{}
+
+// Run treats the first element of args as the subcommand name and hands the
+// remaining elements to that subcommand. With no name, the name "help", or a
+// name starting with "-", it prints the usage text and returns flag.ErrHelp.
+// Any other name yields an "unknown command" error.
+func (m *Main) Run(args []string) error {
+	name := ""
+	if len(args) > 0 {
+		name = args[0]
+		args = args[1:]
+	}
+
+	switch {
+	case name == "new":
+		return (&NewCommand{}).Run(args)
+	case name == "run":
+		return (&RunCommand{}).Run(args)
+	case name == "dev":
+		return (&DevCommand{}).Run(args)
+	case name == "", name == "help", strings.HasPrefix(name, "-"):
+		m.Usage()
+		return flag.ErrHelp
+	}
+
+	return fmt.Errorf("flyscrape %s: unknown command", name)
+}
+
+// Usage prints the top-level help text, listing the available commands, to
+// standard output. The [1:] slice drops the newline that opens the raw string.
+func (m *Main) Usage() {
+ fmt.Println(`
+flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites.
+
+Usage:
+
+ flyscrape <command> [arguments]
+
+Commands:
+
+ new creates a sample scraping script
+ run runs a scraping script
+ dev watches and re-runs a scraping script
+`[1:])
+}
diff --git a/cmd/flyscrape/new.go b/cmd/new.go
index 4ab248e..1da962d 100644
--- a/cmd/flyscrape/new.go
+++ b/cmd/new.go
@@ -2,7 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
-package main
+package cmd
import (
"flag"
diff --git a/cmd/flyscrape/run.go b/cmd/run.go
index 7a8930a..8c1a39d 100644
--- a/cmd/flyscrape/run.go
+++ b/cmd/run.go
@@ -2,7 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
-package main
+package cmd
import (
"flag"
@@ -22,11 +22,14 @@ func (c *RunCommand) Run(args []string) error {
} else if fs.NArg() == 0 || fs.Arg(0) == "" {
c.Usage()
return flag.ErrHelp
- } else if fs.NArg() > 1 {
- return fmt.Errorf("too many arguments")
}
- return flyscrape.Run(fs.Arg(0))
+ cfg, err := parseConfigArgs(fs.Args()[1:])
+ if err != nil {
+ return fmt.Errorf("Error parsing config flags: %w", err)
+ }
+
+ return flyscrape.Run(fs.Arg(0), cfg)
}
func (c *RunCommand) Usage() {
@@ -35,11 +38,20 @@ The run command runs the scraping script.
Usage:
- flyscrape run SCRIPT
+ flyscrape run SCRIPT [config flags]
Examples:
# Run the script.
$ flyscrape run example.js
+
+ # Set the URL as argument.
+ $ flyscrape run example.js --url "http://other.com"
+
+ # Enable proxy support.
+ $ flyscrape run example.js --proxies "http://someproxy:8043"
+
+ # Follow paginated links.
+ $ flyscrape run example.js --depth 5 --follow ".next-button > a"
`[1:])
}