summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPhilipp Tanlak <philipp.tanlak@gmail.com>2023-11-18 22:49:26 +0100
committerGitHub <noreply@github.com>2023-11-18 22:49:26 +0100
commit6aa52bdbe2cefdbc9219abfb4399afa0d492913d (patch)
tree02c743b7d9393dbf024e14adada73c6594bdd34a
parent94da9293f63e46712b0a890e1e0eab4153fdb3f9 (diff)
Support passing config options as CLI arguments (#15)
-rw-r--r--README.md30
-rw-r--r--cmd/args.go97
-rw-r--r--cmd/args_test.go75
-rw-r--r--cmd/dev.go (renamed from cmd/flyscrape/dev.go)19
-rw-r--r--cmd/flyscrape/main.go52
-rw-r--r--cmd/main.go76
-rw-r--r--cmd/new.go (renamed from cmd/flyscrape/new.go)2
-rw-r--r--cmd/run.go (renamed from cmd/flyscrape/run.go)22
-rw-r--r--flyscrape.go38
-rw-r--r--go.mod6
-rw-r--r--go.sum10
-rw-r--r--watch.go4
12 files changed, 349 insertions, 82 deletions
diff --git a/README.md b/README.md
index d7c701c..2f84843 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,16 @@
```javascript
export const config = {
url: "https://news.ycombinator.com/",
+ // urls: [], // Specify additional URLs to start from. (default = none)
+ // depth: 0, // Specify how deep links should be followed. (default = 0, no follow)
+ // follow: [], // Specify the CSS selectors to follow. (default = ["a[href]"])
+ // allowedDomains: [], // Specify the allowed domains. ['*'] for all. (default = domain from url)
+ // blockedDomains: [], // Specify the blocked domains. (default = none)
+ // allowedURLs: [], // Specify the allowed URLs as regex. (default = all allowed)
+ // blockedURLs: [], // Specify the blocked URLs as regex. (default = none)
+ // rate: 100, // Specify the rate in requests per second. (default = no rate limit)
+ // proxies: [], // Specify the HTTP(S) proxy URLs. (default = no proxy)
+ // cache: "file", // Enable file-based request caching. (default = no cache)
}
export default function ({ doc, absoluteURL }) {
@@ -99,17 +109,23 @@ To compile flyscrape from source, follow these steps:
## Usage
```
-flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites.
-
Usage:
- flyscrape <command> [arguments]
+ flyscrape run SCRIPT [config flags]
+
+Examples:
+
+ # Run the script.
+ $ flyscrape run example.js
+
+ # Set the URL as argument.
+ $ flyscrape run example.js --url "http://other.com"
-Commands:
+ # Enable proxy support.
+ $ flyscrape run example.js --proxies "http://someproxy:8043"
- new creates a sample scraping script
- run runs a scraping script
- dev watches and re-runs a scraping script
+ # Follow paginated links.
+ $ flyscrape run example.js --depth 5 --follow ".next-button > a"
```
## Configuration
diff --git a/cmd/args.go b/cmd/args.go
new file mode 100644
index 0000000..e11308b
--- /dev/null
+++ b/cmd/args.go
@@ -0,0 +1,97 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package cmd
+
+import (
+ "fmt"
+ "slices"
+ "strconv"
+ "strings"
+)
+
// arrayFields lists the config keys whose values are always collected into a
// list, even when the corresponding flag is passed only once.
var arrayFields = []string{
	"urls",
	"follow",
	"allowedDomains",
	"blockedDomains",
	"allowedURLs",
	"blockedURLs",
	"proxies",
}

// parseConfigArgs converts command line arguments of the forms
// "--key value", "--key=value" and "--key" into a map of config updates.
//
// Rules:
//   - A flag without a value (followed by another flag, or last on the
//     command line) is stored as the boolean true.
//   - Values are converted by parseArg (bool, int, otherwise string).
//   - Repeating a flag collects its values into a []any in order of
//     appearance; keys listed in arrayFields are stored as a []any even when
//     given once.
//
// An error is returned when a value appears without a preceding flag or when
// a flag has an empty name ("--"). On error the returned map is nil.
//
// Known limitation: a value that itself starts with "--" is always read as a
// flag, even in the "--key=--value" form, because normalizeArgs splits that
// form into two separate arguments before parsing.
func parseConfigArgs(args []string) (map[string]any, error) {
	updates := map[string]any{}

	pending := "" // name of the flag that is still waiting for its value
	for _, arg := range normalizeArgs(args) {
		if isFlag(arg) {
			// The previous flag never received a value: it is a boolean switch.
			if pending != "" {
				updates[pending] = true
			}

			// Remember the new flag so the next argument becomes its value.
			// Resetting to "" here would silently drop this flag.
			pending = strings.TrimPrefix(arg, "--")
			if pending == "" {
				return nil, fmt.Errorf("expected flag name after %q", arg)
			}
			continue
		}

		if pending == "" {
			return nil, fmt.Errorf("expected flag, got %q instead", arg)
		}

		value := parseArg(arg)
		prev, seen := updates[pending]
		switch {
		case seen:
			// Repeated flag: grow the existing list or promote the scalar.
			if list, ok := prev.([]any); ok {
				updates[pending] = append(list, value)
			} else {
				updates[pending] = []any{prev, value}
			}
		case slices.Contains(arrayFields, pending):
			updates[pending] = []any{value}
		default:
			updates[pending] = value
		}
		pending = ""
	}

	// A trailing flag without a value is a boolean switch as well.
	if pending != "" {
		updates[pending] = true
	}

	return updates, nil
}

// normalizeArgs splits every "--key=value" argument into the two arguments
// "--key" and "value". Arguments that do not start with "--" are passed
// through unchanged, as are flags without an "=".
func normalizeArgs(args []string) []string {
	var norm []string

	for _, arg := range args {
		if !isFlag(arg) {
			norm = append(norm, arg)
			continue
		}

		name, value, hasValue := strings.Cut(arg, "=")
		norm = append(norm, name)
		if hasValue {
			norm = append(norm, value)
		}
	}

	return norm
}

// parseArg converts a raw argument into a typed value: "true" and "false"
// become booleans, anything strconv.Atoi accepts becomes an int, and every
// other input is returned unchanged as a string.
func parseArg(arg string) any {
	switch arg {
	case "true":
		return true
	case "false":
		return false
	}
	if num, err := strconv.Atoi(arg); err == nil {
		return num
	}
	return arg
}

// isFlag reports whether arg starts with "--".
func isFlag(arg string) bool {
	return strings.HasPrefix(arg, "--")
}
diff --git a/cmd/args_test.go b/cmd/args_test.go
new file mode 100644
index 0000000..3153fd8
--- /dev/null
+++ b/cmd/args_test.go
@@ -0,0 +1,75 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package cmd
+
+import (
+ "strings"
+ "testing"
+
+ "github.com/stretchr/testify/require"
+)
+
+func TestParseConfigUpdates(t *testing.T) {
+ tests := []struct {
+ flags string
+ err bool
+ updates map[string]any
+ }{
+ {
+ flags: `--foo bar`,
+ updates: map[string]any{"foo": "bar"},
+ },
+ {
+ flags: `--foo=bar`,
+ updates: map[string]any{"foo": "bar"},
+ },
+ {
+ flags: `--foo`,
+ updates: map[string]any{"foo": true},
+ },
+ {
+ flags: `--foo false`,
+ updates: map[string]any{"foo": false},
+ },
+ {
+ flags: `--foo a --foo b`,
+ updates: map[string]any{"foo": []any{"a", "b"}},
+ },
+ {
+ flags: `--foo a --foo=b`,
+ updates: map[string]any{"foo": []any{"a", "b"}},
+ },
+ {
+ flags: `--foo 69`,
+ updates: map[string]any{"foo": 69},
+ },
+ {
+ flags: `--foo.bar a`,
+ updates: map[string]any{"foo.bar": "a"},
+ },
+ {
+ flags: `foo`,
+ err: true,
+ },
+ {
+ flags: `--foo a b`,
+ err: true,
+ },
+ }
+ for _, test := range tests {
+ t.Run(test.flags, func(t *testing.T) {
+ args, err := parseConfigArgs(strings.Fields(test.flags))
+
+ if test.err {
+ require.Error(t, err)
+ require.Empty(t, args)
+ return
+ }
+
+ require.NoError(t, err)
+ require.Equal(t, test.updates, args)
+ })
+ }
+}
diff --git a/cmd/flyscrape/dev.go b/cmd/dev.go
index 84a436b..e7edbf8 100644
--- a/cmd/flyscrape/dev.go
+++ b/cmd/dev.go
@@ -2,7 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
-package main
+package cmd
import (
"flag"
@@ -22,11 +22,14 @@ func (c *DevCommand) Run(args []string) error {
} else if fs.NArg() == 0 || fs.Arg(0) == "" {
c.Usage()
return flag.ErrHelp
- } else if fs.NArg() > 1 {
- return fmt.Errorf("too many arguments")
}
- return flyscrape.Dev(fs.Arg(0))
+ cfg, err := parseConfigArgs(fs.Args()[1:])
+ if err != nil {
+ return fmt.Errorf("Error parsing config flags: %w", err)
+ }
+
+ return flyscrape.Dev(fs.Arg(0), cfg)
}
func (c *DevCommand) Usage() {
@@ -36,11 +39,17 @@ Recursive scraping is disabled in this mode, only the initial URL will be scrape
Usage:
- flyscrape dev SCRIPT
+ flyscrape dev SCRIPT [config flags]
Examples:
# Run and watch script.
$ flyscrape dev example.js
+
+ # Set the URL as argument.
+ $ flyscrape dev example.js --url "http://other.com"
+
+ # Enable proxy support.
+ $ flyscrape dev example.js --proxies "http://someproxy:8043"
`[1:])
}
diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go
index 5ea140a..8b9d430 100644
--- a/cmd/flyscrape/main.go
+++ b/cmd/flyscrape/main.go
@@ -7,11 +7,10 @@ package main
import (
_ "embed"
"flag"
- "fmt"
"log"
"os"
- "strings"
+ "github.com/philippta/flyscrape/cmd"
_ "github.com/philippta/flyscrape/modules/cache"
_ "github.com/philippta/flyscrape/modules/depth"
_ "github.com/philippta/flyscrape/modules/domainfilter"
@@ -26,51 +25,10 @@ import (
func main() {
log.SetFlags(0)
- m := &Main{}
- if err := m.Run(os.Args[1:]); err == flag.ErrHelp {
- os.Exit(1)
- } else if err != nil {
- log.Println(err)
- os.Exit(1)
- }
-}
-
-type Main struct{}
-
-func (m *Main) Run(args []string) error {
- var cmd string
- if len(args) > 0 {
- cmd, args = args[0], args[1:]
- }
-
- switch cmd {
- case "new":
- return (&NewCommand{}).Run(args)
- case "run":
- return (&RunCommand{}).Run(args)
- case "dev":
- return (&DevCommand{}).Run(args)
- default:
- if cmd == "" || cmd == "help" || strings.HasPrefix(cmd, "-") {
- m.Usage()
- return flag.ErrHelp
+ if err := (&cmd.Main{}).Run(os.Args[1:]); err != nil {
+ if err != flag.ErrHelp {
+ log.Println(err)
}
- return fmt.Errorf("flyscrape %s: unknown command", cmd)
+ os.Exit(1)
}
}
-
-func (m *Main) Usage() {
- fmt.Println(`
-flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites.
-
-Usage:
-
- flyscrape <command> [arguments]
-
-Commands:
-
- new creates a sample scraping script
- run runs a scraping script
- dev watches and re-runs a scraping script
-`[1:])
-}
diff --git a/cmd/main.go b/cmd/main.go
new file mode 100644
index 0000000..7c49dbf
--- /dev/null
+++ b/cmd/main.go
@@ -0,0 +1,76 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package cmd
+
+import (
+ _ "embed"
+ "flag"
+ "fmt"
+ "log"
+ "os"
+ "strings"
+
+ _ "github.com/philippta/flyscrape/modules/cache"
+ _ "github.com/philippta/flyscrape/modules/depth"
+ _ "github.com/philippta/flyscrape/modules/domainfilter"
+ _ "github.com/philippta/flyscrape/modules/followlinks"
+ _ "github.com/philippta/flyscrape/modules/jsonprint"
+ _ "github.com/philippta/flyscrape/modules/proxy"
+ _ "github.com/philippta/flyscrape/modules/ratelimit"
+ _ "github.com/philippta/flyscrape/modules/starturl"
+ _ "github.com/philippta/flyscrape/modules/urlfilter"
+)
+
+func main() {
+ log.SetFlags(0)
+
+ m := &Main{}
+ if err := m.Run(os.Args[1:]); err == flag.ErrHelp {
+ os.Exit(1)
+ } else if err != nil {
+ log.Println(err)
+ os.Exit(1)
+ }
+}
+
+type Main struct{}
+
+func (m *Main) Run(args []string) error {
+ var cmd string
+ if len(args) > 0 {
+ cmd, args = args[0], args[1:]
+ }
+
+ switch cmd {
+ case "new":
+ return (&NewCommand{}).Run(args)
+ case "run":
+ return (&RunCommand{}).Run(args)
+ case "dev":
+ return (&DevCommand{}).Run(args)
+ default:
+ if cmd == "" || cmd == "help" || strings.HasPrefix(cmd, "-") {
+ m.Usage()
+ return flag.ErrHelp
+ }
+ return fmt.Errorf("flyscrape %s: unknown command", cmd)
+ }
+}
+
// Usage prints the top-level help text, listing the available sub-commands,
// to standard output. The [1:] slice drops the newline that directly follows
// the opening backtick of the raw string literal.
+func (m *Main) Usage() {
+ fmt.Println(`
+flyscrape is a standalone and scriptable web scraper for efficiently extracting data from websites.
+
+Usage:
+
+ flyscrape <command> [arguments]
+
+Commands:
+
+ new creates a sample scraping script
+ run runs a scraping script
+ dev watches and re-runs a scraping script
+`[1:])
+}
diff --git a/cmd/flyscrape/new.go b/cmd/new.go
index 4ab248e..1da962d 100644
--- a/cmd/flyscrape/new.go
+++ b/cmd/new.go
@@ -2,7 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
-package main
+package cmd
import (
"flag"
diff --git a/cmd/flyscrape/run.go b/cmd/run.go
index 7a8930a..8c1a39d 100644
--- a/cmd/flyscrape/run.go
+++ b/cmd/run.go
@@ -2,7 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
-package main
+package cmd
import (
"flag"
@@ -22,11 +22,14 @@ func (c *RunCommand) Run(args []string) error {
} else if fs.NArg() == 0 || fs.Arg(0) == "" {
c.Usage()
return flag.ErrHelp
- } else if fs.NArg() > 1 {
- return fmt.Errorf("too many arguments")
}
- return flyscrape.Run(fs.Arg(0))
+ cfg, err := parseConfigArgs(fs.Args()[1:])
+ if err != nil {
+ return fmt.Errorf("Error parsing config flags: %w", err)
+ }
+
+ return flyscrape.Run(fs.Arg(0), cfg)
}
func (c *RunCommand) Usage() {
@@ -35,11 +38,20 @@ The run command runs the scraping script.
Usage:
- flyscrape run SCRIPT
+ flyscrape run SCRIPT [config flags]
Examples:
# Run the script.
$ flyscrape run example.js
+
+ # Set the URL as argument.
+ $ flyscrape run example.js --url "http://other.com"
+
+ # Enable proxy support.
+ $ flyscrape run example.js --proxies "http://someproxy:8043"
+
+ # Follow paginated links.
+ $ flyscrape run example.js --depth 5 --follow ".next-button > a"
`[1:])
}
diff --git a/flyscrape.go b/flyscrape.go
index a96d37a..797d4c7 100644
--- a/flyscrape.go
+++ b/flyscrape.go
@@ -5,7 +5,6 @@
package flyscrape
import (
- "encoding/json"
"fmt"
"log"
"net/http"
@@ -15,9 +14,10 @@ import (
"syscall"
"github.com/inancgumus/screen"
+ "github.com/tidwall/sjson"
)
-func Run(file string) error {
+func Run(file string, overrides map[string]any) error {
src, err := os.ReadFile(file)
if err != nil {
return fmt.Errorf("failed to read script %q: %w", file, err)
@@ -33,18 +33,21 @@ func Run(file string) error {
return fmt.Errorf("failed to compile script: %w", err)
}
+ cfg := exports.Config()
+ cfg = updateCfgMultiple(cfg, overrides)
+
scraper := NewScraper()
scraper.ScrapeFunc = exports.Scrape
scraper.SetupFunc = exports.Setup
scraper.Script = file
scraper.Client = client
- scraper.Modules = LoadModules(exports.Config())
+ scraper.Modules = LoadModules(cfg)
scraper.Run()
return nil
}
-func Dev(file string) error {
+func Dev(file string, overrides map[string]any) error {
cachefile, err := newCacheFile()
if err != nil {
return fmt.Errorf("failed to create cache file: %w", err)
@@ -67,6 +70,7 @@ func Dev(file string) error {
}
cfg := exports.Config()
+ cfg = updateCfgMultiple(cfg, overrides)
cfg = updateCfg(cfg, "depth", 0)
cfg = updateCfg(cfg, "cache", "file:"+cachefile)
@@ -104,19 +108,11 @@ func printCompileErr(script string, err error) {
}
func updateCfg(cfg Config, key string, value any) Config {
- var m map[string]any
- if err := json.Unmarshal(cfg, &m); err != nil {
- return cfg
- }
-
- m[key] = value
-
- b, err := json.Marshal(m)
+ newcfg, err := sjson.Set(string(cfg), key, value)
if err != nil {
return cfg
}
-
- return b
+ return Config(newcfg)
}
func newCacheFile() (string, error) {
@@ -137,3 +133,17 @@ func trapsignal(f func()) {
os.Exit(0)
}()
}
+
+func updateCfgMultiple(cfg Config, updates map[string]any) Config {
+ c := string(cfg)
+
+ for k, v := range updates {
+ nc, err := sjson.Set(c, k, v)
+ if err != nil {
+ continue
+ }
+ c = nc
+ }
+
+ return []byte(c)
+}
diff --git a/go.mod b/go.mod
index 563e26d..c2c7b2b 100644
--- a/go.mod
+++ b/go.mod
@@ -13,6 +13,8 @@ require (
github.com/mattn/go-sqlite3 v1.14.17
github.com/nlnwa/whatwg-url v0.4.0
github.com/stretchr/testify v1.8.4
+ github.com/tidwall/sjson v1.2.5
+ golang.org/x/sync v0.5.0
)
require (
@@ -25,9 +27,11 @@ require (
github.com/kr/pretty v0.3.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rogpeppe/go-internal v1.10.0 // indirect
+ github.com/tidwall/gjson v1.17.0 // indirect
+ github.com/tidwall/match v1.1.1 // indirect
+ github.com/tidwall/pretty v1.2.1 // indirect
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519 // indirect
golang.org/x/net v0.10.0 // indirect
- golang.org/x/sync v0.5.0 // indirect
golang.org/x/sys v0.8.0 // indirect
golang.org/x/term v0.8.0 // indirect
golang.org/x/text v0.9.0 // indirect
diff --git a/go.sum b/go.sum
index 06e3703..bf6ad1e 100644
--- a/go.sum
+++ b/go.sum
@@ -56,6 +56,16 @@ github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjR
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
+github.com/tidwall/gjson v1.17.0 h1:/Jocvlh98kcTfpN2+JzGQWQcqrPQwDrVEMApx/M5ZwM=
+github.com/tidwall/gjson v1.17.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
+github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
+github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
+github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
+github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
+github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
+github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY=
+github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519 h1:7I4JAnoQBe7ZtJcBaYHi5UtiO8tQHbUSXxL+pnGRANg=
diff --git a/watch.go b/watch.go
index fb34bf5..711f7dd 100644
--- a/watch.go
+++ b/watch.go
@@ -58,14 +58,14 @@ func Watch(path string, fn func(string) error) error {
if errors.Is(err, StopWatch) {
return nil
}
- return err
+ return nil
}
case err, ok := <-watcher.Errors:
if !ok {
return nil
}
if err != nil {
- return err
+ return fmt.Errorf("watcher: %w", err)
}
}
}