summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Philipp Tanlak <philipp.tanlak@gmail.com> 2023-08-16 18:25:04 +0200
committer: Philipp Tanlak <philipp.tanlak@gmail.com> 2023-08-16 18:25:04 +0200
commit: 807796ad35b48c58f61f6c058e12ec10078fd0e3 (patch)
tree: 872b1eace112a066099a18c03191e2c7e162e35b
parent: 062b36fe5725d1267c66db2e506b4131d78ce772 (diff)
create cli
-rw-r--r-- cmd/flyscrape/main.go 85
-rw-r--r-- cmd/flyscrape/run.go 78
-rw-r--r-- cmd/flyscrape/watch.go 75
-rw-r--r-- cmd/watch/main.go 83
-rw-r--r-- fetch.go 51
-rw-r--r-- scrape.go 3
-rw-r--r-- utils.go 13
-rw-r--r-- watch.go 2
8 files changed, 262 insertions, 128 deletions
diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go
index 85e40a9..ab57d02 100644
--- a/cmd/flyscrape/main.go
+++ b/cmd/flyscrape/main.go
@@ -2,61 +2,58 @@ package main
import (
_ "embed"
- "encoding/json"
+ "flag"
"fmt"
- "io"
- "net/http"
+ "log"
"os"
- "time"
-
- "flyscrape"
+ "strings"
)
func main() {
- if len(os.Args) != 2 {
- exit("Please provide a file to run.")
- }
-
- src, err := os.ReadFile(os.Args[1])
- if err != nil {
- exit(fmt.Sprintf("Error reading file: %v", err))
+ log.SetFlags(0)
+
+ m := &Main{}
+ if err := m.Run(os.Args[1:]); err == flag.ErrHelp {
+ os.Exit(1)
+ } else if err != nil {
+ log.Println(err)
+ os.Exit(1)
}
+}
- opts, scrape, err := flyscrape.Compile(string(src))
- if err != nil {
- exit(fmt.Sprintf("Error compiling JavaScript file: %v", err))
- }
+type Main struct{}
- svc := flyscrape.Scraper{
- ScrapeOptions: opts,
- ScrapeFunc: scrape,
- Concurrency: 5,
- FetchFunc: func(url string) (string, error) {
- resp, err := http.Get(url)
- if err != nil {
- return "", err
- }
- defer resp.Body.Close()
-
- data, err := io.ReadAll(resp.Body)
- if err != nil {
- return "", err
- }
- return string(data), nil
- },
+func (m *Main) Run(args []string) error {
+ var cmd string
+ if len(args) > 0 {
+ cmd, args = args[0], args[1:]
}
- count := 0
- start := time.Now()
- for result := range svc.Scrape() {
- data, _ := json.MarshalIndent(result, "", " ")
- fmt.Println(string(data))
- count++
+ switch cmd {
+ case "run":
+ return (&RunCommand{}).Run(args)
+ case "watch":
+ return (&WatchCommand{}).Run(args)
+ default:
+ if cmd == "" || cmd == "help" || strings.HasPrefix(cmd, "-") {
+ m.Usage()
+ return flag.ErrHelp
+ }
+ return fmt.Errorf("flyscrape %s: unknown command", cmd)
}
- fmt.Printf("Scraped %d websites in %v\n", count, time.Since(start))
}
-func exit(msg string) {
- fmt.Fprintln(os.Stderr, msg)
- os.Exit(1)
+func (m *Main) Usage() {
+ fmt.Println(`
+flyscrape is an elegant scraping tool for efficiently extracting data from websites.
+
+Usage:
+
+ flyscrape <command> [arguments]
+
+Commands:
+
+ run runs a scraping script
+ watch watches and re-runs a scraping script
+`[1:])
}
diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go
new file mode 100644
index 0000000..ca037d2
--- /dev/null
+++ b/cmd/flyscrape/run.go
@@ -0,0 +1,78 @@
+package main
+
+import (
+ "flag"
+ "fmt"
+ "log"
+ "os"
+ "time"
+
+ "flyscrape"
+)
+
+type RunCommand struct{}
+
+func (c *RunCommand) Run(args []string) error {
+ fs := flag.NewFlagSet("flyscrape-run", flag.ContinueOnError)
+ concurrent := fs.Int("concurrent", 0, "concurrency")
+ fs.Usage = c.Usage
+
+ if err := fs.Parse(args); err != nil {
+ return err
+ } else if fs.NArg() == 0 || fs.Arg(0) == "" {
+ return fmt.Errorf("script path required")
+ } else if fs.NArg() > 1 {
+ return fmt.Errorf("too many arguments")
+ }
+
+ script := fs.Arg(0)
+ src, err := os.ReadFile(script)
+ if err != nil {
+ return fmt.Errorf("failed to read script %q: %v", script, err)
+ }
+
+ opts, scrape, err := flyscrape.Compile(string(src))
+ if err != nil {
+ return fmt.Errorf("failed to compile script: %v", err)
+ }
+
+ svc := flyscrape.Scraper{
+ ScrapeOptions: opts,
+ ScrapeFunc: scrape,
+ Concurrency: *concurrent,
+ }
+
+ count := 0
+ start := time.Now()
+ for result := range svc.Scrape() {
+ flyscrape.PrettyPrint(result)
+ count++
+ }
+ log.Printf("Scraped %d websites in %v\n", count, time.Since(start))
+
+ return nil
+}
+
+func (c *RunCommand) Usage() {
+ fmt.Println(`
+The run command runs the scraping script.
+
+Usage:
+
+ flyscrape run SCRIPT
+
+Arguments:
+
+ -concurrent NUM
+ Determines the number of concurrent requests.
+
+
+Examples:
+
+ # Run the script.
+ $ flyscrape run example.js
+
+ # Run the script with 10 concurrent requests.
+ $ flyscrape run -concurrent 10 example.js
+`[1:])
+}
diff --git a/cmd/flyscrape/watch.go b/cmd/flyscrape/watch.go
new file mode 100644
index 0000000..ca006db
--- /dev/null
+++ b/cmd/flyscrape/watch.go
@@ -0,0 +1,75 @@
+package main
+
+import (
+ "flag"
+ "fmt"
+ "log"
+
+ "flyscrape"
+
+ "github.com/inancgumus/screen"
+)
+
+type WatchCommand struct{}
+
+func (c *WatchCommand) Run(args []string) error {
+ fs := flag.NewFlagSet("flyscrape-watch", flag.ContinueOnError)
+ fs.Usage = c.Usage
+
+ if err := fs.Parse(args); err != nil {
+ return err
+ } else if fs.NArg() == 0 || fs.Arg(0) == "" {
+ return fmt.Errorf("script path required")
+ } else if fs.NArg() > 1 {
+ return fmt.Errorf("too many arguments")
+ }
+
+ fetch := flyscrape.CachedFetch()
+ script := fs.Arg(0)
+
+ flyscrape.Watch(script, func(s string) error {
+ opts, scrape, err := flyscrape.Compile(s)
+ if err != nil {
+ log.Println(err)
+ // ignore compilation errors
+ return nil
+ }
+
+ opts.Depth = 0
+ scr := flyscrape.Scraper{
+ ScrapeOptions: opts,
+ ScrapeFunc: scrape,
+ FetchFunc: fetch,
+ }
+
+ result := <-scr.Scrape()
+ if result.Error != nil {
+ log.Println(result.Error)
+ return nil
+ }
+
+ screen.Clear()
+ screen.MoveTopLeft()
+ flyscrape.PrettyPrint(result)
+ return nil
+ })
+
+ return nil
+}
+
+func (c *WatchCommand) Usage() {
+ fmt.Println(`
+The watch command watches the scraping script and re-runs it on any change.
+Recursive scraping is disabled in this mode, only the initial URL will be scraped.
+
+Usage:
+
+ flyscrape watch SCRIPT
+
+
+Examples:
+
+ # Run and watch script.
+ $ flyscrape watch example.js
+`[1:])
+}
diff --git a/cmd/watch/main.go b/cmd/watch/main.go
deleted file mode 100644
index 5065d8b..0000000
--- a/cmd/watch/main.go
+++ /dev/null
@@ -1,83 +0,0 @@
-package main
-
-import (
- "encoding/json"
- "fmt"
- "io"
- "log"
- "net/http"
- "os"
-
- "flyscrape"
-
- "github.com/cornelk/hashmap"
- "github.com/inancgumus/screen"
-)
-
-func main() {
- if len(os.Args) != 2 {
- fmt.Println("Please provide a file to run.")
- os.Exit(1)
- }
-
- cache := hashmap.New[string, string]()
-
- err := flyscrape.Watch(os.Args[1], func(s string) error {
- opts, scrape, err := flyscrape.Compile(s)
- if err == nil {
- run(cache, opts, scrape)
- }
- return nil
- })
- if err != nil {
- log.Fatal(err)
- }
-}
-
-func run(cache *hashmap.Map[string, string], opts flyscrape.ScrapeOptions, fn flyscrape.ScrapeFunc) {
- opts.Depth = 0
-
- svc := flyscrape.Scraper{
- Concurrency: 20,
- ScrapeOptions: opts,
- ScrapeFunc: fn,
- FetchFunc: func(url string) (string, error) {
- if html, ok := cache.Get(url); ok {
- return html, nil
- }
- html, err := fetch(url)
- if err != nil {
- return "", err
- }
- cache.Set(url, html)
- return html, nil
- },
- }
-
- result := <-svc.Scrape()
- if result.Error != nil {
- fmt.Println(result.Error)
- }
-
- screen.Clear()
- screen.MoveTopLeft()
-
- enc := json.NewEncoder(os.Stdout)
- enc.SetEscapeHTML(false)
- enc.SetIndent("", " ")
- enc.Encode(result)
-}
-
-func fetch(url string) (string, error) {
- resp, err := http.Get(url)
- if err != nil {
- return "", err
- }
- defer resp.Body.Close()
- body, err := io.ReadAll(resp.Body)
- if err != nil {
- return "", err
- }
-
- return string(body), nil
-}
diff --git a/fetch.go b/fetch.go
new file mode 100644
index 0000000..68b10b6
--- /dev/null
+++ b/fetch.go
@@ -0,0 +1,51 @@
+package flyscrape
+
+import (
+ "io"
+ "net/http"
+
+ "github.com/cornelk/hashmap"
+)
+
+func CachedFetch() FetchFunc {
+ cache := hashmap.New[string, string]()
+
+ return func(url string) (string, error) {
+ if html, ok := cache.Get(url); ok {
+ return html, nil
+ }
+
+ resp, err := http.Get(url)
+ if err != nil {
+ return "", err
+ }
+ defer resp.Body.Close()
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return "", err
+ }
+
+ html := string(body)
+ cache.Set(url, html)
+ return html, nil
+ }
+}
+
+func Fetch() FetchFunc {
+ return func(url string) (string, error) {
+ resp, err := http.Get(url)
+ if err != nil {
+ return "", err
+ }
+ defer resp.Body.Close()
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return "", err
+ }
+
+ html := string(body)
+ return html, nil
+ }
+}
diff --git a/scrape.go b/scrape.go
index 3706510..0c31518 100644
--- a/scrape.go
+++ b/scrape.go
@@ -59,6 +59,9 @@ func (s *Scraper) Scrape() <-chan ScrapeResult {
if s.Concurrency == 0 {
s.Concurrency = 1
}
+ if s.FetchFunc == nil {
+ s.FetchFunc = Fetch()
+ }
jobs := make(chan target, 1024)
results := make(chan result)
diff --git a/utils.go b/utils.go
new file mode 100644
index 0000000..b3aca0b
--- /dev/null
+++ b/utils.go
@@ -0,0 +1,13 @@
+package flyscrape
+
+import (
+ "encoding/json"
+ "os"
+)
+
+func PrettyPrint(v any) {
+ enc := json.NewEncoder(os.Stdout)
+ enc.SetEscapeHTML(false)
+ enc.SetIndent("", " ")
+ enc.Encode(v)
+}
diff --git a/watch.go b/watch.go
index 864557b..1f62acb 100644
--- a/watch.go
+++ b/watch.go
@@ -43,7 +43,7 @@ func Watch(path string, fn func(string) error) error {
return nil
}
if event.Has(fsnotify.Remove) {
- return nil
+ continue
}
if event.Has(fsnotify.Chmod) {
continue