diff options
| field | value | date |
|---|---|---|
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-16 18:25:04 +0200 |
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-16 18:25:04 +0200 |
| commit | 807796ad35b48c58f61f6c058e12ec10078fd0e3 (patch) | |
| tree | 872b1eace112a066099a18c03191e2c7e162e35b /cmd | |
| parent | 062b36fe5725d1267c66db2e506b4131d78ce772 (diff) | |
create cli
Diffstat (limited to 'cmd')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | cmd/flyscrape/main.go | 85 |
| -rw-r--r-- | cmd/flyscrape/run.go | 78 |
| -rw-r--r-- | cmd/flyscrape/watch.go | 75 |
| -rw-r--r-- | cmd/watch/main.go | 83 |
4 files changed, 194 insertions, 127 deletions
diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go index 85e40a9..ab57d02 100644 --- a/cmd/flyscrape/main.go +++ b/cmd/flyscrape/main.go @@ -2,61 +2,58 @@ package main import ( _ "embed" - "encoding/json" + "flag" "fmt" - "io" - "net/http" + "log" "os" - "time" - - "flyscrape" + "strings" ) func main() { - if len(os.Args) != 2 { - exit("Please provide a file to run.") - } - - src, err := os.ReadFile(os.Args[1]) - if err != nil { - exit(fmt.Sprintf("Error reading file: %v", err)) + log.SetFlags(0) + + m := &Main{} + if err := m.Run(os.Args[1:]); err == flag.ErrHelp { + os.Exit(1) + } else if err != nil { + log.Println(err) + os.Exit(1) } +} - opts, scrape, err := flyscrape.Compile(string(src)) - if err != nil { - exit(fmt.Sprintf("Error compiling JavaScript file: %v", err)) - } +type Main struct{} - svc := flyscrape.Scraper{ - ScrapeOptions: opts, - ScrapeFunc: scrape, - Concurrency: 5, - FetchFunc: func(url string) (string, error) { - resp, err := http.Get(url) - if err != nil { - return "", err - } - defer resp.Body.Close() - - data, err := io.ReadAll(resp.Body) - if err != nil { - return "", err - } - return string(data), nil - }, +func (m *Main) Run(args []string) error { + var cmd string + if len(args) > 0 { + cmd, args = args[0], args[1:] } - count := 0 - start := time.Now() - for result := range svc.Scrape() { - data, _ := json.MarshalIndent(result, "", " ") - fmt.Println(string(data)) - count++ + switch cmd { + case "run": + return (&RunCommand{}).Run(args) + case "watch": + return (&WatchCommand{}).Run(args) + default: + if cmd == "" || cmd == "help" || strings.HasPrefix(cmd, "-") { + m.Usage() + return flag.ErrHelp + } + return fmt.Errorf("flyscrape %s: unknown command", cmd) } - fmt.Printf("Scraped %d websites in %v\n", count, time.Since(start)) } -func exit(msg string) { - fmt.Fprintln(os.Stderr, msg) - os.Exit(1) +func (m *Main) Usage() { + fmt.Println(` +flyscrape is an elegant scraping tool for efficiently extracting data from websites. 
+ +Usage: + + flyscrape <command> [arguments] + +Commands: + + run runs a scraping script + watch watches and re-runs a scraping script +`[1:]) } diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go new file mode 100644 index 0000000..ca037d2 --- /dev/null +++ b/cmd/flyscrape/run.go @@ -0,0 +1,78 @@ +package main + +import ( + "flag" + "fmt" + "log" + "os" + "time" + + "flyscrape" +) + +type RunCommand struct{} + +func (c *RunCommand) Run(args []string) error { + fs := flag.NewFlagSet("flyscrape-run", flag.ContinueOnError) + concurrent := fs.Int("concurrent", 0, "concurrency") + fs.Usage = c.Usage + + if err := fs.Parse(args); err != nil { + return err + } else if fs.NArg() == 0 || fs.Arg(0) == "" { + return fmt.Errorf("script path required") + } else if fs.NArg() > 1 { + return fmt.Errorf("too many arguments") + } + + script := fs.Arg(0) + src, err := os.ReadFile(script) + if err != nil { + return fmt.Errorf("failed to read script %q: %v", script, err) + } + + opts, scrape, err := flyscrape.Compile(string(src)) + if err != nil { + return fmt.Errorf("failed to compile script: %v", err) + } + + svc := flyscrape.Scraper{ + ScrapeOptions: opts, + ScrapeFunc: scrape, + Concurrency: *concurrent, + } + + count := 0 + start := time.Now() + for result := range svc.Scrape() { + flyscrape.PrettyPrint(result) + count++ + } + log.Printf("Scraped %d websites in %v\n", count, time.Since(start)) + + return nil +} + +func (c *RunCommand) Usage() { + fmt.Println(` +The run command runs the scraping script. + +Usage: + + flyscrape run SCRIPT + +Arguments: + + -concurrent NUM + Determines the number of concurrent requests. + + +Examples: + + # Run the script. + $ flyscrape run example.js + + # Run the script with 10 concurrent requests. 
+ $ flyscrape run -concurrent 10 example.js +`[1:]) +} diff --git a/cmd/flyscrape/watch.go b/cmd/flyscrape/watch.go new file mode 100644 index 0000000..ca006db --- /dev/null +++ b/cmd/flyscrape/watch.go @@ -0,0 +1,75 @@ +package main + +import ( + "flag" + "fmt" + "log" + + "flyscrape" + + "github.com/inancgumus/screen" +) + +type WatchCommand struct{} + +func (c *WatchCommand) Run(args []string) error { + fs := flag.NewFlagSet("flyscrape-watch", flag.ContinueOnError) + fs.Usage = c.Usage + + if err := fs.Parse(args); err != nil { + return err + } else if fs.NArg() == 0 || fs.Arg(0) == "" { + return fmt.Errorf("script path required") + } else if fs.NArg() > 1 { + return fmt.Errorf("too many arguments") + } + + fetch := flyscrape.CachedFetch() + script := fs.Arg(0) + + flyscrape.Watch(script, func(s string) error { + opts, scrape, err := flyscrape.Compile(s) + if err != nil { + log.Println(err) + // ignore compilation errors + return nil + } + + opts.Depth = 0 + scr := flyscrape.Scraper{ + ScrapeOptions: opts, + ScrapeFunc: scrape, + FetchFunc: fetch, + } + + result := <-scr.Scrape() + if result.Error != nil { + log.Println(result.Error) + return nil + } + + screen.Clear() + screen.MoveTopLeft() + flyscrape.PrettyPrint(result) + return nil + }) + + return nil +} + +func (c *WatchCommand) Usage() { + fmt.Println(` +The watch command watches the scraping script and re-runs it on any change. +Recursive scraping is disabled in this mode, only the initial URL will be scraped. + +Usage: + + flyscrape watch SCRIPT + + +Examples: + + # Run and watch script. 
+ $ flyscrape watch example.js +`[1:]) +} diff --git a/cmd/watch/main.go b/cmd/watch/main.go deleted file mode 100644 index 5065d8b..0000000 --- a/cmd/watch/main.go +++ /dev/null @@ -1,83 +0,0 @@ -package main - -import ( - "encoding/json" - "fmt" - "io" - "log" - "net/http" - "os" - - "flyscrape" - - "github.com/cornelk/hashmap" - "github.com/inancgumus/screen" -) - -func main() { - if len(os.Args) != 2 { - fmt.Println("Please provide a file to run.") - os.Exit(1) - } - - cache := hashmap.New[string, string]() - - err := flyscrape.Watch(os.Args[1], func(s string) error { - opts, scrape, err := flyscrape.Compile(s) - if err == nil { - run(cache, opts, scrape) - } - return nil - }) - if err != nil { - log.Fatal(err) - } -} - -func run(cache *hashmap.Map[string, string], opts flyscrape.ScrapeOptions, fn flyscrape.ScrapeFunc) { - opts.Depth = 0 - - svc := flyscrape.Scraper{ - Concurrency: 20, - ScrapeOptions: opts, - ScrapeFunc: fn, - FetchFunc: func(url string) (string, error) { - if html, ok := cache.Get(url); ok { - return html, nil - } - html, err := fetch(url) - if err != nil { - return "", err - } - cache.Set(url, html) - return html, nil - }, - } - - result := <-svc.Scrape() - if result.Error != nil { - fmt.Println(result.Error) - } - - screen.Clear() - screen.MoveTopLeft() - - enc := json.NewEncoder(os.Stdout) - enc.SetEscapeHTML(false) - enc.SetIndent("", " ") - enc.Encode(result) -} - -func fetch(url string) (string, error) { - resp, err := http.Get(url) - if err != nil { - return "", err - } - defer resp.Body.Close() - body, err := io.ReadAll(resp.Body) - if err != nil { - return "", err - } - - return string(body), nil -} |