diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-16 19:05:24 +0200 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-16 19:05:24 +0200 |
| commit | d82e66800478219dd924c6969bd91dbfe004fc9d (patch) | |
| tree | 2c8211096fa38d619c7f5260b0193aea3c1a56ff | |
| parent | 807796ad35b48c58f61f6c058e12ec10078fd0e3 (diff) | |
create new command
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | cmd/flyscrape/main.go | 3 |
| -rw-r--r-- | cmd/flyscrape/new.go | 52 |
| -rw-r--r-- | cmd/flyscrape/run.go | 4 |
| -rw-r--r-- | cmd/flyscrape/watch.go | 5 |
| -rw-r--r-- | js/embed.go | 3 |
| -rw-r--r-- | js/template.js | 27 |
6 files changed, 91 insertions, 3 deletions
diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go index ab57d02..299e7e5 100644 --- a/cmd/flyscrape/main.go +++ b/cmd/flyscrape/main.go @@ -30,6 +30,8 @@ func (m *Main) Run(args []string) error { } switch cmd { + case "new": + return (&NewCommand{}).Run(args) case "run": return (&RunCommand{}).Run(args) case "watch": @@ -53,6 +55,7 @@ Usage: Commands: + new creates a sample scraping script run runs a scraping script watch watches and re-runs a scraping script `[1:]) diff --git a/cmd/flyscrape/new.go b/cmd/flyscrape/new.go new file mode 100644 index 0000000..7a4c662 --- /dev/null +++ b/cmd/flyscrape/new.go @@ -0,0 +1,52 @@ +package main + +import ( + "flag" + "fmt" + "os" + + "flyscrape/js" +) + +type NewCommand struct{} + +func (c *NewCommand) Run(args []string) error { + fs := flag.NewFlagSet("flyscrape-new", flag.ContinueOnError) + fs.Usage = c.Usage + + if err := fs.Parse(args); err != nil { + return err + } else if fs.NArg() == 0 || fs.Arg(0) == "" { + return fmt.Errorf("script path required") + } else if fs.NArg() > 1 { + return fmt.Errorf("too many arguments") + } + + script := fs.Arg(0) + if _, err := os.Stat(script); err == nil { + return fmt.Errorf("script already exists") + } + + if err := os.WriteFile(script, js.Template, 0o644); err != nil { + return fmt.Errorf("failed to create script %q: %w", script, err) + } + + fmt.Printf("Scraping script %v created.\n", script) + return nil +} + +func (c *NewCommand) Usage() { + fmt.Println(` +The new command creates a new scraping script. + +Usage: + + flyscrape new SCRIPT + + +Examples: + + # Create a new scraping script. 
+ $ flyscrape new example.js +`[1:]) +} diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go index ca037d2..8ec9390 100644 --- a/cmd/flyscrape/run.go +++ b/cmd/flyscrape/run.go @@ -28,12 +28,12 @@ func (c *RunCommand) Run(args []string) error { script := fs.Arg(0) src, err := os.ReadFile(script) if err != nil { - return fmt.Errorf("failed to read script %q: %v", script, err) + return fmt.Errorf("failed to read script %q: %w", script, err) } opts, scrape, err := flyscrape.Compile(string(src)) if err != nil { - return fmt.Errorf("failed to compile script: %v", err) + return fmt.Errorf("failed to compile script: %w", err) } svc := flyscrape.Scraper{ diff --git a/cmd/flyscrape/watch.go b/cmd/flyscrape/watch.go index ca006db..99fac4e 100644 --- a/cmd/flyscrape/watch.go +++ b/cmd/flyscrape/watch.go @@ -27,7 +27,7 @@ func (c *WatchCommand) Run(args []string) error { fetch := flyscrape.CachedFetch() script := fs.Arg(0) - flyscrape.Watch(script, func(s string) error { + err := flyscrape.Watch(script, func(s string) error { opts, scrape, err := flyscrape.Compile(s) if err != nil { log.Println(err) @@ -53,6 +53,9 @@ func (c *WatchCommand) Run(args []string) error { flyscrape.PrettyPrint(result) return nil }) + if err != nil && err != flyscrape.StopWatch { + return fmt.Errorf("failed to watch script %q: %w", script, err) + } return nil } diff --git a/js/embed.go b/js/embed.go index 5413e77..dcc8d93 100644 --- a/js/embed.go +++ b/js/embed.go @@ -6,3 +6,6 @@ import _ "embed" //go:embed flyscrape_bundle.js var Flyscrape string + +//go:embed template.js +var Template []byte diff --git a/js/template.js b/js/template.js new file mode 100644 index 0000000..d33adc5 --- /dev/null +++ b/js/template.js @@ -0,0 +1,27 @@ +import { parse } from "flyscrape"; + +export const options = { + url: "https://news.ycombinator.com/", // Specify the URL to start scraping from. + depth: 1, // Specify how deep links should be followed (0 = no follow). 
+ allowedDomains: ["news.ycombinator.com"], // Specify the allowed domains to follow. +} + +export default function({ html, url }) { + const $ = parse(html); + + return { + title: $('title').text(), + entries: $('.athing').toArray().map(entry => { + const link = $(entry).find('.titleline > a'); + const rank = $(entry).find('.rank'); + const points = $(entry).next().find('.score'); + + return { + title: link.text(), // Extract the title text. + url: link.attr('href'), // Extract the link href. + rank: parseInt(rank.text().slice(0, -1)), // Extract and cleanup the rank. + points: parseInt(points.text().replace(' points', '')), // Extract and cleanup the points. + } + }), + }; +} |