summary refs log tree commit diff
diff options
context:
space:
mode:
author Philipp Tanlak <philipp.tanlak@gmail.com> 2023-08-16 19:05:24 +0200
committer Philipp Tanlak <philipp.tanlak@gmail.com> 2023-08-16 19:05:24 +0200
commit d82e66800478219dd924c6969bd91dbfe004fc9d (patch)
tree 2c8211096fa38d619c7f5260b0193aea3c1a56ff
parent 807796ad35b48c58f61f6c058e12ec10078fd0e3 (diff)
create new command
-rw-r--r-- cmd/flyscrape/main.go 3
-rw-r--r-- cmd/flyscrape/new.go 52
-rw-r--r-- cmd/flyscrape/run.go 4
-rw-r--r-- cmd/flyscrape/watch.go 5
-rw-r--r-- js/embed.go 3
-rw-r--r-- js/template.js 27
6 files changed, 91 insertions, 3 deletions
diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go
index ab57d02..299e7e5 100644
--- a/cmd/flyscrape/main.go
+++ b/cmd/flyscrape/main.go
@@ -30,6 +30,8 @@ func (m *Main) Run(args []string) error {
}
switch cmd {
+ case "new":
+ return (&NewCommand{}).Run(args)
case "run":
return (&RunCommand{}).Run(args)
case "watch":
@@ -53,6 +55,7 @@ Usage:
Commands:
+ new creates a sample scraping script
run runs a scraping script
watch watches and re-runs a scraping script
`[1:])
diff --git a/cmd/flyscrape/new.go b/cmd/flyscrape/new.go
new file mode 100644
index 0000000..7a4c662
--- /dev/null
+++ b/cmd/flyscrape/new.go
@@ -0,0 +1,52 @@
+package main
+
+import (
+ "flag"
+ "fmt"
+ "os"
+
+ "flyscrape/js"
+)
+
+type NewCommand struct{}
+
+func (c *NewCommand) Run(args []string) error {
+ fs := flag.NewFlagSet("flyscrape-new", flag.ContinueOnError)
+ fs.Usage = c.Usage
+
+ if err := fs.Parse(args); err != nil {
+ return err
+ } else if fs.NArg() == 0 || fs.Arg(0) == "" {
+ return fmt.Errorf("script path required")
+ } else if fs.NArg() > 1 {
+ return fmt.Errorf("too many arguments")
+ }
+
+ script := fs.Arg(0)
+ if _, err := os.Stat(script); err == nil {
+ return fmt.Errorf("script already exists")
+ }
+
+ if err := os.WriteFile(script, js.Template, 0o644); err != nil {
+ return fmt.Errorf("failed to create script %q: %w", script, err)
+ }
+
+ fmt.Printf("Scraping script %v created.\n", script)
+ return nil
+}
+
+func (c *NewCommand) Usage() {
+ fmt.Println(`
+The new command creates a new scraping script.
+
+Usage:
+
+ flyscrape new SCRIPT
+
+
+Examples:
+
+ # Create a new scraping script.
+ $ flyscrape new example.js
+`[1:])
+}
diff --git a/cmd/flyscrape/run.go b/cmd/flyscrape/run.go
index ca037d2..8ec9390 100644
--- a/cmd/flyscrape/run.go
+++ b/cmd/flyscrape/run.go
@@ -28,12 +28,12 @@ func (c *RunCommand) Run(args []string) error {
script := fs.Arg(0)
src, err := os.ReadFile(script)
if err != nil {
- return fmt.Errorf("failed to read script %q: %v", script, err)
+ return fmt.Errorf("failed to read script %q: %w", script, err)
}
opts, scrape, err := flyscrape.Compile(string(src))
if err != nil {
- return fmt.Errorf("failed to compile script: %v", err)
+ return fmt.Errorf("failed to compile script: %w", err)
}
svc := flyscrape.Scraper{
diff --git a/cmd/flyscrape/watch.go b/cmd/flyscrape/watch.go
index ca006db..99fac4e 100644
--- a/cmd/flyscrape/watch.go
+++ b/cmd/flyscrape/watch.go
@@ -27,7 +27,7 @@ func (c *WatchCommand) Run(args []string) error {
fetch := flyscrape.CachedFetch()
script := fs.Arg(0)
- flyscrape.Watch(script, func(s string) error {
+ err := flyscrape.Watch(script, func(s string) error {
opts, scrape, err := flyscrape.Compile(s)
if err != nil {
log.Println(err)
@@ -53,6 +53,9 @@ func (c *WatchCommand) Run(args []string) error {
flyscrape.PrettyPrint(result)
return nil
})
+ if err != nil && err != flyscrape.StopWatch {
+ return fmt.Errorf("failed to watch script %q: %w", script, err)
+ }
return nil
}
diff --git a/js/embed.go b/js/embed.go
index 5413e77..dcc8d93 100644
--- a/js/embed.go
+++ b/js/embed.go
@@ -6,3 +6,6 @@ import _ "embed"
//go:embed flyscrape_bundle.js
var Flyscrape string
+
+//go:embed template.js
+var Template []byte
diff --git a/js/template.js b/js/template.js
new file mode 100644
index 0000000..d33adc5
--- /dev/null
+++ b/js/template.js
@@ -0,0 +1,27 @@
+import { parse } from "flyscrape";
+
+export const options = {
+ url: "https://news.ycombinator.com/", // Specify the URL to start scraping from.
+ depth: 1, // Specify how deep links should be followed (0 = no follow).
+ allowedDomains: ["news.ycombinator.com"], // Specify the allowed domains to follow.
+}
+
+export default function({ html, url }) {
+ const $ = parse(html);
+
+ return {
+ title: $('title').text(),
+ entries: $('.athing').toArray().map(entry => {
+ const link = $(entry).find('.titleline > a');
+ const rank = $(entry).find('.rank');
+ const points = $(entry).next().find('.score');
+
+ return {
+ title: link.text(), // Extract the title text.
+ url: link.attr('href'), // Extract the link href.
+ rank: parseInt(rank.text().slice(0, -1)), // Extract and cleanup the rank.
+ points: parseInt(points.text().replace(' points', '')), // Extract and cleanup the points.
+ }
+ }),
+ };
+}