From 062b36fe5725d1267c66db2e506b4131d78ce772 Mon Sep 17 00:00:00 2001 From: Philipp Tanlak Date: Fri, 11 Aug 2023 18:31:20 +0200 Subject: simplify project structure --- cmd/flyscrape/main.go | 31 +++++++++++-------- cmd/watch/main.go | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 12 deletions(-) create mode 100644 cmd/watch/main.go (limited to 'cmd') diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go index fb31056..85e40a9 100644 --- a/cmd/flyscrape/main.go +++ b/cmd/flyscrape/main.go @@ -7,9 +7,9 @@ import ( "io" "net/http" "os" + "time" - "flyscrape/flyscrape" - "flyscrape/js" + "flyscrape" ) func main() { @@ -17,14 +17,20 @@ func main() { exit("Please provide a file to run.") } - opts, scrape, err := js.Compile(os.Args[1]) + src, err := os.ReadFile(os.Args[1]) + if err != nil { + exit(fmt.Sprintf("Error reading file: %v", err)) + } + + opts, scrape, err := flyscrape.Compile(string(src)) if err != nil { exit(fmt.Sprintf("Error compiling JavaScript file: %v", err)) } - svc := flyscrape.Service{ - ScrapeOptions: *opts, + svc := flyscrape.Scraper{ + ScrapeOptions: opts, ScrapeFunc: scrape, + Concurrency: 5, FetchFunc: func(url string) (string, error) { resp, err := http.Get(url) if err != nil { @@ -39,14 +45,15 @@ func main() { return string(data), nil }, } - results := svc.Scrape() - if err != nil { - } - fmt.Printf("%T\n", results[0]) - data, _ := json.MarshalIndent(results, "", " ") - fmt.Println(string(data)) - return + count := 0 + start := time.Now() + for result := range svc.Scrape() { + data, _ := json.MarshalIndent(result, "", " ") + fmt.Println(string(data)) + count++ + } + fmt.Printf("Scraped %d websites in %v\n", count, time.Since(start)) } func exit(msg string) { diff --git a/cmd/watch/main.go b/cmd/watch/main.go new file mode 100644 index 0000000..5065d8b --- /dev/null +++ b/cmd/watch/main.go @@ -0,0 +1,83 @@ +package main + +import ( + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "os" + + "flyscrape" + + "github.com/cornelk/hashmap" + "github.com/inancgumus/screen" +) + +func main() { + if len(os.Args) != 2 { + fmt.Println("Please provide a file to run.") + os.Exit(1) + } + + cache := hashmap.New[string, string]() + + err := flyscrape.Watch(os.Args[1], func(s string) error { + opts, scrape, err := flyscrape.Compile(s) + if err == nil { + run(cache, opts, scrape) + } + return nil + }) + if err != nil { + log.Fatal(err) + } +} + +func run(cache *hashmap.Map[string, string], opts flyscrape.ScrapeOptions, fn flyscrape.ScrapeFunc) { + opts.Depth = 0 + + svc := flyscrape.Scraper{ + Concurrency: 20, + ScrapeOptions: opts, + ScrapeFunc: fn, + FetchFunc: func(url string) (string, error) { + if html, ok := cache.Get(url); ok { + return html, nil + } + html, err := fetch(url) + if err != nil { + return "", err + } + cache.Set(url, html) + return html, nil + }, + } + + result := <-svc.Scrape() + if result.Error != nil { + fmt.Println(result.Error) + } + + screen.Clear() + screen.MoveTopLeft() + + enc := json.NewEncoder(os.Stdout) + enc.SetEscapeHTML(false) + enc.SetIndent("", " ") + enc.Encode(result) +} + +func fetch(url string) (string, error) { + resp, err := http.Get(url) + if err != nil { + return "", err + } + defer resp.Body.Close() + body, err := io.ReadAll(resp.Body) + if err != nil { + return "", err + } + + return string(body), nil +} -- cgit v1.2.3