diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-12-04 17:35:06 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-12-04 17:35:06 +0100 |
| commit | 8c68e0ed414bfb323d6e94db55c95db13797ef8e (patch) | |
| tree | 379fd505aec7e54d9f7dcc7b7d6ae5264f4b956a | |
| parent | 807b9a1693645270609d4d795aa2b8eeacaae40e (diff) | |
Make output file and format configurable (#24)
| -rw-r--r-- | cmd/flyscrape/main.go | 4 | ||||
| -rw-r--r-- | cmd/main.go | 11 | ||||
| -rw-r--r-- | cmd/run.go | 6 | ||||
| -rw-r--r-- | modules/output/json/json.go | 130 | ||||
| -rw-r--r-- | modules/output/ndjson/ndjson.go (renamed from modules/jsonprint/jsonprint.go) | 69 | ||||
| -rw-r--r-- | scrape.go | 2 | ||||
| -rw-r--r-- | utils.go | 11 |
7 files changed, 195 insertions, 38 deletions
diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go index 8b9d430..9772648 100644 --- a/cmd/flyscrape/main.go +++ b/cmd/flyscrape/main.go @@ -15,7 +15,9 @@ import ( _ "github.com/philippta/flyscrape/modules/depth" _ "github.com/philippta/flyscrape/modules/domainfilter" _ "github.com/philippta/flyscrape/modules/followlinks" - _ "github.com/philippta/flyscrape/modules/jsonprint" + _ "github.com/philippta/flyscrape/modules/headers" + _ "github.com/philippta/flyscrape/modules/output/json" + _ "github.com/philippta/flyscrape/modules/output/ndjson" _ "github.com/philippta/flyscrape/modules/proxy" _ "github.com/philippta/flyscrape/modules/ratelimit" _ "github.com/philippta/flyscrape/modules/starturl" diff --git a/cmd/main.go b/cmd/main.go index eab3e03..b4f8d1d 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -11,17 +11,6 @@ import ( "log" "os" "strings" - - _ "github.com/philippta/flyscrape/modules/cache" - _ "github.com/philippta/flyscrape/modules/depth" - _ "github.com/philippta/flyscrape/modules/domainfilter" - _ "github.com/philippta/flyscrape/modules/followlinks" - _ "github.com/philippta/flyscrape/modules/headers" - _ "github.com/philippta/flyscrape/modules/jsonprint" - _ "github.com/philippta/flyscrape/modules/proxy" - _ "github.com/philippta/flyscrape/modules/ratelimit" - _ "github.com/philippta/flyscrape/modules/starturl" - _ "github.com/philippta/flyscrape/modules/urlfilter" ) func main() { @@ -53,5 +53,11 @@ Examples: # Follow paginated links. $ flyscrape run example.js --depth 5 --follow ".next-button > a" + + # Set the output format to ndjson. + $ flyscrape run example.js --output.format ndjson + + # Write the output to a file. + $ flyscrape run example.js --output.file results.json `[1:]) } diff --git a/modules/output/json/json.go b/modules/output/json/json.go new file mode 100644 index 0000000..5b4e9d0 --- /dev/null +++ b/modules/output/json/json.go @@ -0,0 +1,130 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package json + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "log" + "os" + "time" + + "github.com/philippta/flyscrape" +) + +func init() { + flyscrape.RegisterModule(Module{}) +} + +type Module struct { + Output struct { + Format string `json:"format"` + File string `json:"file"` + } `json:"output"` + + once bool + w io.WriteCloser +} + +func (Module) ModuleInfo() flyscrape.ModuleInfo { + return flyscrape.ModuleInfo{ + ID: "output.json", + New: func() flyscrape.Module { return new(Module) }, + } +} + +func (m *Module) Provision(ctx flyscrape.Context) { + if m.disabled() { + return + } + + if m.Output.File == "" { + m.w = nopCloser{os.Stdout} + return + } + + f, err := os.Create(m.Output.File) + if err != nil { + log.Printf("failed to create file %q: %v", m.Output.File, err) + os.Exit(1) + } + m.w = f +} + +func (m *Module) ReceiveResponse(resp *flyscrape.Response) { + if m.disabled() { + return + } + + if resp.Error == nil && resp.Data == nil { + return + } + + if !m.once { + fmt.Fprintln(m.w, "[") + m.once = true + } else { + fmt.Fprintln(m.w, ",") + } + + o := output{ + URL: resp.Request.URL, + Data: resp.Data, + Timestamp: time.Now(), + } + if resp.Error != nil { + o.Error = resp.Error.Error() + } + + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + enc.SetEscapeHTML(false) + enc.SetIndent(" ", " ") + enc.Encode(o) + + fmt.Fprint(m.w, " ") + fmt.Fprint(m.w, buf.String()[:buf.Len()-1]) +} + +func (m *Module) Finalize() { + if m.disabled() { + return + } + if m.once { + fmt.Fprintln(m.w, "\n]") + } + m.w.Close() +} + +func (m *Module) disabled() bool { + return m.Output.Format != "json" && m.Output.Format != "" +} + +type output struct { + URL string `json:"url,omitempty"` + Data any `json:"data,omitempty"` + Error string `json:"error,omitempty"` + Timestamp time.Time `json:"timestamp,omitempty"` +} + +type nopCloser struct { + io.Writer +} + +func (c nopCloser) Write(p []byte) (n int, err error) { + return c.Writer.Write(p) +} + +func (c nopCloser) Close() error { + return nil +} + +var ( + _ flyscrape.Provisioner = (*Module)(nil) + _ flyscrape.ResponseReceiver = (*Module)(nil) + _ flyscrape.Finalizer = (*Module)(nil) +) diff --git a/modules/jsonprint/jsonprint.go b/modules/output/ndjson/ndjson.go index c40a8b9..956b2ed 100644 --- a/modules/jsonprint/jsonprint.go +++ b/modules/output/ndjson/ndjson.go @@ -2,10 +2,13 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -package jsonprint +package ndjson import ( - "fmt" + "encoding/json" + "io" + "log" + "os" "time" "github.com/philippta/flyscrape" @@ -16,26 +19,46 @@ func init() { } type Module struct { - once bool + Output struct { + Format string `json:"format"` + File string `json:"file"` + } `json:"output"` + + w io.WriteCloser } func (Module) ModuleInfo() flyscrape.ModuleInfo { return flyscrape.ModuleInfo{ - ID: "jsonprint", + ID: "output.ndjson", New: func() flyscrape.Module { return new(Module) }, } } +func (m *Module) Provision(ctx flyscrape.Context) { + if m.disabled() { + return + } + + if m.Output.File == "" { + m.w = nopCloser{os.Stdout} + return + } + + f, err := os.Create(m.Output.File) + if err != nil { + log.Printf("failed to create file %q: %v", m.Output.File, err) + os.Exit(1) + } + m.w = f +} + func (m *Module) ReceiveResponse(resp *flyscrape.Response) { - if resp.Error == nil && resp.Data == nil { + if m.disabled() { return } - if !m.once { - fmt.Println("[") - m.once = true - } else { - fmt.Println(",") + if resp.Error == nil && resp.Data == nil { + return } o := output{ @@ -47,13 +70,20 @@ func (m *Module) ReceiveResponse(resp *flyscrape.Response) { o.Error = resp.Error.Error() } - fmt.Print(flyscrape.Prettify(o, " ")) + enc := json.NewEncoder(m.w) + enc.SetEscapeHTML(false) + enc.Encode(o) } func (m *Module) Finalize() { - if m.once { - fmt.Println("\n]") + if m.disabled() { + return } + m.w.Close() +} + +func (m *Module) disabled() bool { + return m.Output.Format != "ndjson" } type output struct { @@ -63,7 +93,20 @@ type output struct { Timestamp time.Time `json:"timestamp,omitempty"` } +type nopCloser struct { + io.Writer +} + +func (c nopCloser) Write(p []byte) (n int, err error) { + return c.Writer.Write(p) +} + +func (c nopCloser) Close() error { + return nil +} + var ( + _ flyscrape.Provisioner = (*Module)(nil) _ flyscrape.ResponseReceiver = (*Module)(nil) _ flyscrape.Finalizer = (*Module)(nil) ) @@ -14,8 +14,6 @@ import ( "github.com/cornelk/hashmap" ) -type FetchFunc func(url string) (string, error) - type Context interface { ScriptName() string Visit(url string) @@ -5,8 +5,6 @@ package flyscrape import ( - "bytes" - "encoding/json" "fmt" "io" "net/http" @@ -15,15 +13,6 @@ import ( const HeaderBypassCache = "X-Flyscrape-Bypass-Cache" -func Prettify(v any, prefix string) string { - var buf bytes.Buffer - enc := json.NewEncoder(&buf) - enc.SetEscapeHTML(false) - enc.SetIndent(prefix, " ") - enc.Encode(v) - return prefix + strings.TrimSuffix(buf.String(), "\n") -} - type RoundTripFunc func(*http.Request) (*http.Response, error) func (f RoundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) { |