summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- cmd/flyscrape/main.go 4
-rw-r--r-- cmd/main.go 11
-rw-r--r-- cmd/run.go 6
-rw-r--r-- modules/output/json/json.go 130
-rw-r--r-- modules/output/ndjson/ndjson.go (renamed from modules/jsonprint/jsonprint.go) 69
-rw-r--r-- scrape.go 2
-rw-r--r-- utils.go 11
7 files changed, 195 insertions, 38 deletions
diff --git a/cmd/flyscrape/main.go b/cmd/flyscrape/main.go
index 8b9d430..9772648 100644
--- a/cmd/flyscrape/main.go
+++ b/cmd/flyscrape/main.go
@@ -15,7 +15,9 @@ import (
_ "github.com/philippta/flyscrape/modules/depth"
_ "github.com/philippta/flyscrape/modules/domainfilter"
_ "github.com/philippta/flyscrape/modules/followlinks"
- _ "github.com/philippta/flyscrape/modules/jsonprint"
+ _ "github.com/philippta/flyscrape/modules/headers"
+ _ "github.com/philippta/flyscrape/modules/output/json"
+ _ "github.com/philippta/flyscrape/modules/output/ndjson"
_ "github.com/philippta/flyscrape/modules/proxy"
_ "github.com/philippta/flyscrape/modules/ratelimit"
_ "github.com/philippta/flyscrape/modules/starturl"
diff --git a/cmd/main.go b/cmd/main.go
index eab3e03..b4f8d1d 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -11,17 +11,6 @@ import (
"log"
"os"
"strings"
-
- _ "github.com/philippta/flyscrape/modules/cache"
- _ "github.com/philippta/flyscrape/modules/depth"
- _ "github.com/philippta/flyscrape/modules/domainfilter"
- _ "github.com/philippta/flyscrape/modules/followlinks"
- _ "github.com/philippta/flyscrape/modules/headers"
- _ "github.com/philippta/flyscrape/modules/jsonprint"
- _ "github.com/philippta/flyscrape/modules/proxy"
- _ "github.com/philippta/flyscrape/modules/ratelimit"
- _ "github.com/philippta/flyscrape/modules/starturl"
- _ "github.com/philippta/flyscrape/modules/urlfilter"
)
func main() {
diff --git a/cmd/run.go b/cmd/run.go
index 8c1a39d..a02b2ec 100644
--- a/cmd/run.go
+++ b/cmd/run.go
@@ -53,5 +53,11 @@ Examples:
# Follow paginated links.
$ flyscrape run example.js --depth 5 --follow ".next-button > a"
+
+ # Set the output format to ndjson.
+ $ flyscrape run example.js --output.format ndjson
+
+ # Write the output to a file.
+ $ flyscrape run example.js --output.file results.json
`[1:])
}
diff --git a/modules/output/json/json.go b/modules/output/json/json.go
new file mode 100644
index 0000000..5b4e9d0
--- /dev/null
+++ b/modules/output/json/json.go
@@ -0,0 +1,130 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package json
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "time"
+
+ "github.com/philippta/flyscrape"
+)
+
+func init() {
+ flyscrape.RegisterModule(Module{})
+}
+
+type Module struct {
+ Output struct {
+ Format string `json:"format"`
+ File string `json:"file"`
+ } `json:"output"`
+
+ once bool
+ w io.WriteCloser
+}
+
+func (Module) ModuleInfo() flyscrape.ModuleInfo {
+ return flyscrape.ModuleInfo{
+ ID: "output.json",
+ New: func() flyscrape.Module { return new(Module) },
+ }
+}
+
+func (m *Module) Provision(ctx flyscrape.Context) {
+ if m.disabled() {
+ return
+ }
+
+ if m.Output.File == "" {
+ m.w = nopCloser{os.Stdout}
+ return
+ }
+
+ f, err := os.Create(m.Output.File)
+ if err != nil {
+ log.Printf("failed to create file %q: %v", m.Output.File, err)
+ os.Exit(1)
+ }
+ m.w = f
+}
+
+func (m *Module) ReceiveResponse(resp *flyscrape.Response) {
+ if m.disabled() {
+ return
+ }
+
+ if resp.Error == nil && resp.Data == nil {
+ return
+ }
+
+ if !m.once {
+ fmt.Fprintln(m.w, "[")
+ m.once = true
+ } else {
+ fmt.Fprintln(m.w, ",")
+ }
+
+ o := output{
+ URL: resp.Request.URL,
+ Data: resp.Data,
+ Timestamp: time.Now(),
+ }
+ if resp.Error != nil {
+ o.Error = resp.Error.Error()
+ }
+
+ var buf bytes.Buffer
+ enc := json.NewEncoder(&buf)
+ enc.SetEscapeHTML(false)
+ enc.SetIndent(" ", " ")
+ enc.Encode(o)
+
+ fmt.Fprint(m.w, " ")
+ fmt.Fprint(m.w, buf.String()[:buf.Len()-1])
+}
+
+func (m *Module) Finalize() {
+ if m.disabled() {
+ return
+ }
+ if m.once {
+ fmt.Fprintln(m.w, "\n]")
+ }
+ m.w.Close()
+}
+
+func (m *Module) disabled() bool {
+ return m.Output.Format != "json" && m.Output.Format != ""
+}
+
+type output struct {
+ URL string `json:"url,omitempty"`
+ Data any `json:"data,omitempty"`
+ Error string `json:"error,omitempty"`
+ Timestamp time.Time `json:"timestamp,omitempty"`
+}
+
+type nopCloser struct {
+ io.Writer
+}
+
+func (c nopCloser) Write(p []byte) (n int, err error) {
+ return c.Writer.Write(p)
+}
+
+func (c nopCloser) Close() error {
+ return nil
+}
+
+var (
+ _ flyscrape.Provisioner = (*Module)(nil)
+ _ flyscrape.ResponseReceiver = (*Module)(nil)
+ _ flyscrape.Finalizer = (*Module)(nil)
+)
diff --git a/modules/jsonprint/jsonprint.go b/modules/output/ndjson/ndjson.go
index c40a8b9..956b2ed 100644
--- a/modules/jsonprint/jsonprint.go
+++ b/modules/output/ndjson/ndjson.go
@@ -2,10 +2,13 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
-package jsonprint
+package ndjson
import (
- "fmt"
+ "encoding/json"
+ "io"
+ "log"
+ "os"
"time"
"github.com/philippta/flyscrape"
@@ -16,26 +19,46 @@ func init() {
}
type Module struct {
- once bool
+ Output struct {
+ Format string `json:"format"`
+ File string `json:"file"`
+ } `json:"output"`
+
+ w io.WriteCloser
}
func (Module) ModuleInfo() flyscrape.ModuleInfo {
return flyscrape.ModuleInfo{
- ID: "jsonprint",
+ ID: "output.ndjson",
New: func() flyscrape.Module { return new(Module) },
}
}
+func (m *Module) Provision(ctx flyscrape.Context) {
+ if m.disabled() {
+ return
+ }
+
+ if m.Output.File == "" {
+ m.w = nopCloser{os.Stdout}
+ return
+ }
+
+ f, err := os.Create(m.Output.File)
+ if err != nil {
+ log.Printf("failed to create file %q: %v", m.Output.File, err)
+ os.Exit(1)
+ }
+ m.w = f
+}
+
func (m *Module) ReceiveResponse(resp *flyscrape.Response) {
- if resp.Error == nil && resp.Data == nil {
+ if m.disabled() {
return
}
- if !m.once {
- fmt.Println("[")
- m.once = true
- } else {
- fmt.Println(",")
+ if resp.Error == nil && resp.Data == nil {
+ return
}
o := output{
@@ -47,13 +70,20 @@ func (m *Module) ReceiveResponse(resp *flyscrape.Response) {
o.Error = resp.Error.Error()
}
- fmt.Print(flyscrape.Prettify(o, " "))
+ enc := json.NewEncoder(m.w)
+ enc.SetEscapeHTML(false)
+ enc.Encode(o)
}
func (m *Module) Finalize() {
- if m.once {
- fmt.Println("\n]")
+ if m.disabled() {
+ return
}
+ m.w.Close()
+}
+
+func (m *Module) disabled() bool {
+ return m.Output.Format != "ndjson"
}
type output struct {
@@ -63,7 +93,20 @@ type output struct {
Timestamp time.Time `json:"timestamp,omitempty"`
}
+type nopCloser struct {
+ io.Writer
+}
+
+func (c nopCloser) Write(p []byte) (n int, err error) {
+ return c.Writer.Write(p)
+}
+
+func (c nopCloser) Close() error {
+ return nil
+}
+
var (
+ _ flyscrape.Provisioner = (*Module)(nil)
_ flyscrape.ResponseReceiver = (*Module)(nil)
_ flyscrape.Finalizer = (*Module)(nil)
)
diff --git a/scrape.go b/scrape.go
index 4de36d8..3e38e46 100644
--- a/scrape.go
+++ b/scrape.go
@@ -14,8 +14,6 @@ import (
"github.com/cornelk/hashmap"
)
-type FetchFunc func(url string) (string, error)
-
type Context interface {
ScriptName() string
Visit(url string)
diff --git a/utils.go b/utils.go
index f26dc6c..eda38d5 100644
--- a/utils.go
+++ b/utils.go
@@ -5,8 +5,6 @@
package flyscrape
import (
- "bytes"
- "encoding/json"
"fmt"
"io"
"net/http"
@@ -15,15 +13,6 @@ import (
const HeaderBypassCache = "X-Flyscrape-Bypass-Cache"
-func Prettify(v any, prefix string) string {
- var buf bytes.Buffer
- enc := json.NewEncoder(&buf)
- enc.SetEscapeHTML(false)
- enc.SetIndent(prefix, " ")
- enc.Encode(v)
- return prefix + strings.TrimSuffix(buf.String(), "\n")
-}
-
type RoundTripFunc func(*http.Request) (*http.Response, error)
func (f RoundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) {