author     Philipp Tanlak <philipp.tanlak@gmail.com>    2025-04-10 12:53:29 +0200
committer  Philipp Tanlak <philipp.tanlak@gmail.com>    2025-04-10 12:53:29 +0200
commit     b15b921f633730cb88169deb787bfe893cd40aae (patch)
tree       3da9ac1bb9419399d98af5bebd3b1022421aca67
parent     bf99c233a18c3165e0d4d251b41224e5bc6eb93d (diff)
Support library usage (golib)
-rw-r--r--  examples/flyscrapelib.go      41
-rw-r--r--  flyscrape/debug.log            3
-rw-r--r--  flyscrape/flyscrape.go       247
-rw-r--r--  flyscrape/flyscrape_test.go   45
-rwxr-xr-x  flyscrape/godbg.test         bin 0 -> 29953922 bytes
5 files changed, 336 insertions, 0 deletions
diff --git a/examples/flyscrapelib.go b/examples/flyscrapelib.go
new file mode 100644
index 0000000..641e7ac
--- /dev/null
+++ b/examples/flyscrapelib.go
@@ -0,0 +1,41 @@
+package main
+
+import "github.com/philippta/flyscrape/flyscrape"
+
+var config = flyscrape.Config{
+    URL:      "https://news.ycombinator.com/",
+    Browser:  true,
+    Headless: false,
+}
+
+func scrape(ctx flyscrape.Context) any {
+    var (
+        post         = ctx.Doc.Find(".athing.submission").First()
+        title        = post.Find(".titleline > a").Text()
+        commentsLink = post.Next().Find("a").Last().Attr("href")
+        comments     = ctx.Scrape(commentsLink, scrapeComments)
+    )
+
+    return flyscrape.M{
+        "title":    title,
+        "comments": comments,
+    }
+}
+
+func scrapeComments(ctx flyscrape.Context) any {
+    var comments []flyscrape.M
+
+    for _, comment := range ctx.Doc.Find(".comtr").All() {
+        comments = append(comments, flyscrape.M{
+            "author": comment.Find(".hnuser").Text(),
+            "text":   comment.Find(".commtext").Text(),
+        })
+
+    }
+
+    return comments
+}
+
+func main() {
+    flyscrape.Run(config, scrape)
+}
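The example above leans on the browser module and a nested ctx.Scrape call. For reference, a smaller sketch of the same library entry points (Config, ConfigOutput, Run, Context, M) without those extras might look like the following; the URL and the h1 selector here are placeholders for illustration, not anything this commit ships:

package main

import "github.com/philippta/flyscrape/flyscrape"

func main() {
    // Single-page scrape: no browser module, no nested ctx.Scrape call.
    cfg := flyscrape.Config{
        URL:    "https://example.com/",
        Output: flyscrape.ConfigOutput{Format: "json"},
    }

    flyscrape.Run(cfg, func(ctx flyscrape.Context) any {
        // ctx.Doc wraps the parsed page; Find/First/Text come from the
        // Document helpers defined in flyscrape/flyscrape.go below.
        return flyscrape.M{
            "url":     ctx.URL,
            "heading": ctx.Doc.Find("h1").First().Text(),
        }
    })
}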
diff --git a/flyscrape/debug.log b/flyscrape/debug.log
new file mode 100644
index 0000000..aee1eac
--- /dev/null
+++ b/flyscrape/debug.log
@@ -0,0 +1,3 @@
+<nil>
+could not find statement at /Users/philipp/code/flyscrape-workspace/flyscrape/flyscrape/flyscrape_test.go:38, please use a line with a statement
+<nil>
diff --git a/flyscrape/flyscrape.go b/flyscrape/flyscrape.go
new file mode 100644
index 0000000..5cb068d
--- /dev/null
+++ b/flyscrape/flyscrape.go
@@ -0,0 +1,247 @@
+package flyscrape
+
+import (
+    "encoding/json"
+    "fmt"
+    "net/url"
+    "strings"
+
+    "github.com/PuerkitoBio/goquery"
+    "github.com/philippta/flyscrape"
+
+    _ "github.com/philippta/flyscrape/modules/browser"
+    _ "github.com/philippta/flyscrape/modules/cache"
+    _ "github.com/philippta/flyscrape/modules/cookies"
+    _ "github.com/philippta/flyscrape/modules/depth"
+    _ "github.com/philippta/flyscrape/modules/domainfilter"
+    _ "github.com/philippta/flyscrape/modules/followlinks"
+    _ "github.com/philippta/flyscrape/modules/headers"
+    _ "github.com/philippta/flyscrape/modules/output/json"
+    _ "github.com/philippta/flyscrape/modules/output/ndjson"
+    _ "github.com/philippta/flyscrape/modules/proxy"
+    _ "github.com/philippta/flyscrape/modules/ratelimit"
+    _ "github.com/philippta/flyscrape/modules/retry"
+    _ "github.com/philippta/flyscrape/modules/starturl"
+    _ "github.com/philippta/flyscrape/modules/urlfilter"
+)
+
+type M map[string]any
+
+type ConfigOutput struct {
+    Format string `json:"format"`
+    File   string `json:"file"`
+}
+
+type Config struct {
+    URL            string            `json:"url"`
+    URLs           []string          `json:"urls"`
+    Browser        bool              `json:"browser"`
+    Headless       bool              `json:"headless"`
+    Depth          int               `json:"depth"`
+    Follow         []string          `json:"follow"`
+    AllowedDomains []string          `json:"allowedDomains"`
+    BlockedDomains []string          `json:"blockedDomains"`
+    AllowedURLs    []string          `json:"allowedURLs"`
+    BlockedURLs    []string          `json:"blockedURLs"`
+    Rate           int               `json:"rate"`
+    Concurrency    int               `json:"concurrency"`
+    Proxy          string            `json:"proxy"`
+    Proxies        []string          `json:"proxies"`
+    Cache          string            `json:"cache"`
+    Cookies        string            `json:"cookies"`
+    Headers        map[string]string `json:"headers"`
+    Output         ConfigOutput      `json:"output"`
+}
+
+type Context struct {
+    URL         string
+    Doc         *Document
+    AbsoluteURL func(url string) string
+    Scrape      func(url string, fn ScrapeFunc) any
+}
+
+type ScrapeFunc func(ctx Context) any
+
+func Run(cfg Config, fn ScrapeFunc) {
+    jsoncfg, _ := json.Marshal(cfg)
+
+    scraper := flyscrape.NewScraper()
+    scraper.ScrapeFunc = scrapeFunc(fn)
+    scraper.Script = "main.go"
+    scraper.Modules = flyscrape.LoadModules(jsoncfg)
+    scraper.Run()
+}
+
+func scrapeFunc(fn ScrapeFunc) flyscrape.ScrapeFunc {
+    return func(p flyscrape.ScrapeParams) (any, error) {
+        doc, err := goquery.NewDocumentFromReader(strings.NewReader(p.HTML))
+        if err != nil {
+            return nil, fmt.Errorf("new document from reader: %w", err)
+        }
+
+        baseurl, err := url.Parse(p.URL)
+        if err != nil {
+            return nil, fmt.Errorf("parse url: %w", err)
+        }
+
+        absoluteURL := func(ref string) string {
+            abs, err := baseurl.Parse(ref)
+            if err != nil {
+                return ref
+            }
+            return abs.String()
+        }
+
+        ctx := Context{
+            URL:         p.URL,
+            Doc:         NewDocument(doc.Selection),
+            AbsoluteURL: absoluteURL,
+            Scrape: func(url string, sfn ScrapeFunc) any {
+                url = absoluteURL(url)
+
+                html, err := p.Process(url)
+                if err != nil {
+                    return M{"error": err.Error()}
+                }
+
+                newp := flyscrape.ScrapeParams{
+                    HTML:    string(html),
+                    URL:     url,
+                    Process: p.Process,
+                }
+
+                data, err := scrapeFunc(sfn)(newp)
+                if err != nil {
+                    return M{"error": err.Error()}
+                }
+
+                return data
+            },
+        }
+
+        return fn(ctx), nil
+    }
+}
+
+type Document struct {
+    Selection *goquery.Selection
+}
+
+func NewDocument(sel *goquery.Selection) *Document {
+    return &Document{Selection: sel}
+}
+
+func (d *Document) Text() string {
+    return d.Selection.Text()
+}
+
+func (d *Document) Name() string {
+    if d.Selection.Length() > 0 {
+        return d.Selection.Get(0).Data
+    }
+    return ""
+}
+
+func (d *Document) Html() string {
+    h, _ := goquery.OuterHtml(d.Selection)
+    return h
+}
+
+func (d *Document) Attr(name string) string {
+    v, _ := d.Selection.Attr(name)
+    return v
+}
+
+func (d *Document) HasAttr(name string) bool {
+    _, ok := d.Selection.Attr(name)
+    return ok
+}
+
+func (d *Document) HasClass(className string) bool {
+    return d.Selection.HasClass(className)
+}
+
+func (d *Document) Length() int {
+    return d.Selection.Length()
+}
+
+func (d *Document) First() *Document {
+    return NewDocument(d.Selection.First())
+}
+
+func (d *Document) Last() *Document {
+    return NewDocument(d.Selection.Last())
+}
+
+func (d *Document) Get(index int) *Document {
+    return NewDocument(d.Selection.Eq(index))
+}
+
+func (d *Document) Find(s string) *Document {
+    return NewDocument(d.Selection.Find(s))
+}
+
+func (d *Document) Next() *Document {
+    return NewDocument(d.Selection.Next())
+}
+
+func (d *Document) NextAll() *Document {
+    return NewDocument(d.Selection.NextAll())
+}
+
+func (d *Document) NextUntil(s string) *Document {
+    return NewDocument(d.Selection.NextUntil(s))
+}
+
+func (d *Document) Prev() *Document {
+    return NewDocument(d.Selection.Prev())
+}
+
+func (d *Document) PrevAll() *Document {
+    return NewDocument(d.Selection.PrevAll())
+}
+
+func (d *Document) PrevUntil(s string) *Document {
+    return NewDocument(d.Selection.PrevUntil(s))
+}
+
+func (d *Document) Siblings() *Document {
+    return NewDocument(d.Selection.Siblings())
+}
+
+func (d *Document) Children() *Document {
+    return NewDocument(d.Selection.Children())
+}
+
+func (d *Document) Parent() *Document {
+    return NewDocument(d.Selection.Parent())
+}
+
+func (d *Document) Map(callback func(*Document, int) any) []any {
+    var vals []any
+    d.Selection.Map(func(i int, s *goquery.Selection) string {
+        vals = append(vals, callback(NewDocument(s), i))
+        return ""
+    })
+    return vals
+}
+
+func (d *Document) Filter(callback func(*Document, int) bool) []*Document {
+    var vals []*Document
+    d.Selection.Each(func(i int, s *goquery.Selection) {
+        el := NewDocument(s)
+        ok := callback(el, i)
+        if ok {
+            vals = append(vals, el)
+        }
+    })
+    return vals
+}
+
+func (d *Document) All() []*Document {
+    ss := make([]*Document, 0, d.Length())
+    d.Selection.Each(func(i int, s *goquery.Selection) {
+        ss = append(ss, NewDocument(s))
+    })
+    return ss
+}
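Since NewDocument only needs a goquery selection, the Document helpers can presumably also be exercised on their own, outside of Run. A small sketch of Map and Filter as defined above; the HTML literal is made up purely for illustration:

package main

import (
    "fmt"
    "strings"

    "github.com/PuerkitoBio/goquery"
    "github.com/philippta/flyscrape/flyscrape"
)

func main() {
    // Throwaway markup, only here to give the selectors something to match.
    html := `<ul><li class="hot">a</li><li>b</li><li class="hot">c</li></ul>`

    gq, err := goquery.NewDocumentFromReader(strings.NewReader(html))
    if err != nil {
        panic(err)
    }

    doc := flyscrape.NewDocument(gq.Selection)

    // Map visits every matched node and collects the callback results.
    texts := doc.Find("li").Map(func(d *flyscrape.Document, i int) any {
        return d.Text()
    })

    // Filter keeps only the nodes for which the callback returns true.
    hot := doc.Find("li").Filter(func(d *flyscrape.Document, i int) bool {
        return d.HasClass("hot")
    })

    fmt.Println(texts, len(hot)) // [a b c] 2
}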
diff --git a/flyscrape/flyscrape_test.go b/flyscrape/flyscrape_test.go
new file mode 100644
index 0000000..8518caa
--- /dev/null
+++ b/flyscrape/flyscrape_test.go
@@ -0,0 +1,45 @@
+package flyscrape_test
+
+import (
+    "testing"
+
+    "github.com/philippta/flyscrape/flyscrape"
+)
+
+func TestRun(t *testing.T) {
+    cfg := flyscrape.Config{
+        URL:      "https://news.ycombinator.com/",
+        Browser:  true,
+        Headless: false,
+        Output: flyscrape.ConfigOutput{
+            Format: "ndjson",
+        },
+    }
+
+    fn := func(ctx flyscrape.Context) any {
+        post := ctx.Doc.Find(".athing.submission").First()
+        title := post.Find(".titleline > a").Text()
+        commentsLink := post.Next().Find("a").Last().Attr("href")
+
+        comments := ctx.Scrape(commentsLink, func(ctx flyscrape.Context) any {
+            var comments []flyscrape.M
+
+            for _, comment := range ctx.Doc.Find(".comtr").All() {
+                comments = append(comments, flyscrape.M{
+                    "author": comment.Find(".hnuser").Text(),
+                    "text":   comment.Find(".commtext").Text(),
+                })
+
+            }
+
+            return comments
+        })
+
+        return flyscrape.M{
+            "title":    title,
+            "comments": comments,
+        }
+    }
+
+    flyscrape.Run(cfg, fn)
+}
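Worth noting: TestRun drives the full Run pipeline against the live news.ycombinator.com site with the browser module enabled and makes no assertions of its own, so it is more of a smoke test that presumably needs network access and a local browser to pass; something like go test -run TestRun ./flyscrape would exercise it.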
diff --git a/flyscrape/godbg.test b/flyscrape/godbg.test
new file mode 100755
index 0000000..c7bf68f
--- /dev/null
+++ b/flyscrape/godbg.test
Binary files differ