From b15b921f633730cb88169deb787bfe893cd40aae Mon Sep 17 00:00:00 2001
From: Philipp Tanlak
Date: Thu, 10 Apr 2025 12:53:29 +0200
Subject: Support library usage

---
 examples/flyscrapelib.go    |  41 ++++++++
 flyscrape/flyscrape.go      | 247 ++++++++++++++++++++++++++++++++++++++++++++
 flyscrape/flyscrape_test.go |  45 ++++++++
 3 files changed, 333 insertions(+)
 create mode 100644 examples/flyscrapelib.go
 create mode 100644 flyscrape/flyscrape.go
 create mode 100644 flyscrape/flyscrape_test.go

diff --git a/examples/flyscrapelib.go b/examples/flyscrapelib.go
new file mode 100644
index 0000000..641e7ac
--- /dev/null
+++ b/examples/flyscrapelib.go
@@ -0,0 +1,41 @@
+package main
+
+import "github.com/philippta/flyscrape/flyscrape"
+
+var config = flyscrape.Config{
+	URL:      "https://news.ycombinator.com/",
+	Browser:  true,
+	Headless: false,
+}
+
+func scrape(ctx flyscrape.Context) any {
+	var (
+		post         = ctx.Doc.Find(".athing.submission").First()
+		title        = post.Find(".titleline > a").Text()
+		commentsLink = post.Next().Find("a").Last().Attr("href")
+		comments     = ctx.Scrape(commentsLink, scrapeComments)
+	)
+
+	return flyscrape.M{
+		"title":    title,
+		"comments": comments,
+	}
+}
+
+func scrapeComments(ctx flyscrape.Context) any {
+	var comments []flyscrape.M
+
+	for _, comment := range ctx.Doc.Find(".comtr").All() {
+		comments = append(comments, flyscrape.M{
+			"author": comment.Find(".hnuser").Text(),
+			"text":   comment.Find(".commtext").Text(),
+		})
+
+	}
+
+	return comments
+}
+
+func main() {
+	flyscrape.Run(config, scrape)
+}
diff --git a/flyscrape/flyscrape.go b/flyscrape/flyscrape.go
new file mode 100644
index 0000000..5cb068d
--- /dev/null
+++ b/flyscrape/flyscrape.go
@@ -0,0 +1,247 @@
+package flyscrape
+
+import (
+	"encoding/json"
+	"fmt"
+	"net/url"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+	"github.com/philippta/flyscrape"
+
+	_ "github.com/philippta/flyscrape/modules/browser"
+	_ "github.com/philippta/flyscrape/modules/cache"
+	_ "github.com/philippta/flyscrape/modules/cookies"
+	_ "github.com/philippta/flyscrape/modules/depth"
+	_ "github.com/philippta/flyscrape/modules/domainfilter"
+	_ "github.com/philippta/flyscrape/modules/followlinks"
+	_ "github.com/philippta/flyscrape/modules/headers"
+	_ "github.com/philippta/flyscrape/modules/output/json"
+	_ "github.com/philippta/flyscrape/modules/output/ndjson"
+	_ "github.com/philippta/flyscrape/modules/proxy"
+	_ "github.com/philippta/flyscrape/modules/ratelimit"
+	_ "github.com/philippta/flyscrape/modules/retry"
+	_ "github.com/philippta/flyscrape/modules/starturl"
+	_ "github.com/philippta/flyscrape/modules/urlfilter"
+)
+
+type M map[string]any
+
+type ConfigOutput struct {
+	Format string `json:"format"`
+	File   string `json:"file"`
+}
+
+type Config struct {
+	URL            string            `json:"url"`
+	URLs           []string          `json:"urls"`
+	Browser        bool              `json:"browser"`
+	Headless       bool              `json:"headless"`
+	Depth          int               `json:"depth"`
+	Follow         []string          `json:"follow"`
+	AllowedDomains []string          `json:"allowedDomains"`
+	BlockedDomains []string          `json:"blockedDomains"`
+	AllowedURLs    []string          `json:"allowedURLs"`
+	BlockedURLs    []string          `json:"blockedURLs"`
+	Rate           int               `json:"rate"`
+	Concurrency    int               `json:"concurrency"`
+	Proxy          string            `json:"proxy"`
+	Proxies        []string          `json:"proxies"`
+	Cache          string            `json:"cache"`
+	Cookies        string            `json:"cookies"`
+	Headers        map[string]string `json:"headers"`
+	Output         ConfigOutput      `json:"output"`
+}
+
+type Context struct {
+	URL         string
+	Doc         *Document
+	AbsoluteURL func(url string) string
+	Scrape      func(url string, fn ScrapeFunc) any
+}
+
+type ScrapeFunc func(ctx Context) any
+
+func Run(cfg Config, fn ScrapeFunc) {
+	jsoncfg, _ := json.Marshal(cfg)
+
+	scraper := flyscrape.NewScraper()
+	scraper.ScrapeFunc = scrapeFunc(fn)
+	scraper.Script = "main.go"
+	scraper.Modules = flyscrape.LoadModules(jsoncfg)
+	scraper.Run()
+}
+
+func scrapeFunc(fn ScrapeFunc) flyscrape.ScrapeFunc {
+	return func(p flyscrape.ScrapeParams) (any, error) {
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(p.HTML))
+		if err != nil {
+			return nil, fmt.Errorf("new document from reader: %w", err)
+		}
+
+		baseurl, err := url.Parse(p.URL)
+		if err != nil {
+			return nil, fmt.Errorf("parse url: %w", err)
+		}
+
+		absoluteURL := func(ref string) string {
+			abs, err := baseurl.Parse(ref)
+			if err != nil {
+				return ref
+			}
+			return abs.String()
+		}
+
+		ctx := Context{
+			URL:         p.URL,
+			Doc:         NewDocument(doc.Selection),
+			AbsoluteURL: absoluteURL,
+			Scrape: func(url string, sfn ScrapeFunc) any {
+				url = absoluteURL(url)
+
+				html, err := p.Process(url)
+				if err != nil {
+					return M{"error": err.Error()}
+				}
+
+				newp := flyscrape.ScrapeParams{
+					HTML:    string(html),
+					URL:     url,
+					Process: p.Process,
+				}
+
+				data, err := scrapeFunc(sfn)(newp)
+				if err != nil {
+					return M{"error": err.Error()}
+				}
+
+				return data
+			},
+		}
+
+		return fn(ctx), nil
+	}
+}
+
+type Document struct {
+	Selection *goquery.Selection
+}
+
+func NewDocument(sel *goquery.Selection) *Document {
+	return &Document{Selection: sel}
+}
+
+func (d *Document) Text() string {
+	return d.Selection.Text()
+}
+
+func (d *Document) Name() string {
+	if d.Selection.Length() > 0 {
+		return d.Selection.Get(0).Data
+	}
+	return ""
+}
+
+func (d *Document) Html() string {
+	h, _ := goquery.OuterHtml(d.Selection)
+	return h
+}
+
+func (d *Document) Attr(name string) string {
+	v, _ := d.Selection.Attr(name)
+	return v
+}
+
+func (d *Document) HasAttr(name string) bool {
+	_, ok := d.Selection.Attr(name)
+	return ok
+}
+
+func (d *Document) HasClass(className string) bool {
+	return d.Selection.HasClass(className)
+}
+
+func (d *Document) Length() int {
+	return d.Selection.Length()
+}
+
+func (d *Document) First() *Document {
+	return NewDocument(d.Selection.First())
+}
+
+func (d *Document) Last() *Document {
+	return NewDocument(d.Selection.Last())
+}
+
+func (d *Document) Get(index int) *Document {
+	return NewDocument(d.Selection.Eq(index))
+}
+
+func (d *Document) Find(s string) *Document {
+	return NewDocument(d.Selection.Find(s))
+}
+
+func (d *Document) Next() *Document {
+	return NewDocument(d.Selection.Next())
+}
+
+func (d *Document) NextAll() *Document {
+	return NewDocument(d.Selection.NextAll())
+}
+
+func (d *Document) NextUntil(s string) *Document {
+	return NewDocument(d.Selection.NextUntil(s))
+}
+
+func (d *Document) Prev() *Document {
+	return NewDocument(d.Selection.Prev())
+}
+
+func (d *Document) PrevAll() *Document {
+	return NewDocument(d.Selection.PrevAll())
+}
+
+func (d *Document) PrevUntil(s string) *Document {
+	return NewDocument(d.Selection.PrevUntil(s))
+}
+
+func (d *Document) Siblings() *Document {
+	return NewDocument(d.Selection.Siblings())
+}
+
+func (d *Document) Children() *Document {
+	return NewDocument(d.Selection.Children())
+}
+
+func (d *Document) Parent() *Document {
+	return NewDocument(d.Selection.Parent())
+}
+
+func (d *Document) Map(callback func(*Document, int) any) []any {
+	var vals []any
+	d.Selection.Map(func(i int, s *goquery.Selection) string {
+		vals = append(vals, callback(NewDocument(s), i))
+		return ""
+	})
+	return vals
+}
+
+func (d *Document) Filter(callback func(*Document, int) bool) []*Document {
+	var vals []*Document
+	d.Selection.Each(func(i int, s *goquery.Selection) {
+		el := NewDocument(s)
+		ok := callback(el, i)
+		if ok {
+			vals = append(vals, el)
+		}
+	})
+	return vals
+}
+
+func (d *Document) All() []*Document {
+	ss := make([]*Document, 0, d.Length())
+	d.Selection.Each(func(i int, s *goquery.Selection) {
+		ss = append(ss, NewDocument(s))
+	})
+	return ss
+}
diff --git a/flyscrape/flyscrape_test.go b/flyscrape/flyscrape_test.go
new file mode 100644
index 0000000..8518caa
--- /dev/null
+++ b/flyscrape/flyscrape_test.go
@@ -0,0 +1,45 @@
+package flyscrape_test
+
+import (
+	"testing"
+
+	"github.com/philippta/flyscrape/flyscrape"
+)
+
+func TestRun(t *testing.T) {
+	cfg := flyscrape.Config{
+		URL:      "https://news.ycombinator.com/",
+		Browser:  true,
+		Headless: false,
+		Output: flyscrape.ConfigOutput{
+			Format: "ndjson",
+		},
+	}
+
+	fn := func(ctx flyscrape.Context) any {
+		post := ctx.Doc.Find(".athing.submission").First()
+		title := post.Find(".titleline > a").Text()
+		commentsLink := post.Next().Find("a").Last().Attr("href")
+
+		comments := ctx.Scrape(commentsLink, func(ctx flyscrape.Context) any {
+			var comments []flyscrape.M
+
+			for _, comment := range ctx.Doc.Find(".comtr").All() {
+				comments = append(comments, flyscrape.M{
+					"author": comment.Find(".hnuser").Text(),
+					"text":   comment.Find(".commtext").Text(),
+				})
+
+			}
+
+			return comments
+		})
+
+		return flyscrape.M{
+			"title":    title,
+			"comments": comments,
+		}
+	}
+
+	flyscrape.Run(cfg, fn)
+}
-- 
cgit v1.2.3
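
For reference, below is a minimal usage sketch of the library API added by this patch, separate from the bundled example and test. It exercises the Document.Map helper and the file-based output config, which the included examples do not cover; the target URL, CSS selectors, and output filename are illustrative placeholders and are not part of the patch.

package main

import "github.com/philippta/flyscrape/flyscrape"

// Illustrative config: write NDJSON records to a local file (filename is a placeholder).
var config = flyscrape.Config{
	URL: "https://news.ycombinator.com/",
	Output: flyscrape.ConfigOutput{
		Format: "ndjson",
		File:   "posts.ndjson",
	},
}

// scrape maps every submission row to a small record, resolving relative
// links against the page URL via ctx.AbsoluteURL.
func scrape(ctx flyscrape.Context) any {
	return ctx.Doc.Find(".athing.submission").Map(func(post *flyscrape.Document, _ int) any {
		link := post.Find(".titleline > a")
		return flyscrape.M{
			"title": link.Text(),
			"url":   ctx.AbsoluteURL(link.Attr("href")),
		}
	})
}

func main() {
	flyscrape.Run(config, scrape)
}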