summary refs log tree commit diff
path: root/scrape
diff options
context:
space:
mode:
author Philipp Tanlak <philipp.tanlak@gmail.com> 2023-07-27 19:03:41 +0200
committer Philipp Tanlak <philipp.tanlak@gmail.com> 2023-07-27 19:03:41 +0200
commit a9b61f84070cc7ca0d6e26f187c745619a91422a (patch)
tree d69b67142b6de860d7da23bd5ff8c62af0aaca1e /scrape
init
Diffstat (limited to 'scrape')
-rw-r--r-- scrape/parser.go 70
-rw-r--r-- scrape/parser_test.go 83
-rw-r--r-- scrape/query.go 41
-rw-r--r-- scrape/query_test.go 23
4 files changed, 217 insertions, 0 deletions
diff --git a/scrape/parser.go b/scrape/parser.go
new file mode 100644
index 0000000..3304b77
--- /dev/null
+++ b/scrape/parser.go
@@ -0,0 +1,70 @@
+package scrape
+
+import (
+ "encoding/json"
+ "strings"
+
+ "github.com/PuerkitoBio/goquery"
+)
+
+// ParseFromJSON decodes input as a JSON object describing the fields to
+// extract and applies that description to html through Parse.
+//
+// The return type is any, so a decode error cannot be reported to the caller.
+// When input cannot be decoded into a JSON object, the field description is
+// treated as empty and the result is whatever Parse yields for no fields.
+func ParseFromJSON(html, input string) any {
+	var fields map[string]any
+	if err := json.Unmarshal([]byte(input), &fields); err != nil {
+		// Deliberate best-effort: fall back to an empty description rather
+		// than acting on anything the decoder may have left behind.
+		fields = nil
+	}
+	return Parse(html, fields)
+}
+
+// Parse turns html into a document via Doc and evaluates fields against its
+// root selection with queryMap. The result is a slice of maps when fields has
+// a string "#each" entry and a single map otherwise.
+func Parse(html string, fields map[string]any) any {
+	root := Doc(html)
+	return queryMap(root, fields)
+}
+
+// AddMeta stores value under key in a parse result.
+//
+// A map[string]any result receives the entry directly; a []map[string]any
+// result receives it in every row. A result of any other type is left
+// untouched.
+//
+// Nil maps are handled without panicking: a nil row is replaced by a new map
+// holding just the entry, while a nil map[string]any result is ignored because
+// a replacement map could not be handed back to the caller.
+func AddMeta(result any, key string, value any) {
+	switch res := result.(type) {
+	case []map[string]any:
+		for i, row := range res {
+			if row == nil {
+				// Writing to a nil map panics, so allocate the row in place.
+				// The slice shares its backing array with the caller's value.
+				res[i] = map[string]any{key: value}
+				continue
+			}
+			row[key] = value
+		}
+	case map[string]any:
+		if res != nil {
+			res[key] = value
+		}
+	}
+}
+
+func walk(s *goquery.Selection, fields map[string]any) map[string]any {
+ out := map[string]any{}
+ for k, v := range fields {
+ if strings.HasPrefix(k, "#") {
+ continue
+ }
+
+ switch val := v.(type) {
+ case string:
+ segs := strings.SplitN(k, "#", 2)
+ if len(segs) == 2 && segs[1] == "html" {
+ out[segs[0]] = QueryHTML(s, val)
+ } else if len(segs) == 2 {
+ out[segs[0]] = QueryAttr(s, val, segs[1])
+ } else {
+ out[k] = Query(s, val)
+ }
+
+ case map[string]any:
+ out[k] = queryMap(s, val)
+ }
+ }
+ return out
+}
+
+// queryMap resolves one level of a field description against s.
+//
+// With a string "#each" entry, the fields are walked once per element matching
+// that selector and the rows are returned as a []map[string]any. Otherwise,
+// with a string "#element" entry, the fields are walked against the selection
+// found by that selector. With neither, the fields are walked against s
+// itself. "#each" takes precedence when both are present.
+func queryMap(s *goquery.Selection, fields map[string]any) any {
+	each, hasEach := fields["#each"].(string)
+	element, hasElement := fields["#element"].(string)
+
+	switch {
+	case hasEach:
+		// Start from an empty, non-nil slice so that no matches yields an
+		// empty list rather than nil.
+		rows := []map[string]any{}
+		QueryFunc(s, each, func(match *goquery.Selection) {
+			rows = append(rows, walk(match, fields))
+		})
+		return rows
+	case hasElement:
+		return walk(s.Find(element), fields)
+	default:
+		return walk(s, fields)
+	}
+}
diff --git a/scrape/parser_test.go b/scrape/parser_test.go
new file mode 100644
index 0000000..4eb515d
--- /dev/null
+++ b/scrape/parser_test.go
@@ -0,0 +1,83 @@
+package scrape
+
+import (
+ "encoding/json"
+ "fmt"
+ "testing"
+
+ "github.com/stretchr/testify/require"
+)
+
+// TestParser runs a nested field description against the shared html fixture.
+// It covers plain selectors, an "#each" list containing a nested "#each", and
+// an "#element" scope.
+func TestParser(t *testing.T) {
+	o := ParseFromJSON(html, `{
+		"title": "head > title",
+		"headline": "body h1",
+		"sections": {
+			"#each": ".container",
+			"head": "h2",
+			"text": "p",
+			"inner": {
+				"#each": ".inner",
+				"headline": "h3"
+			},
+			"one": {
+				"#element": ".one",
+				"value": ".val"
+			}
+		}
+	}`)
+
+	// Dump the parsed structure as a debugging aid.
+	b, err := json.MarshalIndent(o, "", " ")
+	require.NoError(t, err)
+	fmt.Println(string(b))
+
+	want := map[string]any{
+		"title":    "Title",
+		"headline": "Headline",
+		"sections": []map[string]any{
+			{
+				"head": "Section 1",
+				"text": "Paragraph 1",
+				"inner": []map[string]any{
+					{"headline": "Inner H3"},
+					{"headline": "Inner H3 next"},
+				},
+				"one": map[string]any{"value": "One"},
+			},
+			{
+				"head": "Section 2",
+				"text": "Paragraph 2",
+				"inner": []map[string]any{
+					{"headline": "Inner H3 2"},
+					{"headline": "Inner H3 2 next"},
+				},
+				"one": map[string]any{"value": "Two"},
+			},
+		},
+	}
+	// testify expects (expected, actual).
+	require.Equal(t, want, o)
+}
+
+// TestParser2 checks a description whose top level is an "#each" list: the
+// result is one row per .container element of the shared html fixture.
+func TestParser2(t *testing.T) {
+	o := ParseFromJSON(html, `{
+		"#each": ".container",
+		"head": "h2",
+		"text": "p"
+	}`)
+
+	// Dump the parsed structure as a debugging aid.
+	b, err := json.MarshalIndent(o, "", " ")
+	require.NoError(t, err)
+	fmt.Println(string(b))
+
+	want := []map[string]any{
+		{"head": "Section 1", "text": "Paragraph 1"},
+		{"head": "Section 2", "text": "Paragraph 2"},
+	}
+	require.Equal(t, want, o)
+}
+
+// html is the fixture document passed to ParseFromJSON by TestParser and
+// TestParser2. It has a title, an h1, and two .container sections; each
+// section holds an h2, a p, a .one element wrapping a .val element, and two
+// .inner elements with an h3 each.
+var html = `
+<html>
+ <head>
+ <title>Title</title>
+ </head>
+ <body>
+ <h1>Headline</h1>
+ <div class="container">
+ <h2>Section 1</h2>
+ <p>
+ Paragraph 1
+ </p>
+ <div class="one">
+ <div class="val">One</div>
+ </div>
+ <div class="inner">
+ <h3>Inner H3</h3>
+ </div>
+ <div class="inner">
+ <h3>Inner H3 next</h3>
+ </div>
+ </div>
+ <div class="container">
+ <h2>Section 2</h2>
+ <p>
+ Paragraph 2
+ </p>
+ <div class="one"><div class="val">Two</div></div>
+ <div class="inner">
+ <h3>Inner H3 2</h3>
+ </div>
+ <div class="inner">
+ <h3>Inner H3 2 next</h3>
+ </div>
+ </div>
+ </body>
+</html>
+`
diff --git a/scrape/query.go b/scrape/query.go
new file mode 100644
index 0000000..1fe5ea4
--- /dev/null
+++ b/scrape/query.go
@@ -0,0 +1,41 @@
+package scrape
+
+import (
+ "strings"
+
+ "github.com/PuerkitoBio/goquery"
+)
+
+// emptyDoc is the document whose selection Doc returns when its input cannot
+// be parsed. It is built once from an empty reader; the error returned by the
+// constructor is discarded.
+var emptyDoc, _ = goquery.NewDocumentFromReader(strings.NewReader(""))
+
+// Doc parses html and returns the selection of the resulting document. If
+// parsing fails, the selection of the package-level emptyDoc is returned
+// instead; the error itself is not reported.
+func Doc(html string) *goquery.Selection {
+	parsed, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+	if err == nil {
+		return parsed.Selection
+	}
+	return emptyDoc.Selection
+}
+
+// Query returns the text of the first element under s that matches selector,
+// with surrounding whitespace trimmed.
+func Query(s *goquery.Selection, selector string) string {
+	first := s.Find(selector).First()
+	return strings.TrimSpace(first.Text())
+}
+
+// QueryAttr returns the value of attribute attr on the first element under s
+// that matches selector, with surrounding whitespace trimmed. An empty string
+// is used as the fallback when the attribute is not available.
+func QueryAttr(s *goquery.Selection, selector, attr string) string {
+	first := s.Find(selector).First()
+	return strings.TrimSpace(first.AttrOr(attr, ""))
+}
+
+// QueryHTML returns the outer HTML that goquery.OuterHtml renders for the
+// selection under s matching selector, with surrounding whitespace trimmed.
+// If rendering reports an error, the result is an empty string.
+func QueryHTML(s *goquery.Selection, selector string) string {
+	markup, err := goquery.OuterHtml(s.Find(selector))
+	if err == nil {
+		return strings.TrimSpace(markup)
+	}
+	return ""
+}
+
+// QueryFunc calls f once for each element under s that matches selector,
+// passing the selection for that element. The element index supplied by
+// goquery's Each is dropped.
+func QueryFunc(s *goquery.Selection, selector string, f func(*goquery.Selection)) {
+	matches := s.Find(selector)
+	matches.Each(func(_ int, match *goquery.Selection) {
+		f(match)
+	})
+}
diff --git a/scrape/query_test.go b/scrape/query_test.go
new file mode 100644
index 0000000..0e8423d
--- /dev/null
+++ b/scrape/query_test.go
@@ -0,0 +1,23 @@
+package scrape
+
+import (
+ "testing"
+
+ "github.com/stretchr/testify/require"
+)
+
+// TestQuery checks that Query returns the trimmed text of the first element
+// matching a selector, and an empty string when nothing matches.
+func TestQuery(t *testing.T) {
+	// Named page rather than html so the package-level html fixture in
+	// parser_test.go is not shadowed.
+	page := `<html>
+<body>
+	<h1 id="title">Page Title</h1>
+	<div id="posts">
+		<div class="post">First post</div>
+		<div class="post">Second post</div>
+		<div class="post">Third post</div>
+	</div>
+</body>
+</html>`
+
+	doc := Doc(page)
+
+	// testify expects (expected, actual).
+	require.Equal(t, "Page Title", Query(doc, "#title"))
+	// With several matches only the first one is used.
+	require.Equal(t, "First post", Query(doc, ".post"))
+	// No match yields an empty string.
+	require.Equal(t, "", Query(doc, ".missing"))
+}