summaryrefslogtreecommitdiff
path: root/scrape
diff options
context:
space:
mode:
Diffstat (limited to 'scrape')
-rw-r--r--scrape/parser.go70
-rw-r--r--scrape/parser_test.go83
-rw-r--r--scrape/query.go41
-rw-r--r--scrape/query_test.go23
4 files changed, 0 insertions, 217 deletions
diff --git a/scrape/parser.go b/scrape/parser.go
deleted file mode 100644
index 3304b77..0000000
--- a/scrape/parser.go
+++ /dev/null
@@ -1,70 +0,0 @@
-package scrape
-
-import (
- "encoding/json"
- "strings"
-
- "github.com/PuerkitoBio/goquery"
-)
-
-func ParseFromJSON(html, input string) any {
- var inputJSON map[string]any
- json.Unmarshal([]byte(input), &inputJSON)
- return Parse(html, inputJSON)
-}
-
-func Parse(html string, fields map[string]any) any {
- return queryMap(Doc(html), fields)
-}
-
-func AddMeta(result any, key string, value any) {
- switch res := result.(type) {
- case []map[string]any:
- for i := range res {
- res[i][key] = value
- }
- case map[string]any:
- res[key] = value
- }
-}
-
-func walk(s *goquery.Selection, fields map[string]any) map[string]any {
- out := map[string]any{}
- for k, v := range fields {
- if strings.HasPrefix(k, "#") {
- continue
- }
-
- switch val := v.(type) {
- case string:
- segs := strings.SplitN(k, "#", 2)
- if len(segs) == 2 && segs[1] == "html" {
- out[segs[0]] = QueryHTML(s, val)
- } else if len(segs) == 2 {
- out[segs[0]] = QueryAttr(s, val, segs[1])
- } else {
- out[k] = Query(s, val)
- }
-
- case map[string]any:
- out[k] = queryMap(s, val)
- }
- }
- return out
-}
-
-func queryMap(s *goquery.Selection, fields map[string]any) any {
- if sel, ok := fields["#each"].(string); ok {
- rows := []map[string]any{}
- QueryFunc(s, sel, func(s *goquery.Selection) {
- rows = append(rows, walk(s, fields))
- })
- return rows
- }
-
- if sel, ok := fields["#element"].(string); ok {
- return walk(s.Find(sel), fields)
- }
-
- return walk(s, fields)
-}
diff --git a/scrape/parser_test.go b/scrape/parser_test.go
deleted file mode 100644
index 4eb515d..0000000
--- a/scrape/parser_test.go
+++ /dev/null
@@ -1,83 +0,0 @@
-package scrape
-
-import (
- "encoding/json"
- "fmt"
- "testing"
-
- "github.com/stretchr/testify/require"
-)
-
-func TestParser(t *testing.T) {
- o := ParseFromJSON(html, `{
- "title": "head > title",
- "headline": "body h1",
- "sections": {
- "#each": ".container",
- "head": "h2",
- "text": "p",
- "inner": {
- "#each": ".inner",
- "headline": "h3"
- },
- "one": {
- "#element": ".one",
- "value": ".val"
- }
- }
- }`)
- require.Equal(t, o, nil)
-
- b, _ := json.MarshalIndent(o, "", " ")
- fmt.Println(string(b))
-}
-
-func TestParser2(t *testing.T) {
- o := ParseFromJSON(html, `{
- "#each": ".container",
- "head": "h2",
- "text": "p"
- }`)
-
- b, _ := json.MarshalIndent(o, "", " ")
- fmt.Println(string(b))
-}
-
-var html = `
-<html>
- <head>
- <title>Title</title>
- </head>
- <body>
- <h1>Headline</h1>
- <div class="container">
- <h2>Section 1</h2>
- <p>
- Paragraph 1
- </p>
- <div class="one">
- <div class="val">One</div>
- </div>
- <div class="inner">
- <h3>Inner H3</h3>
- </div>
- <div class="inner">
- <h3>Inner H3 next</h3>
- </div>
- </div>
- <div class="container">
- <h2>Section 2</h2>
- <p>
- Paragraph 2
- </p>
- <div class="one"><div class="val">Two</div></div>
- <div class="inner">
- <h3>Inner H3 2</h3>
- </div>
- <div class="inner">
- <h3>Inner H3 2 next</h3>
- </div>
- </div>
- </body>
-</html>
-`
diff --git a/scrape/query.go b/scrape/query.go
deleted file mode 100644
index 1fe5ea4..0000000
--- a/scrape/query.go
+++ /dev/null
@@ -1,41 +0,0 @@
-package scrape
-
-import (
- "strings"
-
- "github.com/PuerkitoBio/goquery"
-)
-
-var emptyDoc, _ = goquery.NewDocumentFromReader(strings.NewReader(""))
-
-func Doc(html string) *goquery.Selection {
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
- if err != nil {
- return emptyDoc.Selection
- }
- return doc.Selection
-}
-
-func Query(s *goquery.Selection, selector string) string {
- val := s.Find(selector).First().Text()
- return strings.TrimSpace(val)
-}
-
-func QueryAttr(s *goquery.Selection, selector, attr string) string {
- val := s.Find(selector).First().AttrOr(attr, "")
- return strings.TrimSpace(val)
-}
-
-func QueryHTML(s *goquery.Selection, selector string) string {
- val, err := goquery.OuterHtml(s.Find(selector))
- if err != nil {
- return ""
- }
- return strings.TrimSpace(val)
-}
-
-func QueryFunc(s *goquery.Selection, selector string, f func(*goquery.Selection)) {
- s.Find(selector).Each(func(i int, s *goquery.Selection) {
- f(s)
- })
-}
diff --git a/scrape/query_test.go b/scrape/query_test.go
deleted file mode 100644
index 0e8423d..0000000
--- a/scrape/query_test.go
+++ /dev/null
@@ -1,23 +0,0 @@
-package scrape
-
-import (
- "testing"
-
- "github.com/stretchr/testify/require"
-)
-
-func TestQuery(t *testing.T) {
- html := `<html>
-<body>
- <h1 id="title">Page Title</h1>
- <div id="posts">
- <div class="post">First post</div>
- <div class="post">Second post</div>
- <div class="post">Third post</div>
- </div>
-</body>
-</html>`
-
- title := Query(Doc(html), "#title")
- require.Equal(t, title, "Page Title")
-}