diff options
Diffstat (limited to 'scrape')
| -rw-r--r-- | scrape/parser.go | 104 | ||||
| -rw-r--r-- | scrape/parser_test.go | 125 | ||||
| -rw-r--r-- | scrape/query.go | 57 | ||||
| -rw-r--r-- | scrape/query_test.go | 33 |
4 files changed, 319 insertions, 0 deletions
diff --git a/scrape/parser.go b/scrape/parser.go
new file mode 100644
index 0000000..3304b77
--- /dev/null
+++ b/scrape/parser.go
@@ -0,0 +1,104 @@
+// Package scrape extracts structured data from HTML documents using
+// declarative maps of CSS selectors.
+package scrape
+
+import (
+	"encoding/json"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// ParseFromJSON decodes input as a JSON object describing the fields to
+// extract and applies it to html via Parse.
+//
+// The signature has no error return, so input that is not a valid JSON object
+// is treated as an empty field description and yields an empty result map.
+func ParseFromJSON(html, input string) any {
+	var fields map[string]any
+	if err := json.Unmarshal([]byte(input), &fields); err != nil {
+		// Make the documented fallback explicit instead of silently
+		// dropping the error.
+		fields = nil
+	}
+	return Parse(html, fields)
+}
+
+// Parse applies fields to html and returns the extracted data.
+//
+// Each entry of fields maps an output key to either a CSS selector string or
+// a nested field map. The result is a map[string]any, or a []map[string]any
+// when fields contains an "#each" selector.
+func Parse(html string, fields map[string]any) any {
+	return queryMap(Doc(html), fields)
+}
+
+// AddMeta sets key to value on a result returned by Parse: on the map itself,
+// or on every row when the result is a slice of maps. Results of any other
+// type are left untouched.
+func AddMeta(result any, key string, value any) {
+	switch res := result.(type) {
+	case []map[string]any:
+		for i := range res {
+			res[i][key] = value
+		}
+	case map[string]any:
+		res[key] = value
+	}
+}
+
+// walk evaluates every entry of fields against s and collects the results.
+//
+// Keys starting with "#" are directives for queryMap and are skipped. For
+// string values the key decides what is extracted from the first match:
+//
+//	"name"       trimmed text
+//	"name#html"  trimmed outer HTML
+//	"name#attr"  trimmed value of attribute attr
+//
+// Nested maps are evaluated recursively; values of other types are ignored.
+func walk(s *goquery.Selection, fields map[string]any) map[string]any {
+	out := map[string]any{}
+	for k, v := range fields {
+		if strings.HasPrefix(k, "#") {
+			continue
+		}
+
+		switch val := v.(type) {
+		case string:
+			name, attr, found := strings.Cut(k, "#")
+			switch {
+			case !found:
+				out[k] = Query(s, val)
+			case attr == "html":
+				out[name] = QueryHTML(s, val)
+			default:
+				out[name] = QueryAttr(s, val, attr)
+			}
+		case map[string]any:
+			out[k] = queryMap(s, val)
+		}
+	}
+	return out
+}
+
+// queryMap evaluates fields against s, honouring the "#each" and "#element"
+// directives: "#each" yields one row per element matching its selector, while
+// "#element" scopes the lookup to the elements matching its selector. "#each"
+// wins when both are present.
+func queryMap(s *goquery.Selection, fields map[string]any) any {
+	if sel, ok := fields["#each"].(string); ok {
+		// Non-nil so that zero matches encode as [] in JSON rather than null.
+		rows := []map[string]any{}
+		QueryFunc(s, sel, func(row *goquery.Selection) {
+			rows = append(rows, walk(row, fields))
+		})
+		return rows
+	}
+
+	if sel, ok := fields["#element"].(string); ok {
+		return walk(s.Find(sel), fields)
+	}
+
+	return walk(s, fields)
+}
diff --git a/scrape/parser_test.go b/scrape/parser_test.go
new file mode 100644
index 0000000..4eb515d
--- /dev/null
+++ b/scrape/parser_test.go
@@ -0,0 +1,125 @@
+package scrape
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestParser(t *testing.T) {
+	got := ParseFromJSON(html, `{
+		"title": "head > title",
+		"headline": "body h1",
+		"sections": {
+			"#each": ".container",
+			"head": "h2",
+			"text": "p",
+			"inner": {
+				"#each": ".inner",
+				"headline": "h3"
+			},
+			"one": {
+				"#element": ".one",
+				"value": ".val"
+			}
+		}
+	}`)
+
+	want := map[string]any{
+		"title":    "Title",
+		"headline": "Headline",
+		"sections": []map[string]any{
+			{
+				"head": "Section 1",
+				"text": "Paragraph 1",
+				"inner": []map[string]any{
+					{"headline": "Inner H3"},
+					{"headline": "Inner H3 next"},
+				},
+				"one": map[string]any{"value": "One"},
+			},
+			{
+				"head": "Section 2",
+				"text": "Paragraph 2",
+				"inner": []map[string]any{
+					{"headline": "Inner H3 2"},
+					{"headline": "Inner H3 2 next"},
+				},
+				"one": map[string]any{"value": "Two"},
+			},
+		},
+	}
+	require.Equal(t, want, got)
+}
+
+func TestParser2(t *testing.T) {
+	got := ParseFromJSON(html, `{
+		"#each": ".container",
+		"head": "h2",
+		"text": "p"
+	}`)
+
+	want := []map[string]any{
+		{"head": "Section 1", "text": "Paragraph 1"},
+		{"head": "Section 2", "text": "Paragraph 2"},
+	}
+	require.Equal(t, want, got)
+}
+
+func TestParserAttrAndHTML(t *testing.T) {
+	got := ParseFromJSON(html, `{
+		"kind#class": ".inner",
+		"first#html": ".val"
+	}`)
+
+	want := map[string]any{
+		"kind":  "inner",
+		"first": `<div class="val">One</div>`,
+	}
+	require.Equal(t, want, got)
+}
+
+func TestParseFromJSONInvalidInput(t *testing.T) {
+	// Input that is not a JSON object falls back to an empty description.
+	require.Equal(t, map[string]any{}, ParseFromJSON(html, `not json`))
+}
+
+// html is the document shared by the parser tests.
+var html = `
+<html>
+  <head>
+    <title>Title</title>
+  </head>
+  <body>
+    <h1>Headline</h1>
+    <div class="container">
+      <h2>Section 1</h2>
+      <p>
+        Paragraph 1
+      </p>
+      <div class="one">
+        <div class="val">One</div>
+      </div>
+      <div class="inner">
+        <h3>Inner H3</h3>
+      </div>
+      <div class="inner">
+        <h3>Inner H3 next</h3>
+      </div>
+    </div>
+    <div class="container">
+      <h2>Section 2</h2>
+      <p>
+        Paragraph 2
+      </p>
+      <div class="one"><div class="val">Two</div></div>
+      <div class="inner">
+        <h3>Inner H3 2</h3>
+      </div>
+      <div class="inner">
+        <h3>Inner H3 2 next</h3>
+      </div>
+    </div>
+  </body>
+</html>
+`
diff --git a/scrape/query.go b/scrape/query.go
new file mode 100644
index 0000000..1fe5ea4
--- /dev/null
+++ b/scrape/query.go
@@ -0,0 +1,57 @@
+package scrape
+
+import (
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// emptyDoc is the fallback handed out by Doc when its input cannot be parsed.
+// The error is discarded on purpose: the reader is empty, so there is no read
+// failure to report.
+var emptyDoc, _ = goquery.NewDocumentFromReader(strings.NewReader(""))
+
+// Doc parses html and returns the root selection of the resulting document.
+// If parsing fails, the selection of an empty document is returned instead so
+// that callers can always query the result.
+func Doc(html string) *goquery.Selection {
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+	if err != nil {
+		return emptyDoc.Selection
+	}
+	return doc.Selection
+}
+
+// Query returns the whitespace-trimmed text of the first element under s that
+// matches selector, or "" when nothing matches.
+func Query(s *goquery.Selection, selector string) string {
+	return strings.TrimSpace(s.Find(selector).First().Text())
+}
+
+// QueryAttr returns the whitespace-trimmed value of attribute attr on the
+// first element under s that matches selector, or "" when nothing matches or
+// the attribute is absent.
+func QueryAttr(s *goquery.Selection, selector, attr string) string {
+	return strings.TrimSpace(s.Find(selector).First().AttrOr(attr, ""))
+}
+
+// QueryHTML returns the whitespace-trimmed outer HTML of the first element
+// under s that matches selector, or "" when nothing matches or rendering
+// fails.
+func QueryHTML(s *goquery.Selection, selector string) string {
+	// First() mirrors Query and QueryAttr and makes the single-element
+	// contract explicit instead of relying on OuterHtml's own behaviour.
+	val, err := goquery.OuterHtml(s.Find(selector).First())
+	if err != nil {
+		return ""
+	}
+	return strings.TrimSpace(val)
+}
+
+// QueryFunc calls f once for every element under s that matches selector, in
+// the order Find returns them.
+func QueryFunc(s *goquery.Selection, selector string, f func(*goquery.Selection)) {
+	s.Find(selector).Each(func(_ int, match *goquery.Selection) {
+		f(match)
+	})
+}
diff --git a/scrape/query_test.go b/scrape/query_test.go
new file mode 100644
index 0000000..0e8423d
--- /dev/null
+++ b/scrape/query_test.go
@@ -0,0 +1,33 @@
+package scrape
+
+import (
+	"testing"
+
+	"github.com/PuerkitoBio/goquery"
+	"github.com/stretchr/testify/require"
+)
+
+func TestQuery(t *testing.T) {
+	html := `<html>
+<body>
+	<h1 id="title">Page Title</h1>
+	<div id="posts">
+		<div class="post">First post</div>
+		<div class="post">Second post</div>
+		<div class="post">Third post</div>
+	</div>
+</body>
+</html>`
+	doc := Doc(html)
+
+	require.Equal(t, "Page Title", Query(doc, "#title"))
+	require.Equal(t, "", Query(doc, ".missing"))
+	require.Equal(t, "title", QueryAttr(doc, "h1", "id"))
+	require.Equal(t, `<h1 id="title">Page Title</h1>`, QueryHTML(doc, "#title"))
+
+	var posts []string
+	QueryFunc(doc, ".post", func(s *goquery.Selection) {
+		posts = append(posts, s.Text())
+	})
+	require.Equal(t, []string{"First post", "Second post", "Third post"}, posts)
+}