summaryrefslogtreecommitdiff
path: root/scrape/parser.go
diff options
context:
space:
mode:
Diffstat (limited to 'scrape/parser.go')
-rw-r--r--scrape/parser.go70
1 files changed, 70 insertions, 0 deletions
diff --git a/scrape/parser.go b/scrape/parser.go
new file mode 100644
index 0000000..3304b77
--- /dev/null
+++ b/scrape/parser.go
@@ -0,0 +1,70 @@
+package scrape
+
+import (
+ "encoding/json"
+ "strings"
+
+ "github.com/PuerkitoBio/goquery"
+)
+
+// ParseFromJSON decodes input as a JSON object describing the fields to
+// extract, then runs Parse on html with the decoded field map.
+//
+// The function has no error return, so a decoding failure is not reported to
+// the caller; Parse is still invoked with whatever the decoder left in the
+// field map.
+func ParseFromJSON(html, input string) any {
+	var fields map[string]any
+	// The decode error is deliberately discarded: this signature offers no way
+	// to surface it.
+	_ = json.Unmarshal([]byte(input), &fields)
+	return Parse(html, fields)
+}
+
+// Parse turns html into a queryable root via Doc and resolves fields against
+// it with queryMap. The result is whatever queryMap returns for fields: a
+// single map[string]any, or a []map[string]any when fields carries a string
+// "#each" entry.
+func Parse(html string, fields map[string]any) any {
+	root := Doc(html)
+	return queryMap(root, fields)
+}
+
+// AddMeta sets key to value on a parse result, modifying it in place.
+//
+// result is expected to have one of the shapes queryMap returns: a single
+// map[string]any, or a []map[string]any, in which case every row receives the
+// entry. A result of any other type is left untouched.
+//
+// Nil maps never cause a panic: a nil row inside a slice is replaced by a
+// fresh map holding the entry, while a nil top-level map is skipped because a
+// replacement map could not be observed by the caller.
+func AddMeta(result any, key string, value any) {
+	switch res := result.(type) {
+	case []map[string]any:
+		for i := range res {
+			if res[i] == nil {
+				// Writing to a nil map panics; the slice element is
+				// addressable, so install a new map instead.
+				res[i] = map[string]any{key: value}
+				continue
+			}
+			res[i][key] = value
+		}
+	case map[string]any:
+		// Guard against a typed nil map, which would panic on assignment.
+		if res != nil {
+			res[key] = value
+		}
+	}
+}
+
+func walk(s *goquery.Selection, fields map[string]any) map[string]any {
+ out := map[string]any{}
+ for k, v := range fields {
+ if strings.HasPrefix(k, "#") {
+ continue
+ }
+
+ switch val := v.(type) {
+ case string:
+ segs := strings.SplitN(k, "#", 2)
+ if len(segs) == 2 && segs[1] == "html" {
+ out[segs[0]] = QueryHTML(s, val)
+ } else if len(segs) == 2 {
+ out[segs[0]] = QueryAttr(s, val, segs[1])
+ } else {
+ out[k] = Query(s, val)
+ }
+
+ case map[string]any:
+ out[k] = queryMap(s, val)
+ }
+ }
+ return out
+}
+
+// queryMap resolves fields against s, honoring the two directive keys:
+//   - "#each" (string): QueryFunc is called with s and that selector, and each
+//     selection handed to the callback is walked into its own row; the rows
+//     are returned as a []map[string]any. This directive takes precedence
+//     over "#element".
+//   - "#element" (string): s is narrowed with Find before a single walk.
+//
+// Without either directive, fields is walked directly against s. A directive
+// whose value is not a string is treated as absent.
+func queryMap(s *goquery.Selection, fields map[string]any) any {
+	eachSel, hasEach := fields["#each"].(string)
+	elemSel, hasElem := fields["#element"].(string)
+
+	switch {
+	case hasEach:
+		// Start from an empty, non-nil slice so the result is never a nil
+		// slice, even if the callback is never invoked.
+		rows := make([]map[string]any, 0)
+		QueryFunc(s, eachSel, func(item *goquery.Selection) {
+			rows = append(rows, walk(item, fields))
+		})
+		return rows
+	case hasElem:
+		return walk(s.Find(elemSel), fields)
+	default:
+		return walk(s, fields)
+	}
+}