diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-07-27 19:03:41 +0200 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-07-27 19:03:41 +0200 |
| commit | a9b61f84070cc7ca0d6e26f187c745619a91422a (patch) | |
| tree | d69b67142b6de860d7da23bd5ff8c62af0aaca1e /scrape/parser.go | |
init
Diffstat (limited to 'scrape/parser.go')
| -rw-r--r-- | scrape/parser.go | 70 |
1 files changed, 70 insertions, 0 deletions
diff --git a/scrape/parser.go b/scrape/parser.go new file mode 100644 index 0000000..3304b77 --- /dev/null +++ b/scrape/parser.go @@ -0,0 +1,70 @@ +package scrape + +import ( + "encoding/json" + "strings" + + "github.com/PuerkitoBio/goquery" +) + +func ParseFromJSON(html, input string) any { + var inputJSON map[string]any + json.Unmarshal([]byte(input), &inputJSON) + return Parse(html, inputJSON) +} + +func Parse(html string, fields map[string]any) any { + return queryMap(Doc(html), fields) +} + +func AddMeta(result any, key string, value any) { + switch res := result.(type) { + case []map[string]any: + for i := range res { + res[i][key] = value + } + case map[string]any: + res[key] = value + } +} + +func walk(s *goquery.Selection, fields map[string]any) map[string]any { + out := map[string]any{} + for k, v := range fields { + if strings.HasPrefix(k, "#") { + continue + } + + switch val := v.(type) { + case string: + segs := strings.SplitN(k, "#", 2) + if len(segs) == 2 && segs[1] == "html" { + out[segs[0]] = QueryHTML(s, val) + } else if len(segs) == 2 { + out[segs[0]] = QueryAttr(s, val, segs[1]) + } else { + out[k] = Query(s, val) + } + + case map[string]any: + out[k] = queryMap(s, val) + } + } + return out +} + +func queryMap(s *goquery.Selection, fields map[string]any) any { + if sel, ok := fields["#each"].(string); ok { + rows := []map[string]any{} + QueryFunc(s, sel, func(s *goquery.Selection) { + rows = append(rows, walk(s, fields)) + }) + return rows + } + + if sel, ok := fields["#element"].(string); ok { + return walk(s.Find(sel), fields) + } + + return walk(s, fields) +} |