diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-10 18:18:01 +0200 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2023-08-10 18:18:01 +0200 |
| commit | 7e4cf39a0ba6ccbd5cc036700a8b1ff9358ecc3d (patch) | |
| tree | 0f48b46e70162bad117f9f50d297487dee33266f /scrape | |
| parent | a9b61f84070cc7ca0d6e26f187c745619a91422a (diff) | |
improve
Diffstat (limited to 'scrape')
| -rw-r--r-- | scrape/parser.go | 70 | ||||
| -rw-r--r-- | scrape/parser_test.go | 83 | ||||
| -rw-r--r-- | scrape/query.go | 41 | ||||
| -rw-r--r-- | scrape/query_test.go | 23 |
4 files changed, 0 insertions, 217 deletions
diff --git a/scrape/parser.go b/scrape/parser.go deleted file mode 100644 index 3304b77..0000000 --- a/scrape/parser.go +++ /dev/null @@ -1,70 +0,0 @@ -package scrape - -import ( - "encoding/json" - "strings" - - "github.com/PuerkitoBio/goquery" -) - -func ParseFromJSON(html, input string) any { - var inputJSON map[string]any - json.Unmarshal([]byte(input), &inputJSON) - return Parse(html, inputJSON) -} - -func Parse(html string, fields map[string]any) any { - return queryMap(Doc(html), fields) -} - -func AddMeta(result any, key string, value any) { - switch res := result.(type) { - case []map[string]any: - for i := range res { - res[i][key] = value - } - case map[string]any: - res[key] = value - } -} - -func walk(s *goquery.Selection, fields map[string]any) map[string]any { - out := map[string]any{} - for k, v := range fields { - if strings.HasPrefix(k, "#") { - continue - } - - switch val := v.(type) { - case string: - segs := strings.SplitN(k, "#", 2) - if len(segs) == 2 && segs[1] == "html" { - out[segs[0]] = QueryHTML(s, val) - } else if len(segs) == 2 { - out[segs[0]] = QueryAttr(s, val, segs[1]) - } else { - out[k] = Query(s, val) - } - - case map[string]any: - out[k] = queryMap(s, val) - } - } - return out -} - -func queryMap(s *goquery.Selection, fields map[string]any) any { - if sel, ok := fields["#each"].(string); ok { - rows := []map[string]any{} - QueryFunc(s, sel, func(s *goquery.Selection) { - rows = append(rows, walk(s, fields)) - }) - return rows - } - - if sel, ok := fields["#element"].(string); ok { - return walk(s.Find(sel), fields) - } - - return walk(s, fields) -} diff --git a/scrape/parser_test.go b/scrape/parser_test.go deleted file mode 100644 index 4eb515d..0000000 --- a/scrape/parser_test.go +++ /dev/null @@ -1,83 +0,0 @@ -package scrape - -import ( - "encoding/json" - "fmt" - "testing" - - "github.com/stretchr/testify/require" -) - -func TestParser(t *testing.T) { - o := ParseFromJSON(html, `{ - "title": "head > title", - "headline": "body h1", - "sections": { - "#each": ".container", - "head": "h2", - "text": "p", - "inner": { - "#each": ".inner", - "headline": "h3" - }, - "one": { - "#element": ".one", - "value": ".val" - } - } - }`) - require.Equal(t, o, nil) - - b, _ := json.MarshalIndent(o, "", " ") - fmt.Println(string(b)) -} - -func TestParser2(t *testing.T) { - o := ParseFromJSON(html, `{ - "#each": ".container", - "head": "h2", - "text": "p" - }`) - - b, _ := json.MarshalIndent(o, "", " ") - fmt.Println(string(b)) -} - -var html = ` -<html> - <head> - <title>Title</title> - </head> - <body> - <h1>Headline</h1> - <div class="container"> - <h2>Section 1</h2> - <p> - Paragraph 1 - </p> - <div class="one"> - <div class="val">One</div> - </div> - <div class="inner"> - <h3>Inner H3</h3> - </div> - <div class="inner"> - <h3>Inner H3 next</h3> - </div> - </div> - <div class="container"> - <h2>Section 2</h2> - <p> - Paragraph 2 - </p> - <div class="one"><div class="val">Two</div></div> - <div class="inner"> - <h3>Inner H3 2</h3> - </div> - <div class="inner"> - <h3>Inner H3 2 next</h3> - </div> - </div> - </body> -</html> -` diff --git a/scrape/query.go b/scrape/query.go deleted file mode 100644 index 1fe5ea4..0000000 --- a/scrape/query.go +++ /dev/null @@ -1,41 +0,0 @@ -package scrape - -import ( - "strings" - - "github.com/PuerkitoBio/goquery" -) - -var emptyDoc, _ = goquery.NewDocumentFromReader(strings.NewReader("")) - -func Doc(html string) *goquery.Selection { - doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) - if err != nil { - return emptyDoc.Selection - } - return doc.Selection -} - -func Query(s *goquery.Selection, selector string) string { - val := s.Find(selector).First().Text() - return strings.TrimSpace(val) -} - -func QueryAttr(s *goquery.Selection, selector, attr string) string { - val := s.Find(selector).First().AttrOr(attr, "") - return strings.TrimSpace(val) -} - -func QueryHTML(s *goquery.Selection, selector string) string { - val, err := goquery.OuterHtml(s.Find(selector)) - if err != nil { - return "" - } - return strings.TrimSpace(val) -} - -func QueryFunc(s *goquery.Selection, selector string, f func(*goquery.Selection)) { - s.Find(selector).Each(func(i int, s *goquery.Selection) { - f(s) - }) -} diff --git a/scrape/query_test.go b/scrape/query_test.go deleted file mode 100644 index 0e8423d..0000000 --- a/scrape/query_test.go +++ /dev/null @@ -1,23 +0,0 @@ -package scrape - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -func TestQuery(t *testing.T) { - html := `<html> -<body> - <h1 id="title">Page Title</h1> - <div id="posts"> - <div class="post">First post</div> - <div class="post">Second post</div> - <div class="post">Third post</div> - </div> -</body> -</html>` - - title := Query(Doc(html), "#title") - require.Equal(t, title, "Page Title") -} |