diff options
| -rw-r--r-- | js/template.js | 17 |
| -rw-r--r-- | scrape.go | 19 |
2 files changed, 26 insertions, 10 deletions
diff --git a/js/template.js b/js/template.js
index a2b4518..f75df28 100644
--- a/js/template.js
+++ b/js/template.js
@@ -2,17 +2,24 @@ import { parse } from "flyscrape";
 
 export const options = {
   url: "https://news.ycombinator.com/", // Specify the URL to start scraping from.
-  depth: 1, // Specify how deep links should be followed (0 = no follow).
-  allowedDomains: ["news.ycombinator.com"], // Specify the allowed domains to follow.
-  rate: 100, // Specify the request rate in requests per second.
+  depth: 1, // Specify how deep links should be followed. (default = 0, no follow)
+  allowedDomains: ["news.ycombinator.com"], // Specify the allowed domains. (default = domain from url)
+  blockedDomains: [], // Specify the blocked domains. (default = none)
+  rate: 100, // Specify the rate in requests per second. (default = 100)
 }
 
 export default function({ html, url }) {
   const $ = parse(html);
+  const title = $('title');
+  const entries = $('.athing').toArray();
+
+  if (entries.length == 0) {
+    return null; // Omits scraped pages without entries.
+  }
 
   return {
-    title: $('title').text(),
-    entries: $('.athing').toArray().map(entry => {
+    title: title.text(), // Extract the page title.
+    entries: entries.map(entry => { // Extract all news entries.
       const link = $(entry).find('.titleline > a');
       const rank = $(entry).find('.rank');
       const points = $(entry).next().find('.score');
diff --git a/scrape.go b/scrape.go
--- a/scrape.go
+++ b/scrape.go
@@ -19,6 +19,7 @@ type ScrapeParams struct {
 type ScrapeOptions struct {
 	URL            string   `json:"url"`
 	AllowedDomains []string `json:"allowedDomains"`
+	BlockedDomains []string `json:"blockedDomains"`
 	Depth          int      `json:"depth"`
 	Rate           float64  `json:"rate"`
 }
@@ -135,7 +136,9 @@ func (s *Scraper) worker(jobs chan target, results chan<- result) {
 			}
 		}
 
-		results <- res
+		if res.err != nil || res.data != nil {
+			results <- res
+		}
 		s.wg.Done()
 	}()
@@ -163,17 +166,23 @@ func (s *Scraper) isURLAllowed(rawurl string) bool {
 	}
 
 	host := u.Host()
 
+	ok := false
 	for _, domain := range s.ScrapeOptions.AllowedDomains {
-		if domain == "*" {
-			return true
+		if domain == "*" || host == domain {
+			ok = true
+			break
 		}
+	}
+
+	for _, domain := range s.ScrapeOptions.BlockedDomains {
 		if host == domain {
-			return true
+			ok = false
+			break
 		}
 	}
 
-	return false
+	return ok
 }
 
 func Links(html string, origin string) []string {