 js/template.js | 17 ++++++++++++-----
 scrape.go      | 19 ++++++++++++++-----
 2 files changed, 26 insertions(+), 10 deletions(-)
diff --git a/js/template.js b/js/template.js
index a2b4518..f75df28 100644
--- a/js/template.js
+++ b/js/template.js
@@ -2,17 +2,24 @@ import { parse } from "flyscrape";
 
 export const options = {
     url: "https://news.ycombinator.com/", // Specify the URL to start scraping from.
-    depth: 1, // Specify how deep links should be followed (0 = no follow).
-    allowedDomains: ["news.ycombinator.com"], // Specify the allowed domains to follow.
-    rate: 100, // Specify the request rate in requests per second.
+    depth: 1, // Specify how deep links should be followed. (default = 0, no follow)
+    allowedDomains: ["news.ycombinator.com"], // Specify the allowed domains. (default = domain from url)
+    blockedDomains: [], // Specify the blocked domains. (default = none)
+    rate: 100, // Specify the rate in requests per second. (default = 100)
 }
 
 export default function({ html, url }) {
     const $ = parse(html);
+    const title = $('title');
+    const entries = $('.athing').toArray();
+
+    if (entries.length == 0) {
+        return null; // Omits scraped pages without entries.
+    }
 
     return {
-        title: $('title').text(),
-        entries: $('.athing').toArray().map(entry => {
+        title: title.text(), // Extract the page title.
+        entries: entries.map(entry => { // Extract all news entries.
             const link = $(entry).find('.titleline > a');
             const rank = $(entry).find('.rank');
             const points = $(entry).next().find('.score');
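
The template's new blockedDomains option rides along in the exported options object, and the scrape.go hunk below adds the matching field to ScrapeOptions. A minimal sketch of that mapping, assuming the exported options are serialized to JSON before being decoded on the Go side (the JS-to-Go bridge itself is not part of this diff):

    package main

    import (
        "encoding/json"
        "fmt"
    )

    // Mirror of the ScrapeOptions struct from the scrape.go hunk below.
    type ScrapeOptions struct {
        URL            string   `json:"url"`
        AllowedDomains []string `json:"allowedDomains"`
        BlockedDomains []string `json:"blockedDomains"`
        Depth          int      `json:"depth"`
        Rate           float64  `json:"rate"`
    }

    func main() {
        // The options object exported by js/template.js, as JSON.
        raw := `{
            "url": "https://news.ycombinator.com/",
            "depth": 1,
            "allowedDomains": ["news.ycombinator.com"],
            "blockedDomains": [],
            "rate": 100
        }`

        var opts ScrapeOptions
        if err := json.Unmarshal([]byte(raw), &opts); err != nil {
            panic(err)
        }

        fmt.Printf("%+v\n", opts)
    }

The json struct tags are what tie the camelCase keys in template.js to the exported Go fields.
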
diff --git a/scrape.go b/scrape.go
index 6ff92dc..793e418 100644
--- a/scrape.go
+++ b/scrape.go
@@ -19,6 +19,7 @@ type ScrapeParams struct {
 type ScrapeOptions struct {
     URL            string   `json:"url"`
     AllowedDomains []string `json:"allowedDomains"`
+    BlockedDomains []string `json:"blockedDomains"`
     Depth          int      `json:"depth"`
     Rate           float64  `json:"rate"`
 }
@@ -135,7 +136,9 @@ func (s *Scraper) worker(jobs chan target, results chan<- result) {
                 }
             }
 
-            results <- res
+            if res.err != nil || res.data != nil {
+                results <- res
+            }
             s.wg.Done()
         }()
     }
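
Because the template may now return null, the worker forwards a result only when it carries data or an error; pages the script rejects never reach the output. A standalone sketch of that filtering pattern, with an illustrative result type and channel wiring rather than flyscrape's actual internals:

    package main

    import "fmt"

    // result mirrors the shape used in the worker above: scraped data
    // plus any error hit while fetching or evaluating the page.
    type result struct {
        url  string
        data any
        err  error
    }

    // forwardResults passes results through, dropping those where the
    // script returned null (no data) and nothing went wrong (no error).
    func forwardResults(in <-chan result, out chan<- result) {
        defer close(out)
        for res := range in {
            if res.err != nil || res.data != nil {
                out <- res
            }
        }
    }

    func main() {
        in := make(chan result, 3)
        out := make(chan result, 3)

        in <- result{url: "https://example.com/a", data: map[string]any{"title": "A"}}
        in <- result{url: "https://example.com/b"} // script returned null: dropped
        in <- result{url: "https://example.com/c", err: fmt.Errorf("timeout")}
        close(in)

        forwardResults(in, out)
        for res := range out {
            fmt.Println(res.url) // prints .../a and .../c, never .../b
        }
    }

Keeping errors flowing through matters here: a deliberately omitted page and a failed page would otherwise be indistinguishable downstream.
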
@@ -163,17 +166,23 @@ func (s *Scraper) isURLAllowed(rawurl string) bool {
     }
 
     host := u.Host()
 
+    ok := false
     for _, domain := range s.ScrapeOptions.AllowedDomains {
-        if domain == "*" {
-            return true
+        if domain == "*" || host == domain {
+            ok = true
+            break
         }
+    }
+
+    for _, domain := range s.ScrapeOptions.BlockedDomains {
         if host == domain {
-            return true
+            ok = false
+            break
         }
     }
 
-    return false
+    return ok
 }
 
 func Links(html string, origin string) []string {
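
The reworked isURLAllowed gives blockedDomains the final say: a host must match an entry in allowedDomains (or the "*" wildcard) and must not appear in blockedDomains. A standalone sketch of the same precedence, with the host check factored out so it can be exercised directly (the function name and sample domains are illustrative):

    package main

    import "fmt"

    // isHostAllowed mirrors the hunk above: the allow list grants
    // access and the block list revokes it, so blocks always win.
    func isHostAllowed(host string, allowed, blocked []string) bool {
        ok := false
        for _, domain := range allowed {
            if domain == "*" || host == domain {
                ok = true
                break
            }
        }

        for _, domain := range blocked {
            if host == domain {
                ok = false
                break
            }
        }

        return ok
    }

    func main() {
        allowed := []string{"*"}
        blocked := []string{"example.com"}

        fmt.Println(isHostAllowed("news.ycombinator.com", allowed, blocked)) // true: wildcard allows it
        fmt.Println(isHostAllowed("example.com", allowed, blocked))          // false: the block list wins
        fmt.Println(isHostAllowed("example.com", nil, nil))                  // false: nothing allowed by default
    }

Since ok starts out false, an empty allow list blocks everything; per the template comment above, allowedDomains defaults to the domain of the start URL.
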