From c796f4164c13e30135246c08304acd7142673f60 Mon Sep 17 00:00:00 2001
From: Philipp Tanlak
Date: Sat, 17 Feb 2024 23:42:55 +0100
Subject: Make urls more fault tolerant

---
 examples/urls_from_file.js | 2 +-
 scrape.go                  | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/examples/urls_from_file.js b/examples/urls_from_file.js
index 0231032..4633c9c 100644
--- a/examples/urls_from_file.js
+++ b/examples/urls_from_file.js
@@ -1,7 +1,7 @@
 import urls from "./urls.txt"
 
 export const config = {
-  urls: urls.split("\n").filter(Boolean)
+  urls: urls.split("\n")
 };
 
 export default function({ doc }) {
diff --git a/scrape.go b/scrape.go
index 019849d..f09cba6 100644
--- a/scrape.go
+++ b/scrape.go
@@ -10,6 +10,7 @@ import (
 	"log"
 	"net/http"
 	"net/http/cookiejar"
+	"strings"
 	"sync"
 
 	"github.com/cornelk/hashmap"
@@ -210,6 +211,11 @@ func (s *Scraper) process(url string, depth int) {
 }
 
 func (s *Scraper) enqueueJob(url string, depth int) {
+	url = strings.TrimSpace(url)
+	if url == "" {
+		return
+	}
+
 	if _, ok := s.visited.Get(url); ok {
 		return
 	}
-- 
cgit v1.2.3