diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2024-02-17 23:42:55 +0100 |
|---|---|---|
| committer | Philipp Tanlak <philipp.tanlak@gmail.com> | 2024-02-17 23:42:55 +0100 |
| commit | c796f4164c13e30135246c08304acd7142673f60 (patch) | |
| tree | 739f498d883df995313da1aefc68d5a9bd7b78af | |
| parent | f4a69b75da6d29680c0ebcded88f67016cf6fdc4 (diff) | |
Make urls more fault tolerant (tag: v0.7.2)
| -rw-r--r-- | examples/urls_from_file.js | 2 | ||||
| -rw-r--r-- | scrape.go | 6 |
2 files changed, 7 insertions, 1 deletions
```diff
diff --git a/examples/urls_from_file.js b/examples/urls_from_file.js
index 0231032..4633c9c 100644
--- a/examples/urls_from_file.js
+++ b/examples/urls_from_file.js
@@ -1,7 +1,7 @@
 import urls from "./urls.txt"
 
 export const config = {
-  urls: urls.split("\n").filter(Boolean)
+  urls: urls.split("\n")
 };
 
 export default function({ doc }) {
diff --git a/scrape.go b/scrape.go
--- a/scrape.go
+++ b/scrape.go
@@ -10,6 +10,7 @@ import (
 	"log"
 	"net/http"
 	"net/http/cookiejar"
+	"strings"
 	"sync"
 
 	"github.com/cornelk/hashmap"
@@ -210,6 +211,11 @@ func (s *Scraper) process(url string, depth int) {
 }
 
 func (s *Scraper) enqueueJob(url string, depth int) {
+	url = strings.TrimSpace(url)
+	if url == "" {
+		return
+	}
+
 	if _, ok := s.visited.Get(url); ok {
 		return
 	}
```