// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package flyscrape_test

import (
	"net/http"
	"net/http/httptest"
	"testing"
	"time"

	"flyscrape"

	"github.com/stretchr/testify/require"
)
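
// TestScrapeFollowLinks checks that links on the start page are resolved
// against the page URL: an absolute path ("/baz") resolves against the host,
// a relative path ("baz") against the current directory, and an absolute URL
// is followed when its domain is allowed.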
func TestScrapeFollowLinks(t *testing.T) {
	scr := flyscrape.Scraper{
		ScrapeOptions: flyscrape.ScrapeOptions{
			URL:          "http://www.example.com/foo/bar",
			Depth:        1,
			AllowDomains: []string{"www.google.com"},
		},
		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
			return "foobar", nil
		},
		FetchFunc: func(url string) (string, error) {
			// One absolute-path link, one relative link and one external link.
			return `<a href="/baz">Baz</a>
					<a href="baz">Baz</a>
					<a href="http://www.google.com">Google</a>`, nil
		},
	}

	urls := make(map[string]struct{})
	for res := range scr.Scrape() {
		urls[res.URL] = struct{}{}
	}

	require.Len(t, urls, 4)
	require.Contains(t, urls, "http://www.example.com/baz")
	require.Contains(t, urls, "http://www.example.com/foo/bar")
	require.Contains(t, urls, "http://www.example.com/foo/baz")
	require.Contains(t, urls, "http://www.google.com/")
}
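
// TestScrapeDepth checks that crawling stops after the configured number of
// hops from the start URL and that visited pages are not fetched twice.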
func TestScrapeDepth(t *testing.T) {
	scr := flyscrape.Scraper{
		ScrapeOptions: flyscrape.ScrapeOptions{
			URL:          "http://www.example.com/",
			Depth:        2,
			AllowDomains: []string{"*"},
		},
		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
			return "foobar", nil
		},
		FetchFunc: func(url string) (string, error) {
			// Each page links to the next; the last links back to the start.
			switch url {
			case "http://www.example.com/":
				return `<a href="http://www.google.com">Google</a>`, nil
			case "http://www.google.com/":
				return `<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
			case "http://www.duckduckgo.com/":
				return `<a href="http://www.example.com">Example</a>`, nil
			}
			return "", nil
		},
	}

	urls := make(map[string]struct{})
	for res := range scr.Scrape() {
		urls[res.URL] = struct{}{}
	}

	require.Len(t, urls, 3)
	require.Contains(t, urls, "http://www.example.com/")
	require.Contains(t, urls, "http://www.google.com/")
	require.Contains(t, urls, "http://www.duckduckgo.com/")
}
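
// TestScrapeAllowDomains checks that only links pointing to a domain listed
// in AllowDomains are followed; the start URL is always scraped.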
func TestScrapeAllowDomains(t *testing.T) {
	scr := flyscrape.Scraper{
		ScrapeOptions: flyscrape.ScrapeOptions{
			URL:          "http://www.example.com/",
			Depth:        1,
			AllowDomains: []string{"www.google.com"},
		},
		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
			return "foobar", nil
		},
		FetchFunc: func(url string) (string, error) {
			return `<a href="http://www.google.com">Google</a>
					<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
		},
	}

	urls := make(map[string]struct{})
	for res := range scr.Scrape() {
		urls[res.URL] = struct{}{}
	}

	require.Len(t, urls, 2)
	require.Contains(t, urls, "http://www.example.com/")
	require.Contains(t, urls, "http://www.google.com/")
}
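
// TestScrapeAllowDomainsAll checks that the "*" wildcard allows links to any
// domain.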
func TestScrapeAllowDomainsAll(t *testing.T) {
	scr := flyscrape.Scraper{
		ScrapeOptions: flyscrape.ScrapeOptions{
			URL:          "http://www.example.com/",
			Depth:        1,
			AllowDomains: []string{"*"},
		},
		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
			return "foobar", nil
		},
		FetchFunc: func(url string) (string, error) {
			return `<a href="http://www.google.com">Google</a>
					<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
		},
	}

	urls := make(map[string]struct{})
	for res := range scr.Scrape() {
		urls[res.URL] = struct{}{}
	}

	require.Len(t, urls, 3)
	require.Contains(t, urls, "http://www.example.com/")
	require.Contains(t, urls, "http://www.duckduckgo.com/")
	require.Contains(t, urls, "http://www.google.com/")
}
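
// TestScrapeDenyDomains checks that DenyDomains excludes a domain even when
// AllowDomains contains the "*" wildcard.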
func TestScrapeDenyDomains(t *testing.T) {
	scr := flyscrape.Scraper{
		ScrapeOptions: flyscrape.ScrapeOptions{
			URL:          "http://www.example.com/",
			Depth:        1,
			AllowDomains: []string{"*"},
			DenyDomains:  []string{"www.google.com"},
		},
		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
			return "foobar", nil
		},
		FetchFunc: func(url string) (string, error) {
			return `<a href="http://www.google.com">Google</a>
					<a href="http://www.duckduckgo.com">DuckDuckGo</a>`, nil
		},
	}

	urls := make(map[string]struct{})
	for res := range scr.Scrape() {
		urls[res.URL] = struct{}{}
	}

	require.Len(t, urls, 2)
	require.Contains(t, urls, "http://www.example.com/")
	require.Contains(t, urls, "http://www.duckduckgo.com/")
}
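
// TestScrapeAllowURLs checks that only links whose URLs match one of the
// AllowURLs regular expressions are followed.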
func TestScrapeAllowURLs(t *testing.T) {
	scr := flyscrape.Scraper{
		ScrapeOptions: flyscrape.ScrapeOptions{
			URL:       "http://www.example.com/",
			Depth:     1,
			AllowURLs: []string{`/foo\?id=\d+`, `/bar$`},
		},
		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
			return "foobar", nil
		},
		FetchFunc: func(url string) (string, error) {
			// Only "/foo?id=123" and "/bar" match the patterns above.
			return `<a href="/foo?id=123">123</a>
					<a href="/foo?id=ABC">ABC</a>
					<a href="/bar">bar</a>
					<a href="/barz">barz</a>`, nil
		},
	}

	urls := make(map[string]struct{})
	for res := range scr.Scrape() {
		urls[res.URL] = struct{}{}
	}

	require.Len(t, urls, 3)
	require.Contains(t, urls, "http://www.example.com/")
	require.Contains(t, urls, "http://www.example.com/foo?id=123")
	require.Contains(t, urls, "http://www.example.com/bar")
}
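
// TestScrapeRate checks that requests are rate limited: with Rate set to 100
// requests per second, results should arrive roughly 10ms apart.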
func TestScrapeRate(t *testing.T) {
	scr := flyscrape.Scraper{
		ScrapeOptions: flyscrape.ScrapeOptions{
			URL:   "http://www.example.com/",
			Depth: 1,
			Rate:  100, // every 10ms
		},
		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
			return "foobar", nil
		},
		FetchFunc: func(url string) (string, error) {
			// Any followable link works here; it only has to trigger a
			// second, rate-limited request.
			return `<a href="/foo">foo</a>`, nil
		},
	}

	res := scr.Scrape()

	start := time.Now()
	<-res
	first := time.Now().Add(-10 * time.Millisecond)
	<-res
	second := time.Now().Add(-20 * time.Millisecond)

	// Each result must arrive at most ~2ms after its scheduled 10ms slot.
	require.Less(t, first.Sub(start), 2*time.Millisecond)
	require.Less(t, second.Sub(start), 2*time.Millisecond)
}
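
// TestScrapeProxy checks that requests are routed through the configured
// HTTP proxy.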
func TestScrapeProxy(t *testing.T) {
	proxyCalled := false
	proxy := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		proxyCalled = true
		// The response body is irrelevant here; the test only asserts that
		// the proxy was hit.
		w.Write([]byte(`<a href="http://www.google.com">Google</a>`))
	}))
	defer proxy.Close()

	scr := flyscrape.Scraper{
		ScrapeOptions: flyscrape.ScrapeOptions{
			URL:   "http://www.example.com/",
			Proxy: proxy.URL,
		},
		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
			return "foobar", nil
		},
	}

	res := <-scr.Scrape()
	require.True(t, proxyCalled)
	require.Equal(t, "http://www.example.com/", res.URL)
}