1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
|
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package starturl_test
import (
"net/http"
"testing"
"github.com/philippta/flyscrape"
"github.com/philippta/flyscrape/modules/hook"
"github.com/philippta/flyscrape/modules/starturl"
"github.com/stretchr/testify/require"
)
func TestStartURL(t *testing.T) {
var url string
var depth int
mods := []flyscrape.Module{
&starturl.Module{URL: "http://www.example.com/foo/bar"},
hook.Module{
AdaptTransportFn: func(rt http.RoundTripper) http.RoundTripper {
return flyscrape.MockTransport(200, "")
},
BuildRequestFn: func(r *flyscrape.Request) {
url = r.URL
depth = r.Depth
},
},
}
scraper := flyscrape.NewScraper()
scraper.Modules = mods
scraper.Run()
require.Equal(t, "http://www.example.com/foo/bar", url)
require.Equal(t, 0, depth)
}
// TestStartURL_MultipleStartingURLs is a table-driven test covering the
// combinations of the Module's URL and URLs fields (both set, only one set,
// neither set). For each case it runs a scraper and asserts that the URLs seen
// by the request-building hook match the expected set, ignoring order.
func TestStartURL_MultipleStartingURLs(t *testing.T) {
	const (
		fooURL = "http://www.example.com/foo"
		barURL = "http://www.example.com/bar"
		bazURL = "http://www.example.com/baz"
	)

	scenarios := []struct {
		label     string
		newModule func() *starturl.Module // builds a fresh module per subtest
		want      []string
	}{
		{
			label: ".URL and .URLs",
			newModule: func() *starturl.Module {
				return &starturl.Module{
					URL:  fooURL,
					URLs: []string{barURL, bazURL},
				}
			},
			want: []string{fooURL, barURL, bazURL},
		},
		{
			label: "only .URL",
			newModule: func() *starturl.Module {
				return &starturl.Module{URL: fooURL}
			},
			want: []string{fooURL},
		},
		{
			label: "only .URLs",
			newModule: func() *starturl.Module {
				return &starturl.Module{URLs: []string{barURL, bazURL}}
			},
			want: []string{barURL, bazURL},
		},
		{
			label: "empty",
			newModule: func() *starturl.Module {
				return &starturl.Module{}
			},
			want: []string{},
		},
	}

	for _, sc := range scenarios {
		t.Run(sc.label, func(t *testing.T) {
			// Collect every URL handed to the build hook during the run.
			seen := make([]string, 0, len(sc.want))

			recorder := hook.Module{
				AdaptTransportFn: func(http.RoundTripper) http.RoundTripper {
					return flyscrape.MockTransport(http.StatusOK, "")
				},
				BuildRequestFn: func(req *flyscrape.Request) {
					seen = append(seen, req.URL)
				},
			}

			s := flyscrape.NewScraper()
			s.Modules = []flyscrape.Module{sc.newModule(), recorder}
			s.Run()

			// Only membership and multiplicity are compared, not order.
			require.ElementsMatch(t, sc.want, seen)
		})
	}
}
|