summaryrefslogblamecommitdiff
path: root/scrape_test.go
blob: 643b10dc58c1fb20fc5d3d90fa25931057c1bc74 (plain) (tree)
1
2
3
4
5
6
7





                      
                   




                                             
                                 

                                                       


                                                                                  
                  

                                                                              



















                                                              
                               



                                                               





















                                                                
package flyscrape_test

import (
	"sort"
	"testing"

	"flyscrape"

	"github.com/stretchr/testify/require"
)

func TestScrape(t *testing.T) {
	svc := flyscrape.Scraper{
		Concurrency: 10,
		ScrapeOptions: flyscrape.ScrapeOptions{
			URL:            "http://example.com/foo/bar",
			Depth:          1,
			AllowedDomains: []string{"example.com", "www.google.com"},
		},
		ScrapeFunc: func(params flyscrape.ScrapeParams) (any, error) {
			return map[string]any{
				"url": params.URL,
			}, nil
		},
		FetchFunc: func(url string) (string, error) {
			return `<html>
                <body>
                    <a href="/baz">Baz</a>
                    <a href="baz">Baz</a>
                    <a href="http://www.google.com">Google</a>
                </body>
            </html>`, nil
		},
	}

	var urls []string
	for res := range svc.Scrape() {
		urls = append(urls, res.URL)
	}
	sort.Strings(urls)

	require.Len(t, urls, 4)
	require.Equal(t, "http://example.com/baz", urls[0])
	require.Equal(t, "http://example.com/foo/bar", urls[1])
	require.Equal(t, "http://example.com/foo/baz", urls[2])
	require.Equal(t, "http://www.google.com/", urls[3])
}

func TestFindLinks(t *testing.T) {
	origin := "http://example.com/foo/bar"
	html := `
        <html>
            <body>
                <a href="/baz">Baz</a>
                <a href="baz">Baz</a>
                <a href="http://www.google.com">Google</a>
                <a href="javascript:void(0)">Google</a>
                <a href="/foo#hello">Anchor</a>
            </body>
        </html>`

	links := flyscrape.Links(html, origin)
	require.Len(t, links, 4)
	require.Equal(t, "http://example.com/baz", links[0])
	require.Equal(t, "http://example.com/foo/baz", links[1])
	require.Equal(t, "http://www.google.com/", links[2])
	require.Equal(t, "http://example.com/foo", links[3])
}