summaryrefslogblamecommitdiff
path: root/modules/urlfilter/urlfilter.go
blob: 58675e86a42f4573fe56dd4a8d061d8c6271c9e1 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11
12
13












                                                                      
                                          



                                         
                                          






                                                 











                                                                    
















                                                             




                                                             
                         
                           

                           




                                   











                                                                   
                                          





                                            
                                          







                                  



                                                                 
     

                                                     
 
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package urlfilter

import (
	"regexp"

	"github.com/philippta/flyscrape"
)

func init() {
	flyscrape.RegisterModule(Module{})
}

// Module decides whether a request may proceed based on its URL.
//
// URL and URLs are start URLs: a request whose URL is exactly equal to one of
// them is always accepted by ValidateRequest. AllowedURLs and BlockedURLs hold
// regular expression patterns (compiled in Provision). When both lists are
// empty the module is disabled and every request is accepted.
type Module struct {
	URL         string   `json:"url"`
	URLs        []string `json:"urls"`
	AllowedURLs []string `json:"allowedURLs"`
	BlockedURLs []string `json:"blockedURLs"`

	// Compiled forms of AllowedURLs and BlockedURLs, filled in by Provision.
	// Patterns that fail to compile are not represented here.
	allowedURLsRE []*regexp.Regexp
	blockedURLsRE []*regexp.Regexp
}

// ModuleInfo returns the module descriptor: the ID "urlfilter" and a
// constructor that yields a fresh, zero-valued *Module.
func (Module) ModuleInfo() flyscrape.ModuleInfo {
	var info flyscrape.ModuleInfo
	info.ID = "urlfilter"
	info.New = func() flyscrape.Module {
		return &Module{}
	}
	return info
}

// Provision compiles the AllowedURLs and BlockedURLs patterns into regular
// expressions for use by ValidateRequest. It does nothing when the module is
// disabled (both pattern lists empty).
//
// The compiled lists are replaced rather than appended to, so calling
// Provision more than once does not accumulate duplicate expressions.
func (m *Module) Provision(v flyscrape.Context) {
	if m.disabled() {
		return
	}

	m.allowedURLsRE = compilePatterns(m.AllowedURLs)
	m.blockedURLsRE = compilePatterns(m.BlockedURLs)
}

// compilePatterns compiles each pattern and returns the successfully compiled
// expressions in their original order.
func compilePatterns(patterns []string) []*regexp.Regexp {
	compiled := make([]*regexp.Regexp, 0, len(patterns))
	for _, pat := range patterns {
		re, err := regexp.Compile(pat)
		if err != nil {
			// Deliberately best-effort: Provision has no error return, so an
			// invalid pattern is skipped and the remaining patterns still
			// apply.
			continue
		}
		compiled = append(compiled, re)
	}
	return compiled
}

// ValidateRequest reports whether the request r may proceed.
//
// A request is accepted when the module is disabled, when its URL is exactly
// the configured URL or one of URLs, or when no pattern compiled successfully.
// Otherwise it is rejected if any blocked pattern matches, and accepted only
// if the allow list is empty or at least one allowed pattern matches. A
// blocked match always wins over an allowed match.
func (m *Module) ValidateRequest(r *flyscrape.Request) bool {
	if m.disabled() {
		return true
	}

	target := r.URL

	// The start URLs bypass all filters.
	if target == m.URL {
		return true
	}
	for i := range m.URLs {
		if m.URLs[i] == target {
			return true
		}
	}

	hasAllowList := len(m.allowedURLsRE) > 0
	hasBlockList := len(m.blockedURLsRE) > 0
	if !hasAllowList && !hasBlockList {
		// Nothing usable to filter with.
		return true
	}

	// Block list takes precedence, so check it first.
	for _, blocked := range m.blockedURLsRE {
		if blocked.MatchString(target) {
			return false
		}
	}

	// Without an allow list everything that is not blocked passes.
	if !hasAllowList {
		return true
	}

	for _, allowed := range m.allowedURLsRE {
		if allowed.MatchString(target) {
			return true
		}
	}
	return false
}

// disabled reports whether the module has no allow and no block patterns
// configured, in which case it performs no filtering.
func (m *Module) disabled() bool {
	configured := len(m.AllowedURLs) > 0 || len(m.BlockedURLs) > 0
	return !configured
}

// Compile-time checks that *Module satisfies the flyscrape interfaces whose
// methods it implements in this file.
var (
	_ flyscrape.RequestValidator = (*Module)(nil)
	_ flyscrape.Provisioner      = (*Module)(nil)
)