summary refs log tree commit diff
path: root/scrape.go
diff options
context:
space:
mode:
Diffstat (limited to 'scrape.go')
-rw-r--r-- scrape.go 54
1 files changed, 53 insertions, 1 deletions
diff --git a/scrape.go b/scrape.go
index 1667f42..a183681 100644
--- a/scrape.go
+++ b/scrape.go
@@ -203,7 +203,13 @@ func (s *Scraper) process(url string, depth int) {
}
}()
- response.Data, err = s.ScrapeFunc(ScrapeParams{HTML: string(response.Body), URL: request.URL})
+ p := ScrapeParams{
+ HTML: string(response.Body),
+ URL: request.URL,
+ Process: s.processImmediate,
+ }
+
+ response.Data, err = s.ScrapeFunc(p)
if err != nil {
response.Error = err
return
@@ -212,6 +218,52 @@ func (s *Scraper) process(url string, depth int) {
}
}
+// processImmediate fetches url right away with a GET request issued through
+// s.Client and returns the raw response body. process wires it into
+// ScrapeParams.Process so the scrape callback can call it.
+//
+// Every module implementing RequestBuilder may modify the request before it
+// is sent; only the resulting method, URL and headers are carried over to the
+// outgoing HTTP request. If any module implementing RequestValidator rejects
+// the request, nothing is sent and the result is (nil, nil). A response
+// status outside 200-299 is reported as an error of the form
+// "<code> <status text>".
+func (s *Scraper) processImmediate(url string) ([]byte, error) {
+	spec := &Request{
+		Method:  http.MethodGet,
+		URL:     url,
+		Headers: http.Header{},
+		Cookies: s.Client.Jar,
+	}
+
+	// Give each request-building module a chance to adjust the request.
+	for _, m := range s.Modules {
+		builder, ok := m.(RequestBuilder)
+		if !ok {
+			continue
+		}
+		builder.BuildRequest(spec)
+	}
+
+	httpReq, err := http.NewRequest(spec.Method, spec.URL, nil)
+	if err != nil {
+		return nil, err
+	}
+	httpReq.Header = spec.Headers
+
+	// A single rejecting validator cancels the fetch without an error.
+	for _, m := range s.Modules {
+		if validator, ok := m.(RequestValidator); ok && !validator.ValidateRequest(spec) {
+			return nil, nil
+		}
+	}
+
+	res, err := s.Client.Do(httpReq)
+	if err != nil {
+		return nil, err
+	}
+	defer res.Body.Close()
+
+	if code := res.StatusCode; code < 200 || code >= 300 {
+		return nil, fmt.Errorf("%d %s", code, http.StatusText(code))
+	}
+
+	payload, err := io.ReadAll(res.Body)
+	if err != nil {
+		return nil, err
+	}
+	return payload, nil
+}
+
func (s *Scraper) enqueueJob(url string, depth int) {
url = strings.TrimSpace(url)
if url == "" {