From bf99c233a18c3165e0d4d251b41224e5bc6eb93d Mon Sep 17 00:00:00 2001 From: Philipp Tanlak Date: Fri, 10 Jan 2025 12:49:32 +0100 Subject: Implement nested scraping (#81) --- scrape.go | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) (limited to 'scrape.go') diff --git a/scrape.go b/scrape.go index 1667f42..a183681 100644 --- a/scrape.go +++ b/scrape.go @@ -203,7 +203,13 @@ func (s *Scraper) process(url string, depth int) { } }() - response.Data, err = s.ScrapeFunc(ScrapeParams{HTML: string(response.Body), URL: request.URL}) + p := ScrapeParams{ + HTML: string(response.Body), + URL: request.URL, + Process: s.processImmediate, + } + + response.Data, err = s.ScrapeFunc(p) if err != nil { response.Error = err return @@ -212,6 +218,52 @@ func (s *Scraper) process(url string, depth int) { } } +func (s *Scraper) processImmediate(url string) ([]byte, error) { + request := &Request{ + Method: http.MethodGet, + URL: url, + Headers: http.Header{}, + Cookies: s.Client.Jar, + } + + for _, mod := range s.Modules { + if v, ok := mod.(RequestBuilder); ok { + v.BuildRequest(request) + } + } + + req, err := http.NewRequest(request.Method, request.URL, nil) + if err != nil { + return nil, err + } + req.Header = request.Headers + + for _, mod := range s.Modules { + if v, ok := mod.(RequestValidator); ok { + if !v.ValidateRequest(request) { + return nil, nil + } + } + } + + resp, err := s.Client.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return nil, fmt.Errorf("%d %s", resp.StatusCode, http.StatusText(resp.StatusCode)) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + + return body, nil +} + func (s *Scraper) enqueueJob(url string, depth int) { url = strings.TrimSpace(url) if url == "" { -- cgit v1.2.3