diff options
| author | Philipp Tanlak <philipp.tanlak@gmail.com> | 2025-01-10 12:49:32 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-01-10 12:49:32 +0100 |
| commit | bf99c233a18c3165e0d4d251b41224e5bc6eb93d (patch) | |
| tree | d32f0fd0770a049552cdd0d51e9402d594e9a35e /scrape.go | |
| parent | 924184f37ef0d3e244f8e8991c259affb45d0ae2 (diff) | |
Implement nested scraping (#81)
Diffstat (limited to 'scrape.go')
| -rw-r--r-- | scrape.go | 54 |
1 files changed, 53 insertions, 1 deletions
```diff
@@ -203,7 +203,13 @@ func (s *Scraper) process(url string, depth int) {
 		}
 	}()
 
-	response.Data, err = s.ScrapeFunc(ScrapeParams{HTML: string(response.Body), URL: request.URL})
+	p := ScrapeParams{
+		HTML:    string(response.Body),
+		URL:     request.URL,
+		Process: s.processImmediate,
+	}
+
+	response.Data, err = s.ScrapeFunc(p)
 	if err != nil {
 		response.Error = err
 		return
@@ -212,6 +218,52 @@ func (s *Scraper) process(url string, depth int) {
 	}
 }
 
+func (s *Scraper) processImmediate(url string) ([]byte, error) {
+	request := &Request{
+		Method:  http.MethodGet,
+		URL:     url,
+		Headers: http.Header{},
+		Cookies: s.Client.Jar,
+	}
+
+	for _, mod := range s.Modules {
+		if v, ok := mod.(RequestBuilder); ok {
+			v.BuildRequest(request)
+		}
+	}
+
+	req, err := http.NewRequest(request.Method, request.URL, nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header = request.Headers
+
+	for _, mod := range s.Modules {
+		if v, ok := mod.(RequestValidator); ok {
+			if !v.ValidateRequest(request) {
+				return nil, nil
+			}
+		}
+	}
+
+	resp, err := s.Client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return nil, fmt.Errorf("%d %s", resp.StatusCode, http.StatusText(resp.StatusCode))
+	}
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, err
+	}
+
+	return body, nil
+}
+
 func (s *Scraper) enqueueJob(url string, depth int) {
 	url = strings.TrimSpace(url)
 	if url == "" {
```