From f2c77e787e8e5edb67ed08fba9936bdf04d2aa14 Mon Sep 17 00:00:00 2001
From: Unbewohnte
Date: Sun, 25 Dec 2022 09:54:21 +0300
Subject: [PATCH] Do not output the same images of the same page

---
 README.md            |   2 +-
 src/web/requests.go  |   4 +-
 src/worker/worker.go | 100 ++++++++++++++++++++++++++++----------------
 3 files changed, 64 insertions(+), 42 deletions(-)

diff --git a/README.md b/README.md
index 658df28..8fade5b 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ The parsing starts from `initial_pages` and goes deeper while ignoring the pages
 
 if `is_regexp` is `false`, then `query` is the text to be searched for, but there are some special values:
 
 - `links` - tells `webscrape` to search for all links there are on the page
-- `images` - find all image links and output to the `output_dir`
+- `images` - find all image links and output to the `output_dir` (**IMPORTANT**: set `wait_timeout_ms` to `0` so the images load fully)
 
 When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
diff --git a/src/web/requests.go b/src/web/requests.go
index ffbd3d8..3c267a5 100644
--- a/src/web/requests.go
+++ b/src/web/requests.go
@@ -31,7 +31,9 @@ func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
 	}
 
 	req.Header.Set("User-Agent", userAgent)
-	http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
+	if timeOutMs != 0 {
+		http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
+	}
 	response, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return nil, err
diff --git a/src/worker/worker.go b/src/worker/worker.go
index c000b43..0aa5180 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -59,6 +59,62 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
 	}
 }
 
+func (w *Worker) outputImages(baseURL *url.URL, imageLinks []string) {
+	var alreadyProcessedImgUrls []string
+	for count, imageLink := range imageLinks {
+		// check if this URL has been processed already
+		var skipImage bool = false
+		for _, processedURL := range alreadyProcessedImgUrls {
+			if imageLink == processedURL {
+				skipImage = true
+				break
+			}
+		}
+		if skipImage {
+			skipImage = false
+			continue
+		} else {
+			alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
+		}
+
+		var imageName string = fmt.Sprintf("%s_%d_%s", baseURL.Host, count, path.Base(imageLink))
+
+		response, err := http.Get(imageLink)
+		if err != nil {
+			logger.Error("Failed to get %s", imageLink)
+			continue
+		}
+
+		imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
+		if err != nil {
+			logger.Error("Failed to create image file \"%s\": %s", imageName, err)
+			continue
+		}
+
+		_, _ = io.Copy(imageFile, response.Body)
+
+		response.Body.Close()
+		imageFile.Close()
+
+		logger.Info("Outputted \"%s\"", imageName)
+		w.stats.MatchesFound++
+	}
+}
+
+func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
+	if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
+		var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String()))
+		pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, pageName))
+		if err != nil {
+			logger.Error("Failed to create page of \"%s\": %s", baseURL.String(), err)
+		} else {
+			pageFile.Write(pageData)
+		}
+
+		pageFile.Close()
+	}
+}
+
 func (w *Worker) Work() {
 	if w.Stopped {
 		return
@@ -153,32 +209,10 @@
 			}
 
 		case config.QueryImages:
-			// find image URLs, output data to the file
+			// find image URLs and output the images, skipping any already saved from this page
 			imageLinks := web.FindPageImages(pageData, parsedURL.Host)
-			for count, imageLink := range imageLinks {
-				var imageName string = fmt.Sprintf("%s_%d_%s", parsedURL.Host, count, path.Base(imageLink))
-
-				response, err := http.Get(imageLink)
-				if err != nil {
-					logger.Error("Failed to get %s", imageLink)
-					continue
-				}
-
-				imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
-				if err != nil {
-					logger.Error("Failed to create image file \"%s\": %s", imageName, err)
-					continue
-				}
-
-				_, _ = io.Copy(imageFile, response.Body)
-
-				response.Body.Close()
-				imageFile.Close()
-
-				logger.Info("Outputted \"%s\"", imageName)
-				w.stats.MatchesFound++
-			}
+			w.outputImages(parsedURL, imageLinks)
 
 			if len(imageLinks) > 0 {
 				savePage = true
 			}
@@ -223,22 +257,8 @@
 		}
 
 		// save page
-		if savePage && w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
-			url, err := url.Parse(job.URL)
-			if err != nil {
-				logger.Error("Failed to parse \"%s\" to save page: %s", job.URL, err)
-				break
-			}
-
-			var pageName string = fmt.Sprintf("%s_%s.html", url.Host, path.Base(job.URL))
-			pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, pageName))
-			if err != nil {
-				logger.Error("Failed to create page of \"%s\": %s", job.URL, err)
-			} else {
-				pageFile.Write(pageData)
-			}
-
-			pageFile.Close()
+		if savePage {
+			w.savePage(parsedURL, pageData)
 		}
 
 		// sleep before the next request
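
Editor's note on the deduplication technique: outputImages tracks seen URLs in a slice and rescans it for every link, which is quadratic in the number of image links on a page. The same skip-if-already-seen behavior can be expressed with a map used as a set, giving constant-time membership checks. A minimal, self-contained sketch of that alternative follows; the helper name uniqueLinks is hypothetical and not part of this patch.

package main

import "fmt"

// uniqueLinks returns links with duplicates removed, preserving first-seen
// order. Hypothetical helper: same skip-if-already-seen logic as the patch's
// outputImages, but using a map as a set instead of rescanning a slice.
func uniqueLinks(links []string) []string {
	seen := make(map[string]struct{}, len(links))
	unique := make([]string, 0, len(links))
	for _, link := range links {
		if _, ok := seen[link]; ok {
			continue // duplicate of an earlier link on this page; skip it
		}
		seen[link] = struct{}{}
		unique = append(unique, link)
	}
	return unique
}

func main() {
	links := []string{"logo.png", "banner.jpg", "logo.png"}
	fmt.Println(uniqueLinks(links)) // [logo.png banner.jpg]
}

Passing uniqueLinks(imageLinks) into the download loop would also keep outputImages focused on fetching and writing files, at the cost of one extra slice allocation per page.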