
Do not output duplicate images from the same page

master · parent · commit f2c77e787e
Changed files:

1. README.md (2 changed lines)
2. src/web/requests.go (4 changed lines)
3. src/worker/worker.go (100 changed lines)

README.md (2 changed lines)

@@ -17,7 +17,7 @@ The parsing starts from `initial_pages` and goes deeper while ignoring the pages
 if `is_regexp` is `false`, then `query` is the text to be searched for, but there are some special values:
 - `links` - tells `webscrape` to search for all links there are on the page
-- `images` - find all image links and output to the `output_dir`
+- `images` - find all image links and output to the `output_dir` (**IMPORTANT**: set `wait_timeout_ms` to `0` so the images load fully)
 When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
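
The worker code below switches on `config.QueryImages`, so the special `query` values from the README presumably map to constants in the `config` package. A minimal sketch of that mapping (only `QueryImages` is visible in this diff; the `QueryLinks` name is an assumption):

```go
// Hypothetical sketch: how the special query values could be declared.
// Only QueryImages appears in this commit; QueryLinks is assumed.
package config

const (
	QueryLinks  = "links"  // search for every link on the page
	QueryImages = "images" // download every image link into output_dir
)
```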

src/web/requests.go (4 changed lines)

@@ -31,7 +31,9 @@ func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
 	}
 	req.Header.Set("User-Agent", userAgent)
-	http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
+	if timeOutMs != 0 {
+		http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
+	}
 	response, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return nil, err
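
A note on the `timeOutMs != 0` guard: the image downloads in `outputImages` go through `http.Get`, i.e. the same shared `http.DefaultClient`, so whatever timeout `GetPage` sets there also applies to them. With `wait_timeout_ms` at `0` the client's `Timeout` keeps its zero value, which in `net/http` means no timeout at all, so large images are not cut off mid-download. A standalone sketch of the same conditional-timeout pattern, using a dedicated client rather than the shared default one (hypothetical names, not the project's code):

```go
package main

import (
	"io"
	"net/http"
	"time"
)

// fetch sketches the conditional-timeout pattern from GetPage, applied to a
// dedicated client instead of the process-wide http.DefaultClient.
func fetch(url string, timeOutMs uint64) ([]byte, error) {
	client := &http.Client{}
	if timeOutMs != 0 {
		// A zero Timeout on http.Client disables the timeout, so it is
		// only set when the caller actually asked for one.
		client.Timeout = time.Duration(timeOutMs) * time.Millisecond
	}
	resp, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	return io.ReadAll(resp.Body)
}
```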

src/worker/worker.go (100 changed lines)

@@ -59,6 +59,62 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
 	}
 }
+func (w *Worker) outputImages(baseURL *url.URL, imageLinks []string) {
+	var alreadyProcessedImgUrls []string
+	for count, imageLink := range imageLinks {
+		// check if this URL has been processed already
+		var skipImage bool = false
+		for _, processedURL := range alreadyProcessedImgUrls {
+			if imageLink == processedURL {
+				skipImage = true
+				break
+			}
+		}
+		if skipImage {
+			skipImage = false
+			continue
+		} else {
+			alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
+		}
+		var imageName string = fmt.Sprintf("%s_%d_%s", baseURL.Host, count, path.Base(imageLink))
+		response, err := http.Get(imageLink)
+		if err != nil {
+			logger.Error("Failed to get %s", imageLink)
+			continue
+		}
+		imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
+		if err != nil {
+			logger.Error("Failed to create image file \"%s\": %s", imageName, err)
+			continue
+		}
+		_, _ = io.Copy(imageFile, response.Body)
+		response.Body.Close()
+		imageFile.Close()
+		logger.Info("Outputted \"%s\"", imageName)
+		w.stats.MatchesFound++
+	}
+}
+func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
+	if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
+		var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String()))
+		pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, pageName))
+		if err != nil {
+			logger.Error("Failed to create page of \"%s\": %s", baseURL.String(), err)
+		} else {
+			pageFile.Write(pageData)
+		}
+		pageFile.Close()
+	}
+}
 func (w *Worker) Work() {
 	if w.Stopped {
 		return
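
`outputImages` skips duplicates by scanning the `alreadyProcessedImgUrls` slice for every link, which is quadratic in the number of image links on a page. The same bookkeeping is often done with a map used as a set; a minimal sketch of that alternative (a hypothetical helper, not part of the commit):

```go
// dedupeLinks returns imageLinks with duplicates removed, preserving order.
// A map gives O(1) membership checks instead of a scan per link.
func dedupeLinks(imageLinks []string) []string {
	seen := make(map[string]struct{}, len(imageLinks))
	unique := make([]string, 0, len(imageLinks))
	for _, link := range imageLinks {
		if _, done := seen[link]; done {
			continue // already scheduled for download
		}
		seen[link] = struct{}{}
		unique = append(unique, link)
	}
	return unique
}
```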
@@ -153,32 +209,10 @@ func (w *Worker) Work() {
 			}
 		case config.QueryImages:
-			// find image URLs, output data to the file
+			// find image URLs, output images to the file while not saving already outputted ones
 			imageLinks := web.FindPageImages(pageData, parsedURL.Host)
-			for count, imageLink := range imageLinks {
-				var imageName string = fmt.Sprintf("%s_%d_%s", parsedURL.Host, count, path.Base(imageLink))
-				response, err := http.Get(imageLink)
-				if err != nil {
-					logger.Error("Failed to get %s", imageLink)
-					continue
-				}
-				imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
-				if err != nil {
-					logger.Error("Failed to create image file \"%s\": %s", imageName, err)
-					continue
-				}
-				_, _ = io.Copy(imageFile, response.Body)
-				response.Body.Close()
-				imageFile.Close()
-				logger.Info("Outputted \"%s\"", imageName)
-				w.stats.MatchesFound++
-			}
+			w.outputImages(parsedURL, imageLinks)
 			if len(imageLinks) > 0 {
 				savePage = true
@@ -223,22 +257,8 @@ func (w *Worker) Work() {
 		}
 		// save page
-		if savePage && w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
-			url, err := url.Parse(job.URL)
-			if err != nil {
-				logger.Error("Failed to parse \"%s\" to save page: %s", job.URL, err)
-				break
-			}
-			var pageName string = fmt.Sprintf("%s_%s.html", url.Host, path.Base(job.URL))
-			pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, pageName))
-			if err != nil {
-				logger.Error("Failed to create page of \"%s\": %s", job.URL, err)
-			} else {
-				pageFile.Write(pageData)
-			}
-			pageFile.Close()
+		if savePage {
+			w.savePage(parsedURL, pageData)
 		}
 		// sleep before the next request
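
For reference, the output file names produced above combine the page host, the image's index, and the link's base name, so the first image link `https://example.com/a/pic.png` found on a page of `example.com` is written as `example.com_0_pic.png`; saved pages follow the analogous `host_base.html` pattern. A tiny runnable illustration (made-up values):

```go
package main

import (
	"fmt"
	"path"
)

func main() {
	host := "example.com"
	imageLink := "https://example.com/a/pic.png"
	// Same formatting as outputImages: host, index in imageLinks, file base name.
	fmt.Printf("%s_%d_%s\n", host, 0, path.Base(imageLink)) // example.com_0_pic.png
}
```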
