Corrected README text

2 years ago · 31f4b9e43e
3 changed files with 43 additions and 46 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,4 +5,5 @@ output.json
 websurf
 conf_mega_ita.json
 wecr
-release/
+release/
+scraped/
--- a/README.md
+++ b/README.md
@ -10,13 +10,13 @@ The flow of work fully depends on the configuration file. By default `conf.json`

 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them.

-The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains`. If all initial pages are happen to be blacklisted - the program will end.
+The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains` or are NOT in `allowed_domains`. If all initial pages happen to be on blacklisted domains or are not in the allowed list, the program will get stuck.

 ### Search query

 if `is_regexp` is `false`, then `query` is the text to be searched for, but there are some special values:

- `links` - tells `webscrape` to search for all links there are on the page
+- `links` - tells `wecr` to search for all the links on the page
 - `images` - find all image links and output to the `output_dir` (**IMPORTANT**: set `wait_timeout_ms` to `0` so the images load fully)

 When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@ -60,48 +60,6 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
 	}
 }

-func (w *Worker) outputImages(baseURL *url.URL, imageLinks []string) {
-	var alreadyProcessedImgUrls []string
-	for count, imageLink := range imageLinks {
-		// check if this URL has been processed already
-		var skipImage bool = false
-		for _, processedURL := range alreadyProcessedImgUrls {
-			if imageLink == processedURL {
-				skipImage = true
-				break
-			}
-		}
-		if skipImage {
-			skipImage = false
-			continue
-		} else {
-			alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
-		}
-
-		var imageName string = fmt.Sprintf("%s_%d_%s", baseURL.Host, count, path.Base(imageLink))
-
-		response, err := http.Get(imageLink)
-		if err != nil {
-			logger.Error("Failed to get image %s", imageLink)
-			continue
-		}
-
-		imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
-		if err != nil {
-			logger.Error("Failed to create image file \"%s\": %s", imageName, err)
-			continue
-		}
-
-		_, _ = io.Copy(imageFile, response.Body)
-
-		response.Body.Close()
-		imageFile.Close()
-
-		logger.Info("Outputted \"%s\"", imageName)
-		w.stats.MatchesFound++
-	}
-}
-
 func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
 	if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
 		var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String()))
@ -227,7 +185,45 @@ func (w *Worker) Work() {
 			// find image URLs, output images to the file while not saving already outputted ones
 			imageLinks := web.FindPageImages(pageData, parsedURL.Host)

-			w.outputImages(parsedURL, imageLinks)
+			var alreadyProcessedImgUrls []string
+			for count, imageLink := range imageLinks {
+				// check if this URL has been processed already
+				var skipImage bool = false
+				for _, processedURL := range alreadyProcessedImgUrls {
+					if imageLink == processedURL {
+						skipImage = true
+						break
+					}
+				}
+				if skipImage {
+					skipImage = false
+					continue
+				} else {
+					alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
+				}
+
+				var imageName string = fmt.Sprintf("%s_%d_%s", parsedURL.Host, count, path.Base(imageLink))
+
+				response, err := http.Get(imageLink)
+				if err != nil {
+					logger.Error("Failed to get image %s", imageLink)
+					continue
+				}
+
+				imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
+				if err != nil {
+					logger.Error("Failed to create image file \"%s\": %s", imageName, err)
+					continue
+				}
+
+				_, _ = io.Copy(imageFile, response.Body)
+
+				response.Body.Close()
+				imageFile.Close()
+
+				logger.Info("Outputted \"%s\"", imageName)
+				w.stats.MatchesFound++
+			}

 			if len(imageLinks) > 0 {
 				savePage = true