diff --git a/.gitignore b/.gitignore
index 94e46dc..1cec9a4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,5 @@ output.json
 websurf
 conf_mega_ita.json
 wecr
-release/
\ No newline at end of file
+release/
+scraped/
\ No newline at end of file
diff --git a/README.md b/README.md
index 8fade5b..247c75d 100644
--- a/README.md
+++ b/README.md
@@ -10,13 +10,13 @@ The flow of work fully depends on the configuration file. By default `conf.json`
 
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them.
 
-The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains`. If all initial pages are happen to be blacklisted - the program will end.
+The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains` or are NOT in `allowed_domains`. If all initial pages happen to be on blacklisted domains or are not in the allowed list - the program will get stuck.
 
 ### Search query
 
 if `is_regexp` is `false`, then `query` is the text to be searched for, but there are some special values:
 
-- `links` - tells `webscrape` to search for all links there are on the page
+- `links` - tells `wecr` to search for all links there are on the page
 - `images` - find all image links and output to the `output_dir` (**IMPORTANT**: set `wait_timeout_ms` to `0` so the images load fully)
 
 When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
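For reference, a minimal Go sketch (not part of the patch) of the allow/blacklist rule the updated README text describes: a page is skipped if its host is in `blacklisted_domains`, or if `allowed_domains` is non-empty and the host is not listed there. The helper name and signature are illustrative and do not come from the wecr source.

```go
package main

import "fmt"

// domainPermitted sketches the rule described in the README: reject hosts in
// the blacklist, and when an allow-list is configured, reject anything not on it.
// (Hypothetical helper, not wecr's actual code.)
func domainPermitted(host string, allowed, blacklisted []string) bool {
	for _, b := range blacklisted {
		if host == b {
			return false
		}
	}
	if len(allowed) == 0 {
		return true // no allow-list configured: everything not blacklisted passes
	}
	for _, a := range allowed {
		if host == a {
			return true
		}
	}
	return false
}

func main() {
	allowed := []string{"example.org"}
	blacklisted := []string{"ads.example.org"}
	fmt.Println(domainPermitted("example.org", allowed, blacklisted))     // true
	fmt.Println(domainPermitted("ads.example.org", allowed, blacklisted)) // false: blacklisted
	fmt.Println(domainPermitted("example.com", allowed, blacklisted))     // false: not in allowed_domains
}
```

This also illustrates why the program can stall: if every initial page fails this check, no jobs are ever produced.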
diff --git a/src/worker/worker.go b/src/worker/worker.go
index a129f0c..3e5961c 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -60,48 +60,6 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
 	}
 }
 
-func (w *Worker) outputImages(baseURL *url.URL, imageLinks []string) {
-	var alreadyProcessedImgUrls []string
-	for count, imageLink := range imageLinks {
-		// check if this URL has been processed already
-		var skipImage bool = false
-		for _, processedURL := range alreadyProcessedImgUrls {
-			if imageLink == processedURL {
-				skipImage = true
-				break
-			}
-		}
-		if skipImage {
-			skipImage = false
-			continue
-		} else {
-			alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
-		}
-
-		var imageName string = fmt.Sprintf("%s_%d_%s", baseURL.Host, count, path.Base(imageLink))
-
-		response, err := http.Get(imageLink)
-		if err != nil {
-			logger.Error("Failed to get image %s", imageLink)
-			continue
-		}
-
-		imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
-		if err != nil {
-			logger.Error("Failed to create image file \"%s\": %s", imageName, err)
-			continue
-		}
-
-		_, _ = io.Copy(imageFile, response.Body)
-
-		response.Body.Close()
-		imageFile.Close()
-
-		logger.Info("Outputted \"%s\"", imageName)
-		w.stats.MatchesFound++
-	}
-}
-
 func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
 	if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
 		var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String()))
@@ -227,7 +185,45 @@ func (w *Worker) Work() {
 
 			// find image URLs, output images to the file while not saving already outputted ones
 			imageLinks := web.FindPageImages(pageData, parsedURL.Host)
-			w.outputImages(parsedURL, imageLinks)
+			var alreadyProcessedImgUrls []string
+			for count, imageLink := range imageLinks {
+				// check if this URL has been processed already
+				var skipImage bool = false
+				for _, processedURL := range alreadyProcessedImgUrls {
+					if imageLink == processedURL {
+						skipImage = true
+						break
+					}
+				}
+				if skipImage {
+					skipImage = false
+					continue
+				} else {
+					alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
+				}
+
+				var imageName string = fmt.Sprintf("%s_%d_%s", parsedURL.Host, count, path.Base(imageLink))
+
+				response, err := http.Get(imageLink)
+				if err != nil {
+					logger.Error("Failed to get image %s", imageLink)
+					continue
+				}
+
+				imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
+				if err != nil {
+					logger.Error("Failed to create image file \"%s\": %s", imageName, err)
+					continue
+				}
+
+				_, _ = io.Copy(imageFile, response.Body)
+
+				response.Body.Close()
+				imageFile.Close()
+
+				logger.Info("Outputted \"%s\"", imageName)
+				w.stats.MatchesFound++
+			}
 
 			if len(imageLinks) > 0 {
 				savePage = true
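For reference, a minimal standalone Go sketch (not part of the patch) of the URL deduplication that the inlined loop performs with a nested slice scan, here written with a map used as a set so each link is checked in constant time; the helper name is illustrative and does not exist in worker.go.

```go
package main

import "fmt"

// dedupeLinks keeps the first occurrence of each link, mirroring what the
// inlined alreadyProcessedImgUrls scan does, but in O(n) via a map-as-set.
// (Illustrative helper, not taken from worker.go.)
func dedupeLinks(links []string) []string {
	seen := make(map[string]struct{}, len(links))
	unique := make([]string, 0, len(links))
	for _, link := range links {
		if _, ok := seen[link]; ok {
			continue // already processed this URL
		}
		seen[link] = struct{}{}
		unique = append(unique, link)
	}
	return unique
}

func main() {
	imageLinks := []string{
		"https://example.org/a.png",
		"https://example.org/b.png",
		"https://example.org/a.png", // duplicate
	}
	fmt.Println(dedupeLinks(imageLinks)) // [https://example.org/a.png https://example.org/b.png]
}
```

Whether the quadratic scan in the patch matters in practice depends on how many image links a single page typically yields; for small pages both approaches behave the same.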