
Corrected README text

Branch: master (tag v0.1.2)
Parent commit: 31f4b9e43e
Changed files:
1. .gitignore (3 changed lines)
2. README.md (4 changed lines)
3. src/worker/worker.go (82 changed lines)

.gitignore (vendored): 3 changed lines

@@ -5,4 +5,5 @@ output.json
 websurf
 conf_mega_ita.json
 wecr
-release/
+release/
+scraped/

README.md: 4 changed lines

@@ -10,13 +10,13 @@ The flow of work fully depends on the configuration file. By default `conf.json`
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them.
-The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains`. If all initial pages are happen to be blacklisted - the program will end.
+The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains` or are NOT in `allowed_domains`. If all initial pages are happen to be on blacklisted domains or are not in the allowed list - the program will get stuck.
 ### Search query
 if `is_regexp` is `false`, then `query` is the text to be searched for, but there are some special values:
-- `links` - tells `webscrape` to search for all links there are on the page
+- `links` - tells `wecr` to search for all links there are on the page
 - `images` - find all image links and output to the `output_dir` (**IMPORTANT**: set `wait_timeout_ms` to `0` so the images load fully)
 When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
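
To make the `is_regexp` behaviour described in the README hunk above concrete, here is a minimal sketch using Go's standard `regexp` package. It only illustrates the scanning step and is not wecr's actual code; the `query` pattern and `pageData` value are made-up placeholders standing in for the `query` field of conf.json (with `is_regexp` set to `true`) and a fetched page body.

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Hypothetical stand-ins: "query" plays the role of conf.json's `query`
	// field with `is_regexp` enabled; "pageData" plays the role of a fetched page body.
	query := `(?i)wikipedia\.org/wiki/[a-z_()]+`
	pageData := []byte(`<a href="https://en.wikipedia.org/wiki/Web_crawler">crawler</a>`)

	re, err := regexp.Compile(query)
	if err != nil {
		fmt.Println("invalid regexp in query:", err)
		return
	}

	// Scan the page for every match that satisfies the expression,
	// which is what the README describes for the is_regexp mode.
	for _, match := range re.FindAll(pageData, -1) {
		fmt.Printf("match: %s\n", match)
	}
}
```

With `is_regexp` set to `false`, the same `query` would instead be looked up as plain text.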

src/worker/worker.go: 82 changed lines

@@ -60,48 +60,6 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
 	}
 }
 
-func (w *Worker) outputImages(baseURL *url.URL, imageLinks []string) {
-	var alreadyProcessedImgUrls []string
-	for count, imageLink := range imageLinks {
-		// check if this URL has been processed already
-		var skipImage bool = false
-		for _, processedURL := range alreadyProcessedImgUrls {
-			if imageLink == processedURL {
-				skipImage = true
-				break
-			}
-		}
-		if skipImage {
-			skipImage = false
-			continue
-		} else {
-			alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
-		}
-
-		var imageName string = fmt.Sprintf("%s_%d_%s", baseURL.Host, count, path.Base(imageLink))
-
-		response, err := http.Get(imageLink)
-		if err != nil {
-			logger.Error("Failed to get image %s", imageLink)
-			continue
-		}
-
-		imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
-		if err != nil {
-			logger.Error("Failed to create image file \"%s\": %s", imageName, err)
-			continue
-		}
-
-		_, _ = io.Copy(imageFile, response.Body)
-
-		response.Body.Close()
-		imageFile.Close()
-
-		logger.Info("Outputted \"%s\"", imageName)
-		w.stats.MatchesFound++
-	}
-}
-
 func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
 	if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
 		var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String()))
@@ -227,7 +185,45 @@ func (w *Worker) Work()
 	// find image URLs, output images to the file while not saving already outputted ones
 	imageLinks := web.FindPageImages(pageData, parsedURL.Host)
-	w.outputImages(parsedURL, imageLinks)
+	var alreadyProcessedImgUrls []string
+	for count, imageLink := range imageLinks {
+		// check if this URL has been processed already
+		var skipImage bool = false
+		for _, processedURL := range alreadyProcessedImgUrls {
+			if imageLink == processedURL {
+				skipImage = true
+				break
+			}
+		}
+		if skipImage {
+			skipImage = false
+			continue
+		} else {
+			alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
+		}
+
+		var imageName string = fmt.Sprintf("%s_%d_%s", parsedURL.Host, count, path.Base(imageLink))
+
+		response, err := http.Get(imageLink)
+		if err != nil {
+			logger.Error("Failed to get image %s", imageLink)
+			continue
+		}
+
+		imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
+		if err != nil {
+			logger.Error("Failed to create image file \"%s\": %s", imageName, err)
+			continue
+		}
+
+		_, _ = io.Copy(imageFile, response.Body)
+
+		response.Body.Close()
+		imageFile.Close()
+
+		logger.Info("Outputted \"%s\"", imageName)
+		w.stats.MatchesFound++
+	}
 	if len(imageLinks) > 0 {
 		savePage = true
