Browse Source

Do not output the same images of the same page

Branch: master
Commit: f2c77e787e
Changed files:

1. README.md (2 changes)
2. src/web/requests.go (4 changes)
3. src/worker/worker.go (100 changes)

README.md

@@ -17,7 +17,7 @@ The parsing starts from `initial_pages` and goes deeper while ignoring the pages
 if `is_regexp` is `false`, then `query` is the text to be searched for, but there are some special values:
 - `links` - tells `webscrape` to search for all links there are on the page
-- `images` - find all image links and output to the `output_dir`
+- `images` - find all image links and output to the `output_dir` (**IMPORTANT**: set `wait_timeout_ms` to `0` so the images load fully)
 When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
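Note: as a rough illustration of the two query modes described above (a sketch only, not the project's actual matching code; `matches` is a hypothetical helper), a plain-text query behaves like a substring search, while `is_regexp` switches to pattern matching:

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// matches reports whether query matches pageText: plain substring search when
// isRegexp is false, regexp matching when it is true. This only mirrors the
// README description; the real webscrape implementation may differ.
func matches(pageText, query string, isRegexp bool) (bool, error) {
	if !isRegexp {
		return strings.Contains(pageText, query), nil
	}
	re, err := regexp.Compile(query)
	if err != nil {
		return false, err
	}
	return re.MatchString(pageText), nil
}

func main() {
	fmt.Println(matches("<a href=\"/about\">About</a>", "About", false)) // true <nil>
	fmt.Println(matches("price: 42 USD", `\d+ USD`, true))               // true <nil>
}
```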

src/web/requests.go

@@ -31,7 +31,9 @@ func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
 	}
 	req.Header.Set("User-Agent", userAgent)
-	http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
+	if timeOutMs != 0 {
+		http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
+	}
 	response, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return nil, err
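Note: in Go's `net/http`, a client `Timeout` of zero means no timeout, so the new `timeOutMs != 0` guard leaves `http.DefaultClient` without a deadline when `wait_timeout_ms` is `0`; since the image downloads in worker.go also go through the shared default client via `http.Get`, this is likely what lets large images finish downloading. Below is a minimal sketch of the same idea that avoids mutating the shared `http.DefaultClient` by using a per-call client (assumed names and package, not the project's code):

```go
package fetch

import (
	"io"
	"net/http"
	"time"
)

// getPage fetches a URL with an optional timeout. A timeOutMs of 0 leaves
// client.Timeout at its zero value, which net/http treats as "no timeout".
func getPage(url, userAgent string, timeOutMs uint64) ([]byte, error) {
	client := &http.Client{}
	if timeOutMs != 0 {
		client.Timeout = time.Duration(timeOutMs) * time.Millisecond
	}

	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", userAgent)

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	return io.ReadAll(resp.Body)
}
```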

src/worker/worker.go

@@ -59,6 +59,62 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
 	}
 }
+
+func (w *Worker) outputImages(baseURL *url.URL, imageLinks []string) {
+	var alreadyProcessedImgUrls []string
+	for count, imageLink := range imageLinks {
+		// check if this URL has been processed already
+		var skipImage bool = false
+		for _, processedURL := range alreadyProcessedImgUrls {
+			if imageLink == processedURL {
+				skipImage = true
+				break
+			}
+		}
+		if skipImage {
+			skipImage = false
+			continue
+		} else {
+			alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
+		}
+		var imageName string = fmt.Sprintf("%s_%d_%s", baseURL.Host, count, path.Base(imageLink))
+		response, err := http.Get(imageLink)
+		if err != nil {
+			logger.Error("Failed to get %s", imageLink)
+			continue
+		}
+		imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
+		if err != nil {
+			logger.Error("Failed to create image file \"%s\": %s", imageName, err)
+			continue
+		}
+		_, _ = io.Copy(imageFile, response.Body)
+		response.Body.Close()
+		imageFile.Close()
+		logger.Info("Outputted \"%s\"", imageName)
+		w.stats.MatchesFound++
+	}
+}
+
+func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
+	if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
+		var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String()))
+		pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, pageName))
+		if err != nil {
+			logger.Error("Failed to create page of \"%s\": %s", baseURL.String(), err)
+		} else {
+			pageFile.Write(pageData)
+		}
+		pageFile.Close()
+	}
+}
+
 func (w *Worker) Work() {
 	if w.Stopped {
 		return
@@ -153,32 +209,10 @@ func (w *Worker) Work() {
 				}
 			case config.QueryImages:
-				// find image URLs, output data to the file
+				// find image URLs, output images to the file while not saving already outputted ones
 				imageLinks := web.FindPageImages(pageData, parsedURL.Host)
-				for count, imageLink := range imageLinks {
-					var imageName string = fmt.Sprintf("%s_%d_%s", parsedURL.Host, count, path.Base(imageLink))
-					response, err := http.Get(imageLink)
-					if err != nil {
-						logger.Error("Failed to get %s", imageLink)
-						continue
-					}
-					imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
-					if err != nil {
-						logger.Error("Failed to create image file \"%s\": %s", imageName, err)
-						continue
-					}
-					_, _ = io.Copy(imageFile, response.Body)
-					response.Body.Close()
-					imageFile.Close()
-					logger.Info("Outputted \"%s\"", imageName)
-					w.stats.MatchesFound++
-				}
+				w.outputImages(parsedURL, imageLinks)
 				if len(imageLinks) > 0 {
 					savePage = true
@@ -223,22 +257,8 @@ func (w *Worker) Work() {
 			}
 			// save page
-			if savePage && w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
-				url, err := url.Parse(job.URL)
-				if err != nil {
-					logger.Error("Failed to parse \"%s\" to save page: %s", job.URL, err)
-					break
-				}
-				var pageName string = fmt.Sprintf("%s_%s.html", url.Host, path.Base(job.URL))
-				pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, pageName))
-				if err != nil {
-					logger.Error("Failed to create page of \"%s\": %s", job.URL, err)
-				} else {
-					pageFile.Write(pageData)
-				}
-				pageFile.Close()
+			if savePage {
+				w.savePage(parsedURL, pageData)
 			}
 			// sleep before the next request
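Note: the new `outputImages` skips repeated image URLs within a page by linearly scanning the `alreadyProcessedImgUrls` slice, which is quadratic in the number of links. A map used as a set is the more common Go idiom for this kind of dedup; a small sketch under that assumption (illustrative names, not the project's identifiers):

```go
package main

import "fmt"

// deduplicate returns imageLinks with same-page repeats removed, using a map
// as a set in place of the linear slice scan; membership tests stay O(1).
func deduplicate(imageLinks []string) []string {
	seen := make(map[string]struct{}, len(imageLinks))
	unique := make([]string, 0, len(imageLinks))
	for _, link := range imageLinks {
		if _, ok := seen[link]; ok {
			continue // already scheduled for download on this page
		}
		seen[link] = struct{}{}
		unique = append(unique, link)
	}
	return unique
}

func main() {
	links := []string{"https://a.example/1.png", "https://a.example/1.png", "https://a.example/2.png"}
	fmt.Println(deduplicate(links)) // [https://a.example/1.png https://a.example/2.png]
}
```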
