Removed links search; Added email search

2 years ago · d81f732b82
9 changed files with 107 additions and 48 deletions
--- a/README.md
+++ b/README.md
@ -16,10 +16,11 @@ The parsing starts from `initial_pages` and goes deeper while ignoring the pages
 There are some special `query` values:
- `links` - tells `wecr` to search for all links there are on the page
+- `email` - tells `wecr` to scrape email addresses and output them to `output_file`
 - `images` - find all images on pages and output to the corresponding directory in `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) load fully)
 - `videos` - find and fetch files that look like videos
 - `audio` - find and fetch files that look like audio
 - `everything` - find and fetch images, audio and video
 When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
@ -29,10 +30,10 @@ By default, if the query is not something of special values all the matches and
 ## TODO
- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - [x]
+- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - Done
- Search for videos - [x]
+- Search for videos - Done
- Search for audio - [x]
+- Search for audio - Done
- Search for documents - []
+- Search for documents
 ## License
 AGPLv3
--- a/src/config/config.go
+++ b/src/config/config.go
@ -25,10 +25,11 @@ import (
 )
 // Special values of the search query. Each one selects a built-in search mode
 // rather than a plain-text or regexp match.
 const (
 	// QueryLinks selects the search that outputs the links found on a page.
 	QueryLinks  string = "links"
 	// QueryImages selects the search that finds and saves images.
 	QueryImages     string = "images"
 	// QueryVideos selects the search that finds and saves video files.
 	QueryVideos     string = "videos"
 	// QueryAudio selects the search that finds and saves audio files.
 	QueryAudio      string = "audio"
 	// QueryEmail selects the search that collects email addresses.
 	QueryEmail      string = "email"
 	// QueryEverything selects images, videos, audio and email addresses at once.
 	QueryEverything string = "everything"
 )
 const (
@ -45,7 +46,7 @@ type Search struct {
 type Save struct {
 	OutputDir  string `json:"output_dir"`
-	OutputFile string `json:"save_file"`
+	OutputFile string `json:"output_file"`
 	SavePages  bool   `json:"save_pages"`
 }
--- a/src/main.go
+++ b/src/main.go
@ -36,7 +36,7 @@ import (
 	"unbewohnte/wecr/worker"
 )
-const version = "v0.2.0"
+const version = "v0.2.1"
 const (
 	defaultConfigFile string = "conf.json"
@ -275,14 +275,16 @@ func main() {
 	}
 	switch conf.Search.Query {
-	case config.QueryLinks:
+	case config.QueryEmail:
-		logger.Info("Looking for links")
+		logger.Info("Looking for emails")
 	case config.QueryImages:
 		logger.Info("Looking for images (%+s)", web.ImageExtentions)
 	case config.QueryVideos:
 		logger.Info("Looking for videos (%+s)", web.VideoExtentions)
 	case config.QueryAudio:
 		logger.Info("Looking for audio (%+s)", web.AudioExtentions)
 	case config.QueryEverything:
 		logger.Info("Looking for emails, images, videos and audio (%+s - %+s - %+s)", web.ImageExtentions, web.VideoExtentions, web.AudioExtentions)
 	default:
 		if conf.Search.IsRegexp {
 			logger.Info("Looking for RegExp matches (%s)", conf.Search.Query)
@ -359,7 +361,7 @@ func main() {
 		}()
 	}
-	// get results and write them to the output file
+	// get text results and write them to the output file (files are handled by each worker separately)
 	for {
 		result, ok := <-results
 		if !ok {
--- a/src/web/audio.go
+++ b/src/web/audio.go
@ -23,7 +23,7 @@ import (
 	"strings"
 )
-func hasAudioExtention(url string) bool {
+func HasAudioExtention(url string) bool {
 	for _, extention := range AudioExtentions {
 		if strings.HasSuffix(url, extention) {
 			return true
@ -70,7 +70,7 @@ func FindPageAudio(pageBody []byte, from *url.URL) []string {
 		}
 		linkResolved := ResolveLink(link, from.Host)
-		if hasAudioExtention(linkResolved) {
+		if HasAudioExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
@ -108,7 +108,7 @@ func FindPageAudio(pageBody []byte, from *url.URL) []string {
 		}
 		linkResolved := ResolveLink(link, from.Host)
-		if hasAudioExtention(linkResolved) {
+		if HasAudioExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
--- a/src/web/extentions.go
+++ b/src/web/extentions.go
@ -82,7 +82,3 @@ var VideoExtentions = []string{
 	".vob",
 	".ogv",
 }
 // DocumentExtentions currently holds a single empty string; presumably it is a
 // placeholder for a future list of document file extensions — TODO confirm.
 // NOTE(review): an empty suffix satisfies strings.HasSuffix for every string,
 // so using this list the way the other extension lists are used would accept
 // every URL.
 var DocumentExtentions = []string{
 	"",
 }
--- a/src/web/images.go
+++ b/src/web/images.go
@ -23,7 +23,7 @@ import (
 	"strings"
 )
-func hasImageExtention(url string) bool {
+func HasImageExtention(url string) bool {
 	for _, extention := range ImageExtentions {
 		if strings.HasSuffix(url, extention) {
 			return true
@ -70,7 +70,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
 		}
 		linkResolved := ResolveLink(link, from.Host)
-		if hasImageExtention(linkResolved) {
+		if HasImageExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
@ -108,7 +108,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
 		}
 		linkResolved := ResolveLink(link, from.Host)
-		if hasImageExtention(linkResolved) {
+		if HasImageExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
--- a/src/web/text.go
+++ b/src/web/text.go
@ -21,6 +21,7 @@ package web
 import (
 	"bufio"
 	"bytes"
 	"net/mail"
 	"net/url"
 	"regexp"
 	"strings"
@ -32,6 +33,8 @@ var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|'
 // tagSrcRegexp matches src="link" or even something along the lines of SrC    =  'link'.
 // The match is case-insensitive and allows whitespace around "="; the third
 // capture group holds the (lazily matched) text between the quotes.
 // NOTE(review): the opening and closing quote are matched independently, so
 // a value opened with " may be closed by ' and vice versa.
 var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)
 // emailRegexp finds candidate email addresses in free text: a local part, "@",
 // a domain and a top-level domain of at least two letters.
 //
 // Matching is case-insensitive so addresses such as John@Example.COM are
 // found, and the top-level domain length is not capped so long TLDs
 // (e.g. ".online") are matched whole instead of being cut off after four
 // letters, which would yield a different, wrong address.
 var emailRegexp *regexp.Regexp = regexp.MustCompile(`(?i)[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}`)
 // Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
 func ResolveLink(url *url.URL, fromHost string) string {
 	if !url.IsAbs() {
@ -115,3 +118,33 @@ func IsTextOnPage(text string, ignoreCase bool, pageBody []byte) bool {
 // FindPageRegexp returns every non-overlapping match of re found in pageBody,
 // in the order the matches occur. The result is nil when nothing matches.
 func FindPageRegexp(re *regexp.Regexp, pageBody []byte) []string {
 	const noLimit = -1 // a negative count asks for all matches
 	pageText := string(pageBody)
 	matches := re.FindAllString(pageText, noLimit)
 	return matches
 }
 // FindPageEmails extracts the distinct email addresses found in pageBody.
 //
 // Candidates are located with emailRegexp and then validated with
 // mail.ParseAddress; candidates that fail to parse are dropped. Each address
 // is reported once, in order of first appearance. The result is nil when no
 // address is found.
 func FindPageEmails(pageBody []byte) []string {
 	var emailAddresses []string
 	// seen records the addresses already collected so a duplicate is rejected
 	// with one map lookup instead of rescanning the result slice.
 	seen := make(map[string]struct{})
 	for _, email := range emailRegexp.FindAllString(string(pageBody), -1) {
 		if _, duplicate := seen[email]; duplicate {
 			continue
 		}
 		if _, err := mail.ParseAddress(email); err != nil {
 			// not a well-formed address; ignore this candidate
 			continue
 		}
 		seen[email] = struct{}{}
 		emailAddresses = append(emailAddresses, email)
 	}
 	return emailAddresses
 }
--- a/src/web/videos.go
+++ b/src/web/videos.go
@ -23,7 +23,7 @@ import (
 	"strings"
 )
-func hasVideoExtention(url string) bool {
+func HasVideoExtention(url string) bool {
 	for _, extention := range VideoExtentions {
 		if strings.HasSuffix(url, extention) {
 			return true
@ -70,7 +70,7 @@ func FindPageVideos(pageBody []byte, from *url.URL) []string {
 		}
 		linkResolved := ResolveLink(link, from.Host)
-		if hasVideoExtention(linkResolved) {
+		if HasVideoExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
@ -108,7 +108,7 @@ func FindPageVideos(pageBody []byte, from *url.URL) []string {
 		}
 		linkResolved := ResolveLink(link, from.Host)
-		if hasVideoExtention(linkResolved) {
+		if HasVideoExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@ -61,7 +61,7 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
 	}
 }
-func (w *Worker) saveContent(contenType string, links []string, pageURL *url.URL) {
+func (w *Worker) saveContent(links []string, pageURL *url.URL) {
 	var alreadyProcessedUrls []string
 	for count, link := range links {
 		// check if this URL has been processed already
@ -83,14 +83,13 @@ func (w *Worker) saveContent(contenType string, links []string, pageURL *url.URL
 		var fileName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(link))
 		var filePath string
-		switch contenType {
+		if web.HasImageExtention(link) {
 		case config.QueryImages:
 			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveImagesDir, fileName)
-		case config.QueryVideos:
+		} else if web.HasVideoExtention(link) {
 			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName)
-		case config.QueryAudio:
+		} else if web.HasAudioExtention(link) {
 			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName)
-		default:
+		} else {
 			filePath = filepath.Join(w.Conf.Save.OutputDir, fileName)
 		}
@ -229,22 +228,10 @@ func (w *Worker) Work() {
 		var savePage bool = false
 		switch job.Search.Query {
 		case config.QueryLinks:
 			// simply output links
 			if len(pageLinks) > 0 {
 				w.Results <- web.Result{
 					PageURL: job.URL,
 					Search:  job.Search,
 					Data:    pageLinks,
 				}
 				w.stats.MatchesFound += uint64(len(pageLinks))
 				savePage = true
 			}
 		case config.QueryImages:
 			// find image URLs, output images to the file while not saving already outputted ones
 			imageLinks := web.FindPageImages(pageData, pageURL)
-			w.saveContent(config.QueryImages, imageLinks, pageURL)
+			w.saveContent(imageLinks, pageURL)
 			if len(imageLinks) > 0 {
 				savePage = true
 			}
@ -253,7 +240,7 @@ func (w *Worker) Work() {
 			// search for videos
 			// find video URLs, output videos to the files while not saving already outputted ones
 			videoLinks := web.FindPageVideos(pageData, pageURL)
-			w.saveContent(config.QueryVideos, videoLinks, pageURL)
+			w.saveContent(videoLinks, pageURL)
 			if len(videoLinks) > 0 {
 				savePage = true
 			}
@ -262,11 +249,50 @@ func (w *Worker) Work() {
 			// search for audio
 			// find audio URLs, output audio to the file while not saving already outputted ones
 			audioLinks := web.FindPageAudio(pageData, pageURL)
-			w.saveContent(config.QueryAudio, audioLinks, pageURL)
+			w.saveContent(audioLinks, pageURL)
 			if len(audioLinks) > 0 {
 				savePage = true
 			}
 		case config.QueryEmail:
 			// search for email
 			emailAddresses := web.FindPageEmails(pageData)
 			if len(emailAddresses) > 0 {
 				w.Results <- web.Result{
 					PageURL: job.URL,
 					Search:  job.Search,
 					Data:    emailAddresses,
 				}
 				w.stats.MatchesFound += uint64(len(emailAddresses))
 				savePage = true
 			}
 		case config.QueryEverything:
 			// search for everything
 			// files
 			var contentLinks []string
 			contentLinks = append(contentLinks, web.FindPageImages(pageData, pageURL)...)
 			contentLinks = append(contentLinks, web.FindPageAudio(pageData, pageURL)...)
 			contentLinks = append(contentLinks, web.FindPageVideos(pageData, pageURL)...)
 			w.saveContent(contentLinks, pageURL)
 			// email
 			emailAddresses := web.FindPageEmails(pageData)
 			if len(emailAddresses) > 0 {
 				w.Results <- web.Result{
 					PageURL: job.URL,
 					Search:  job.Search,
 					Data:    emailAddresses,
 				}
 				w.stats.MatchesFound += uint64(len(emailAddresses))
 				savePage = true
 			}
 			if len(contentLinks) > 0 || len(emailAddresses) > 0 {
 				savePage = true
 			}
 		default:
 			// text search
 			switch job.Search.IsRegexp {