Removed links search; Added email search

2 years ago · d81f732b82
9 changed files with 107 additions and 48 deletions
--- a/README.md
+++ b/README.md
@ -16,10 +16,11 @@ The parsing starts from `initial_pages` and goes deeper while ignoring the pages

 There are some special `query` values:

- `links` - tells `wecr` to search for all links there are on the page
+- `email` - tells `wecr` to scrape email addresses and output them to `output_file`
 - `images` - find all images on pages and output to the corresponding directory in `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) load fully)
 - `videos` - find and fetch files that look like videos
 - `audio` - find and fetch files that look like audio
+- `everything` - find and fetch images, audio and video, and also scrape email addresses (output to `output_file`)

 When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.

@ -29,10 +30,10 @@ By default, if the query is not something of special values all the matches and

 ## TODO

- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - [x]
- Search for videos - [x]
- Search for audio - [x]
- Search for documents - []
+- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - Done
+- Search for videos - Done
+- Search for audio - Done
+- Search for documents

 ## License
 AGPLv3
--- a/src/config/config.go
+++ b/src/config/config.go
@ -25,10 +25,11 @@ import (
 )

 const (
-	QueryLinks  string = "links"
-	QueryImages string = "images"
-	QueryVideos string = "videos"
-	QueryAudio  string = "audio"
+	QueryImages     string = "images" // special query: find images on pages and save them as files
+	QueryVideos     string = "videos" // special query: find and fetch files that look like videos
+	QueryAudio      string = "audio" // special query: find and fetch files that look like audio
+	QueryEmail      string = "email" // special query: scrape email addresses and report them as text results
+	QueryEverything string = "everything" // special query: images, audio and video files plus email addresses
 )

 const (
@ -45,7 +46,7 @@ type Search struct {

 type Save struct {
 	OutputDir  string `json:"output_dir"`
-	OutputFile string `json:"save_file"`
+	OutputFile string `json:"output_file"` // JSON key is "output_file" (previously "save_file"); configs using the old key no longer populate this field
 	SavePages  bool   `json:"save_pages"`
 }

--- a/src/main.go
+++ b/src/main.go
@ -36,7 +36,7 @@ import (
 	"unbewohnte/wecr/worker"
 )

-const version = "v0.2.0"
+const version = "v0.2.1"

 const (
 	defaultConfigFile string = "conf.json"
@ -275,14 +275,16 @@ func main() {
 	}

 	switch conf.Search.Query {
-	case config.QueryLinks:
-		logger.Info("Looking for links")
+	case config.QueryEmail:
+		logger.Info("Looking for emails")
 	case config.QueryImages:
 		logger.Info("Looking for images (%+s)", web.ImageExtentions)
 	case config.QueryVideos:
 		logger.Info("Looking for videos (%+s)", web.VideoExtentions)
 	case config.QueryAudio:
 		logger.Info("Looking for audio (%+s)", web.AudioExtentions)
+	case config.QueryEverything:
+		logger.Info("Looking for emails, images, videos and audio (%+s - %+s - %+s)", web.ImageExtentions, web.VideoExtentions, web.AudioExtentions)
 	default:
 		if conf.Search.IsRegexp {
 			logger.Info("Looking for RegExp matches (%s)", conf.Search.Query)
@ -359,7 +361,7 @@ func main() {
 		}()
 	}

-	// get results and write them to the output file
+	// get text results and write them to the output file (files are handled by each worker separately)
 	for {
 		result, ok := <-results
 		if !ok {
--- a/src/web/audio.go
+++ b/src/web/audio.go
@ -23,7 +23,7 @@ import (
 	"strings"
 )

-func hasAudioExtention(url string) bool {
+func HasAudioExtention(url string) bool {
 	for _, extention := range AudioExtentions {
 		if strings.HasSuffix(url, extention) {
 			return true
@ -70,7 +70,7 @@ func FindPageAudio(pageBody []byte, from *url.URL) []string {
 		}

 		linkResolved := ResolveLink(link, from.Host)
-		if hasAudioExtention(linkResolved) {
+		if HasAudioExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
@ -108,7 +108,7 @@ func FindPageAudio(pageBody []byte, from *url.URL) []string {
 		}

 		linkResolved := ResolveLink(link, from.Host)
-		if hasAudioExtention(linkResolved) {
+		if HasAudioExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
--- a/src/web/extentions.go
+++ b/src/web/extentions.go
@ -82,7 +82,3 @@ var VideoExtentions = []string{
 	".vob",
 	".ogv",
 }
-
-var DocumentExtentions = []string{
-	"",
-}
--- a/src/web/images.go
+++ b/src/web/images.go
@ -23,7 +23,7 @@ import (
 	"strings"
 )

-func hasImageExtention(url string) bool {
+func HasImageExtention(url string) bool {
 	for _, extention := range ImageExtentions {
 		if strings.HasSuffix(url, extention) {
 			return true
@ -70,7 +70,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
 		}

 		linkResolved := ResolveLink(link, from.Host)
-		if hasImageExtention(linkResolved) {
+		if HasImageExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
@ -108,7 +108,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
 		}

 		linkResolved := ResolveLink(link, from.Host)
-		if hasImageExtention(linkResolved) {
+		if HasImageExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
--- a/src/web/text.go
+++ b/src/web/text.go
@ -21,6 +21,7 @@ package web
 import (
 	"bufio"
 	"bytes"
+	"net/mail"
 	"net/url"
 	"regexp"
 	"strings"
@ -32,6 +33,8 @@ var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|'
 // matches src="link" or even something along the lines of SrC    =  'link'
 var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)

+var emailRegexp *regexp.Regexp = regexp.MustCompile(`[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}`) // candidate email matcher. NOTE(review): there is no (?i) flag, so uppercase letters are never matched, and the TLD part is capped at 4 letters, so a longer TLD is matched only up to its 4th letter (".museum" yields ".muse") — confirm this is intended
+
 // Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
 func ResolveLink(url *url.URL, fromHost string) string {
 	if !url.IsAbs() {
@ -115,3 +118,33 @@ func IsTextOnPage(text string, ignoreCase bool, pageBody []byte) bool {
 func FindPageRegexp(re *regexp.Regexp, pageBody []byte) []string {
 	return re.FindAllString(string(pageBody), -1)
 }
+
+// FindPageEmails returns the distinct email addresses found in pageBody, in order of first appearance: each emailRegexp match that mail.ParseAddress accepts is kept once. The result is nil when nothing qualifies.
+func FindPageEmails(pageBody []byte) []string {
+	var emailAddresses []string
+
+	var skip bool // set when the current match is already present in emailAddresses
+	for _, email := range emailRegexp.FindAllString(string(pageBody), -1) {
+		skip = false
+
+		_, err := mail.ParseAddress(email) // second validation pass; only the error is used, the parsed address is discarded
+		if err != nil {
+			continue // the regexp matched, but net/mail rejected the string
+		}
+
+		for _, visitedEmail := range emailAddresses { // linear scan over the addresses kept so far
+			if email == visitedEmail { // exact, case-sensitive comparison
+				skip = true
+				break
+			}
+		}
+
+		if skip {
+			continue // duplicate of an address already collected from this page
+		}
+
+		emailAddresses = append(emailAddresses, email)
+	}
+
+	return emailAddresses
+}
--- a/src/web/videos.go
+++ b/src/web/videos.go
@ -23,7 +23,7 @@ import (
 	"strings"
 )

-func hasVideoExtention(url string) bool {
+func HasVideoExtention(url string) bool {
 	for _, extention := range VideoExtentions {
 		if strings.HasSuffix(url, extention) {
 			return true
@ -70,7 +70,7 @@ func FindPageVideos(pageBody []byte, from *url.URL) []string {
 		}

 		linkResolved := ResolveLink(link, from.Host)
-		if hasVideoExtention(linkResolved) {
+		if HasVideoExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
@ -108,7 +108,7 @@ func FindPageVideos(pageBody []byte, from *url.URL) []string {
 		}

 		linkResolved := ResolveLink(link, from.Host)
-		if hasVideoExtention(linkResolved) {
+		if HasVideoExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@ -61,7 +61,7 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
 	}
 }

-func (w *Worker) saveContent(contenType string, links []string, pageURL *url.URL) {
+func (w *Worker) saveContent(links []string, pageURL *url.URL) {
 	var alreadyProcessedUrls []string
 	for count, link := range links {
 		// check if this URL has been processed already
@ -83,14 +83,13 @@ func (w *Worker) saveContent(contenType string, links []string, pageURL *url.URL
 		var fileName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(link))

 		var filePath string
-		switch contenType {
-		case config.QueryImages:
+		if web.HasImageExtention(link) {
 			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveImagesDir, fileName)
-		case config.QueryVideos:
+		} else if web.HasVideoExtention(link) {
 			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName)
-		case config.QueryAudio:
+		} else if web.HasAudioExtention(link) {
 			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName)
-		default:
+		} else {
 			filePath = filepath.Join(w.Conf.Save.OutputDir, fileName)
 		}

@ -229,22 +228,10 @@ func (w *Worker) Work() {
 		var savePage bool = false

 		switch job.Search.Query {
-		case config.QueryLinks:
-			// simply output links
-			if len(pageLinks) > 0 {
-				w.Results <- web.Result{
-					PageURL: job.URL,
-					Search:  job.Search,
-					Data:    pageLinks,
-				}
-				w.stats.MatchesFound += uint64(len(pageLinks))
-				savePage = true
-			}
-
 		case config.QueryImages:
 			// find image URLs, output images to the file while not saving already outputted ones
 			imageLinks := web.FindPageImages(pageData, pageURL)
-			w.saveContent(config.QueryImages, imageLinks, pageURL)
+			w.saveContent(imageLinks, pageURL)
 			if len(imageLinks) > 0 {
 				savePage = true
 			}
@ -253,7 +240,7 @@ func (w *Worker) Work() {
 			// search for videos
 			// find video URLs, output videos to the files while not saving already outputted ones
 			videoLinks := web.FindPageVideos(pageData, pageURL)
-			w.saveContent(config.QueryVideos, videoLinks, pageURL)
+			w.saveContent(videoLinks, pageURL)
 			if len(videoLinks) > 0 {
 				savePage = true
 			}
@ -262,11 +249,50 @@ func (w *Worker) Work() {
 			// search for audio
 			// find audio URLs, output audio to the file while not saving already outputted ones
 			audioLinks := web.FindPageAudio(pageData, pageURL)
-			w.saveContent(config.QueryAudio, audioLinks, pageURL)
+			w.saveContent(audioLinks, pageURL)
 			if len(audioLinks) > 0 {
 				savePage = true
 			}

+		case config.QueryEmail:
+			// search for email
+			emailAddresses := web.FindPageEmails(pageData)
+			if len(emailAddresses) > 0 {
+				w.Results <- web.Result{
+					PageURL: job.URL,
+					Search:  job.Search,
+					Data:    emailAddresses,
+				}
+				w.stats.MatchesFound += uint64(len(emailAddresses))
+				savePage = true
+			}
+
+		case config.QueryEverything:
+			// search for everything
+
+			// files
+			var contentLinks []string
+			contentLinks = append(contentLinks, web.FindPageImages(pageData, pageURL)...)
+			contentLinks = append(contentLinks, web.FindPageAudio(pageData, pageURL)...)
+			contentLinks = append(contentLinks, web.FindPageVideos(pageData, pageURL)...)
+			w.saveContent(contentLinks, pageURL)
+
+			// email
+			emailAddresses := web.FindPageEmails(pageData)
+			if len(emailAddresses) > 0 {
+				w.Results <- web.Result{
+					PageURL: job.URL,
+					Search:  job.Search,
+					Data:    emailAddresses,
+				}
+				w.stats.MatchesFound += uint64(len(emailAddresses))
+				savePage = true
+			}
+
+			if len(contentLinks) > 0 || len(emailAddresses) > 0 {
+				savePage = true
+			}
+
 		default:
 			// text search
 			switch job.Search.IsRegexp {