diff --git a/src/main.go b/src/main.go
index f2edd5e..7b718ca 100644
--- a/src/main.go
+++ b/src/main.go
@@ -276,7 +276,7 @@ func main() {
 
 	switch conf.Search.Query {
 	case config.QueryEmail:
-		logger.Info("Looking for emails")
+		logger.Info("Looking for email addresses")
 	case config.QueryImages:
 		logger.Info("Looking for images (%+s)", web.ImageExtentions)
 	case config.QueryVideos:
@@ -284,7 +284,7 @@ func main() {
 	case config.QueryAudio:
 		logger.Info("Looking for audio (%+s)", web.AudioExtentions)
 	case config.QueryEverything:
-		logger.Info("Looking for emails, images, videos and audio (%+s - %+s - %+s)", web.ImageExtentions, web.VideoExtentions, web.AudioExtentions)
+		logger.Info("Looking for email addresses, images, videos and audio (%+s - %+s - %+s)", web.ImageExtentions, web.VideoExtentions, web.AudioExtentions)
 	default:
 		if conf.Search.IsRegexp {
 			logger.Info("Looking for RegExp matches (%s)", conf.Search.Query)
diff --git a/src/web/text.go b/src/web/text.go
index f4c81dd..e52f58d 100644
--- a/src/web/text.go
+++ b/src/web/text.go
@@ -21,6 +21,7 @@ package web
 import (
 	"bufio"
 	"bytes"
+	"net"
 	"net/mail"
 	"net/url"
 	"regexp"
@@ -33,7 +34,9 @@ var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|'
 // matches src="link" or even something along the lines of SrC = 'link'
 var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)
 
-var emailRegexp *regexp.Regexp = regexp.MustCompile(`[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}`)
+// matches "local@domain" candidates; the domain may be a single label without a TLD, so a match
+// is not necessarily deliverable. Written as a quoted string because the pattern contains a backtick
+var emailRegexp *regexp.Regexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
 
 // Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
 func ResolveLink(url *url.URL, fromHost string) string {
@@ -148,3 +151,41 @@ func FindPageEmails(pageBody []byte) []string {
 
 	return emailAddresses
 }
+
+// FindPageEmailsWithCheck takes the addresses FindPageEmails extracts from the
+// page body and keeps only those whose domain returns at least one MX record.
+//
+// Every distinct domain is looked up at most once per call; the surviving
+// addresses keep the order in which FindPageEmails returned them.
+func FindPageEmailsWithCheck(pageBody []byte) []string {
+	var filteredEmailAddresses []string
+
+	// domain -> whether its MX lookup returned any record
+	hasMX := make(map[string]bool)
+
+	for _, email := range FindPageEmails(pageBody) {
+		at := strings.LastIndex(email, "@")
+		if at < 0 || at == len(email)-1 {
+			// no domain part, nothing to look up
+			continue
+		}
+		// DNS names are case-insensitive, so one cache entry per lowercased domain
+		domain := strings.ToLower(email[at+1:])
+
+		valid, seen := hasMX[domain]
+		if !seen {
+			// LookupMX can return usable records together with an error,
+			// so decide by the records and not by the error alone
+			records, _ := net.LookupMX(domain)
+			valid = len(records) > 0
+			hasMX[domain] = valid
+		}
+		if !valid {
+			continue
+		}
+
+		filteredEmailAddresses = append(filteredEmailAddresses, email)
+	}
+
+	return filteredEmailAddresses
+}
diff --git a/src/worker/worker.go b/src/worker/worker.go
index 8aa5da4..e93007a 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -256,7 +256,7 @@ func (w *Worker) Work() {
 
 		case config.QueryEmail:
 			// search for email
-			emailAddresses := web.FindPageEmails(pageData)
+			emailAddresses := web.FindPageEmailsWithCheck(pageData)
 			if len(emailAddresses) > 0 {
 				w.Results <- web.Result{
 					PageURL: job.URL,
@@ -278,7 +278,7 @@ func (w *Worker) Work() {
 			w.saveContent(contentLinks, pageURL)
 
 			// email
-			emailAddresses := web.FindPageEmails(pageData)
+			emailAddresses := web.FindPageEmailsWithCheck(pageData)
 			if len(emailAddresses) > 0 {
 				w.Results <- web.Result{
 					PageURL: job.URL,