Browse Source

Better email address regexp; MX lookup check

master
parent
commit
963c3b3ca4
  1. 4
      src/main.go
  2. 22
      src/web/text.go
  3. 4
      src/worker/worker.go

4
src/main.go

@ -276,7 +276,7 @@ func main() {
switch conf.Search.Query {
case config.QueryEmail:
logger.Info("Looking for emails")
logger.Info("Looking for email addresses")
case config.QueryImages:
logger.Info("Looking for images (%+s)", web.ImageExtentions)
case config.QueryVideos:
@ -284,7 +284,7 @@ func main() {
case config.QueryAudio:
logger.Info("Looking for audio (%+s)", web.AudioExtentions)
case config.QueryEverything:
logger.Info("Looking for emails, images, videos and audio (%+s - %+s - %+s)", web.ImageExtentions, web.VideoExtentions, web.AudioExtentions)
logger.Info("Looking for email addresses, images, videos and audio (%+s - %+s - %+s)", web.ImageExtentions, web.VideoExtentions, web.AudioExtentions)
default:
if conf.Search.IsRegexp {
logger.Info("Looking for RegExp matches (%s)", conf.Search.Query)

22
src/web/text.go

@ -21,6 +21,7 @@ package web
import (
"bufio"
"bytes"
"net"
"net/mail"
"net/url"
"regexp"
@ -33,7 +34,9 @@ var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|'
// tagSrcRegexp matches a src attribute case-insensitively, e.g. src="link" or
// even something along the lines of SrC = 'link'. Whitespace is allowed on
// both sides of "="; the attribute value is captured lazily as group 3.
// The opening and closing quote are matched independently, so they are not
// required to be the same quote character.
var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)
var emailRegexp *regexp.Regexp = regexp.MustCompile(`[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}`)
// emailRegexp finds email-address-like substrings in free text (the pattern
// is not anchored).
//
// The local part is one or more letters, digits or any of .!#$%&'*+/=?^_`{|}~-
// The domain is one or more dot-separated labels of 1 to 63 characters, each
// starting and ending with a letter or digit. A top-level domain is not
// required, so a single-label host such as "user@localhost" matches too.
//
// NOTE(review): this looks like the HTML living standard's pattern for
// <input type="email"> without the ^...$ anchors; confirm before relying on it.
//
// Previous pattern (lower-case only, 2-4 letter TLD), kept for reference:
// var emailRegexp *regexp.Regexp = regexp.MustCompile(`[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}`)
var emailRegexp *regexp.Regexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
// Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
func ResolveLink(url *url.URL, fromHost string) string {
@ -148,3 +151,20 @@ func FindPageEmails(pageBody []byte) []string {
return emailAddresses
}
// FindPageEmailsWithCheck extracts clear email addresses from pageBody using
// FindPageEmails and keeps only those whose domain passes an MX lookup.
//
// An address is kept when net.LookupMX for its domain returns no error and at
// least one MX record. Addresses are returned in the order FindPageEmails
// produced them; the result is nil when nothing passes the check.
//
// Each distinct domain is resolved at most once per call, so a page listing
// many addresses on the same domain costs a single DNS query.
func FindPageEmailsWithCheck(pageBody []byte) []string {
	var filteredEmailAddresses []string

	// Lookup verdict per (lower-cased) domain; DNS names are case-insensitive.
	checkedDomains := make(map[string]bool)

	for _, email := range FindPageEmails(pageBody) {
		// The domain is whatever follows the last "@". Guard against a
		// missing or empty domain instead of indexing a split result blindly,
		// which would panic on an address without "@".
		at := strings.LastIndex(email, "@")
		if at < 0 || at == len(email)-1 {
			continue
		}
		domain := strings.ToLower(email[at+1:])

		hasMX, seen := checkedDomains[domain]
		if !seen {
			records, err := net.LookupMX(domain)
			// A failed lookup or an empty answer both mean the domain cannot
			// receive mail as far as this check is concerned.
			hasMX = err == nil && len(records) > 0
			checkedDomains[domain] = hasMX
		}

		if hasMX {
			filteredEmailAddresses = append(filteredEmailAddresses, email)
		}
	}

	return filteredEmailAddresses
}

4
src/worker/worker.go

@ -256,7 +256,7 @@ func (w *Worker) Work() {
case config.QueryEmail:
// search for email
emailAddresses := web.FindPageEmails(pageData)
emailAddresses := web.FindPageEmailsWithCheck(pageData)
if len(emailAddresses) > 0 {
w.Results <- web.Result{
PageURL: job.URL,
@ -278,7 +278,7 @@ func (w *Worker) Work() {
w.saveContent(contentLinks, pageURL)
// email
emailAddresses := web.FindPageEmails(pageData)
emailAddresses := web.FindPageEmailsWithCheck(pageData)
if len(emailAddresses) > 0 {
w.Results <- web.Result{
PageURL: job.URL,

Loading…
Cancel
Save