|
|
|
@ -21,6 +21,7 @@ package web
|
|
|
|
|
import ( |
|
|
|
|
"bufio" |
|
|
|
|
"bytes" |
|
|
|
|
"net" |
|
|
|
|
"net/mail" |
|
|
|
|
"net/url" |
|
|
|
|
"regexp" |
|
|
|
@ -33,7 +34,9 @@ var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|'
|
|
|
|
|
// tagSrcRegexp finds src attributes regardless of letter case, spacing
// around "=", or quote style, e.g. src="link" or SrC = 'link'. The third
// capture group holds the link itself.
var tagSrcRegexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)
|
|
|
|
|
|
|
|
|
var emailRegexp *regexp.Regexp = regexp.MustCompile(`[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}`) |
|
|
|
|
// var emailRegexp *regexp.Regexp = regexp.MustCompile(`[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}`)
|
|
|
|
|
|
|
|
|
|
// emailRegexp matches a clear-text email address: a local part built from
// letters, digits and the punctuation listed in the first character class,
// an "@", and one or more dot-separated domain labels of up to 63
// characters that begin and end with a letter or digit.
var emailRegexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
|
|
|
|
|
|
|
|
|
// ResolveLink fixes a relative link by constructing an absolute one. It does nothing if the URL already looks all right.
|
|
|
|
|
func ResolveLink(url *url.URL, fromHost string) string { |
|
|
|
@ -148,3 +151,20 @@ func FindPageEmails(pageBody []byte) []string {
|
|
|
|
|
|
|
|
|
|
return emailAddresses |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Extract clear email addresses on the page and make an MX lookup
|
|
|
|
|
func FindPageEmailsWithCheck(pageBody []byte) []string { |
|
|
|
|
var filteredEmailAddresses []string |
|
|
|
|
|
|
|
|
|
emailAddresses := FindPageEmails(pageBody) |
|
|
|
|
for _, email := range emailAddresses { |
|
|
|
|
_, err := net.LookupMX(strings.Split(email, "@")[1]) |
|
|
|
|
if err != nil { |
|
|
|
|
continue |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
filteredEmailAddresses = append(filteredEmailAddresses, email) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return filteredEmailAddresses |
|
|
|
|
} |
|
|
|
|