diff --git a/README.md b/README.md
index 88dff72..29e49e9 100644
--- a/README.md
+++ b/README.md
@@ -16,10 +16,11 @@ The parsing starts from `initial_pages` and goes deeper while ignoring the pages
 
 There are some special `query` values:
 
-- `links` - tells `wecr` to search for all links there are on the page
+- `email` - tells `wecr` to scrape email addresses and output them to `output_file`
 - `images` - find all images on pages and output to the corresponding directory in `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) load fully)
 - `videos` - find and fetch files that look like videos
 - `audio` - find and fetch files that look like audio
+- `everything` - find and fetch images, videos and audio, and scrape email addresses
 
 When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
 
@@ -29,10 +30,10 @@ By default, if the query is not something of special values all the matches and
 
 ## TODO
 
-- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - [x]
-- Search for videos - [x]
-- Search for audio - [x]
-- Search for documents - []
+- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - Done
+- Search for videos - Done
+- Search for audio - Done
+- Search for documents
 
 ## License
 AGPLv3
\ No newline at end of file
diff --git a/src/config/config.go b/src/config/config.go
index 3fdca70..5fbd4d8 100644
--- a/src/config/config.go
+++ b/src/config/config.go
@@ -25,10 +25,11 @@ import (
 )
 
 const (
-	QueryLinks  string = "links"
-	QueryImages string = "images"
-	QueryVideos string = "videos"
-	QueryAudio  string = "audio"
+	QueryImages     string = "images"
+	QueryVideos     string = "videos"
+	QueryAudio      string = "audio"
+	QueryEmail      string = "email"
+	QueryEverything string = "everything"
 )
 
 const (
@@ -45,7 +46,7 @@ type Search struct {
 
 type Save struct {
 	OutputDir  string `json:"output_dir"`
-	OutputFile string `json:"save_file"`
+	OutputFile string `json:"output_file"`
 	SavePages  bool   `json:"save_pages"`
 }
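Note on the config.go hunk above: renaming the JSON tag from `save_file` to `output_file` is a breaking change for existing conf.json files; an old config that still says "save_file" decodes with no error but leaves OutputFile empty. A minimal sketch of the renamed tag in action (not part of the patch; the sample values are made up):

package main

import (
	"encoding/json"
	"fmt"
)

// Save mirrors the struct from src/config/config.go after the rename.
type Save struct {
	OutputDir  string `json:"output_dir"`
	OutputFile string `json:"output_file"`
	SavePages  bool   `json:"save_pages"`
}

func main() {
	raw := []byte(`{"output_dir": "output", "output_file": "results.txt", "save_pages": false}`)

	var save Save
	if err := json.Unmarshal(raw, &save); err != nil {
		panic(err)
	}

	// Prints "results.txt"; the same input with the old "save_file" key
	// would leave OutputFile as the empty string.
	fmt.Println(save.OutputFile)
}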
diff --git a/src/main.go b/src/main.go
index 4ea4404..f2edd5e 100644
--- a/src/main.go
+++ b/src/main.go
@@ -36,7 +36,7 @@ import (
 	"unbewohnte/wecr/worker"
 )
 
-const version = "v0.2.0"
+const version = "v0.2.1"
 
 const (
 	defaultConfigFile string = "conf.json"
@@ -275,14 +275,16 @@ func main() {
 	}
 
 	switch conf.Search.Query {
-	case config.QueryLinks:
-		logger.Info("Looking for links")
+	case config.QueryEmail:
+		logger.Info("Looking for emails")
 	case config.QueryImages:
 		logger.Info("Looking for images (%+s)", web.ImageExtentions)
 	case config.QueryVideos:
 		logger.Info("Looking for videos (%+s)", web.VideoExtentions)
 	case config.QueryAudio:
 		logger.Info("Looking for audio (%+s)", web.AudioExtentions)
+	case config.QueryEverything:
+		logger.Info("Looking for emails, images, videos and audio (%+s - %+s - %+s)", web.ImageExtentions, web.VideoExtentions, web.AudioExtentions)
 	default:
 		if conf.Search.IsRegexp {
 			logger.Info("Looking for RegExp matches (%s)", conf.Search.Query)
@@ -359,7 +361,7 @@ func main() {
 		}()
 	}
 
-	// get results and write them to the output file
+	// get text results and write them to the output file (files are handled by each worker separately)
 	for {
 		result, ok := <-results
 		if !ok {
diff --git a/src/web/audio.go b/src/web/audio.go
index c673c55..3462b61 100644
--- a/src/web/audio.go
+++ b/src/web/audio.go
@@ -23,7 +23,7 @@ import (
 	"strings"
 )
 
-func hasAudioExtention(url string) bool {
+func HasAudioExtention(url string) bool {
 	for _, extention := range AudioExtentions {
 		if strings.HasSuffix(url, extention) {
 			return true
@@ -70,7 +70,7 @@ func FindPageAudio(pageBody []byte, from *url.URL) []string {
 		}
 
 		linkResolved := ResolveLink(link, from.Host)
-		if hasAudioExtention(linkResolved) {
+		if HasAudioExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
@@ -108,7 +108,7 @@ func FindPageAudio(pageBody []byte, from *url.URL) []string {
 		}
 
 		linkResolved := ResolveLink(link, from.Host)
-		if hasAudioExtention(linkResolved) {
+		if HasAudioExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
diff --git a/src/web/extentions.go b/src/web/extentions.go
index 08cb815..3e06bad 100644
--- a/src/web/extentions.go
+++ b/src/web/extentions.go
@@ -82,7 +82,3 @@ var VideoExtentions = []string{
 	".vob",
 	".ogv",
 }
-
-var DocumentExtentions = []string{
-	"",
-}
diff --git a/src/web/images.go b/src/web/images.go
index b092638..bf22781 100644
--- a/src/web/images.go
+++ b/src/web/images.go
@@ -23,7 +23,7 @@ import (
 	"strings"
 )
 
-func hasImageExtention(url string) bool {
+func HasImageExtention(url string) bool {
 	for _, extention := range ImageExtentions {
 		if strings.HasSuffix(url, extention) {
 			return true
@@ -70,7 +70,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
 		}
 
 		linkResolved := ResolveLink(link, from.Host)
-		if hasImageExtention(linkResolved) {
+		if HasImageExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
@@ -108,7 +108,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
 		}
 
 		linkResolved := ResolveLink(link, from.Host)
-		if hasImageExtention(linkResolved) {
+		if HasImageExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
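The point of exporting hasAudioExtention and hasImageExtention as HasAudioExtention and HasImageExtention above (the same rename is applied to videos.go below) is that the worker package can now classify an already-resolved link by its suffix instead of being told a content type up front. A sketch of the call pattern (assuming the `unbewohnte/wecr` module path used by src/main.go; the sample URL and the presence of ".flac" in AudioExtentions are assumptions):

package main

import (
	"fmt"

	"unbewohnte/wecr/web"
)

// kind reports which kind of content a link points to,
// mirroring the suffix checks worker.saveContent now performs.
func kind(link string) string {
	switch {
	case web.HasImageExtention(link):
		return "image"
	case web.HasVideoExtention(link):
		return "video"
	case web.HasAudioExtention(link):
		return "audio"
	default:
		return "other"
	}
}

func main() {
	// assumes ".flac" is listed in web.AudioExtentions
	fmt.Println(kind("https://example.com/track.flac")) // audio
}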
diff --git a/src/web/text.go b/src/web/text.go
index 28ea9bf..f4c81dd 100644
--- a/src/web/text.go
+++ b/src/web/text.go
@@ -21,6 +21,7 @@ package web
 import (
 	"bufio"
 	"bytes"
+	"net/mail"
 	"net/url"
 	"regexp"
 	"strings"
@@ -32,6 +33,8 @@ var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|'
 
 // matches src="link" or even something along the lines of SrC = 'link'
 var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)
 
+var emailRegexp *regexp.Regexp = regexp.MustCompile(`[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}`)
+
 // Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
 func ResolveLink(url *url.URL, fromHost string) string {
 	if !url.IsAbs() {
@@ -115,3 +118,33 @@ func IsTextOnPage(text string, ignoreCase bool, pageBody []byte) bool {
 func FindPageRegexp(re *regexp.Regexp, pageBody []byte) []string {
 	return re.FindAllString(string(pageBody), -1)
 }
+
+// Extract unique, syntactically valid email addresses from the page
+func FindPageEmails(pageBody []byte) []string {
+	var emailAddresses []string
+
+	var skip bool
+	for _, email := range emailRegexp.FindAllString(string(pageBody), -1) {
+		skip = false
+
+		_, err := mail.ParseAddress(email)
+		if err != nil {
+			continue
+		}
+
+		for _, visitedEmail := range emailAddresses {
+			if email == visitedEmail {
+				skip = true
+				break
+			}
+		}
+
+		if skip {
+			continue
+		}
+
+		emailAddresses = append(emailAddresses, email)
+	}
+
+	return emailAddresses
+}
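FindPageEmails above gathers regexp candidates, validates each with net/mail.ParseAddress, and skips duplicates. Note that the pattern carries no `(?i)` flag and caps the TLD at four characters, so uppercase addresses and TLDs like .museum will not match. A usage sketch (the sample page is made up, not part of the patch):

package main

import (
	"fmt"

	"unbewohnte/wecr/web"
)

func main() {
	page := []byte(`<a href="mailto:admin@example.com">admin@example.com</a>
<p>broken@site..com</p>`)

	// The duplicate occurrence of admin@example.com is collapsed, and
	// broken@site..com matches the regexp but is rejected by
	// mail.ParseAddress (double dot in the domain).
	fmt.Println(web.FindPageEmails(page)) // [admin@example.com]
}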
diff --git a/src/web/videos.go b/src/web/videos.go
index 8a7ebcb..b0e6b6e 100644
--- a/src/web/videos.go
+++ b/src/web/videos.go
@@ -23,7 +23,7 @@ import (
 	"strings"
 )
 
-func hasVideoExtention(url string) bool {
+func HasVideoExtention(url string) bool {
 	for _, extention := range VideoExtentions {
 		if strings.HasSuffix(url, extention) {
 			return true
@@ -70,7 +70,7 @@ func FindPageVideos(pageBody []byte, from *url.URL) []string {
 		}
 
 		linkResolved := ResolveLink(link, from.Host)
-		if hasVideoExtention(linkResolved) {
+		if HasVideoExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
@@ -108,7 +108,7 @@ func FindPageVideos(pageBody []byte, from *url.URL) []string {
 		}
 
 		linkResolved := ResolveLink(link, from.Host)
-		if hasVideoExtention(linkResolved) {
+		if HasVideoExtention(linkResolved) {
 			urls = append(urls, linkResolved)
 		}
 	}
diff --git a/src/worker/worker.go b/src/worker/worker.go
index 85e7255..8aa5da4 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -61,7 +61,7 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
 	}
 }
 
-func (w *Worker) saveContent(contenType string, links []string, pageURL *url.URL) {
+func (w *Worker) saveContent(links []string, pageURL *url.URL) {
 	var alreadyProcessedUrls []string
 	for count, link := range links {
 		// check if this URL has been processed already
@@ -83,14 +83,13 @@
 		var fileName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(link))
 
 		var filePath string
-		switch contenType {
-		case config.QueryImages:
+		if web.HasImageExtention(link) {
 			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveImagesDir, fileName)
-		case config.QueryVideos:
+		} else if web.HasVideoExtention(link) {
 			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName)
-		case config.QueryAudio:
+		} else if web.HasAudioExtention(link) {
 			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName)
-		default:
+		} else {
 			filePath = filepath.Join(w.Conf.Save.OutputDir, fileName)
 		}
 
@@ -229,22 +228,10 @@ func (w *Worker) Work() {
 			var savePage bool = false
 
 			switch job.Search.Query {
-			case config.QueryLinks:
-				// simply output links
-				if len(pageLinks) > 0 {
-					w.Results <- web.Result{
-						PageURL: job.URL,
-						Search:  job.Search,
-						Data:    pageLinks,
-					}
-					w.stats.MatchesFound += uint64(len(pageLinks))
-					savePage = true
-				}
-
 			case config.QueryImages:
 				// find image URLs, output images to the file while not saving already outputted ones
 				imageLinks := web.FindPageImages(pageData, pageURL)
-				w.saveContent(config.QueryImages, imageLinks, pageURL)
+				w.saveContent(imageLinks, pageURL)
 				if len(imageLinks) > 0 {
 					savePage = true
 				}
@@ -253,7 +240,7 @@
 				// search for videos
 				// find video URLs, output videos to the files while not saving already outputted ones
 				videoLinks := web.FindPageVideos(pageData, pageURL)
-				w.saveContent(config.QueryVideos, videoLinks, pageURL)
+				w.saveContent(videoLinks, pageURL)
 				if len(videoLinks) > 0 {
 					savePage = true
 				}
@@ -262,11 +249,50 @@
 				// search for audio
 				// find audio URLs, output audio to the file while not saving already outputted ones
 				audioLinks := web.FindPageAudio(pageData, pageURL)
-				w.saveContent(config.QueryAudio, audioLinks, pageURL)
+				w.saveContent(audioLinks, pageURL)
 				if len(audioLinks) > 0 {
 					savePage = true
 				}
 
+			case config.QueryEmail:
+				// search for email
+				emailAddresses := web.FindPageEmails(pageData)
+				if len(emailAddresses) > 0 {
+					w.Results <- web.Result{
+						PageURL: job.URL,
+						Search:  job.Search,
+						Data:    emailAddresses,
+					}
+					w.stats.MatchesFound += uint64(len(emailAddresses))
+					savePage = true
+				}
+
+			case config.QueryEverything:
+				// search for everything
+
+				// files
+				var contentLinks []string
+				contentLinks = append(contentLinks, web.FindPageImages(pageData, pageURL)...)
+				contentLinks = append(contentLinks, web.FindPageAudio(pageData, pageURL)...)
+				contentLinks = append(contentLinks, web.FindPageVideos(pageData, pageURL)...)
+				w.saveContent(contentLinks, pageURL)
+
+				// email
+				emailAddresses := web.FindPageEmails(pageData)
+				if len(emailAddresses) > 0 {
+					w.Results <- web.Result{
+						PageURL: job.URL,
+						Search:  job.Search,
+						Data:    emailAddresses,
+					}
+					w.stats.MatchesFound += uint64(len(emailAddresses))
+					savePage = true
+				}
+
+				if len(contentLinks) > 0 || len(emailAddresses) > 0 {
+					savePage = true
+				}
+
 			default:
 				// text search
 				switch job.Search.IsRegexp {
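Dropping the contenType parameter turns saveContent into pure extension-based routing, which is what lets the QueryEverything case hand it a single mixed slice of image, video and audio links. A sketch of that routing rule as a standalone function (the literal directory names are assumptions standing in for config.SaveImagesDir, config.SaveVideosDir and config.SaveAudioDir):

package main

import (
	"fmt"
	"path/filepath"

	"unbewohnte/wecr/web"
)

// destination reproduces the per-link routing that saveContent now applies.
// Links with no recognized suffix fall back to the root of the output
// directory, matching the old default branch.
func destination(outputDir, fileName, link string) string {
	if web.HasImageExtention(link) {
		return filepath.Join(outputDir, "images", fileName)
	} else if web.HasVideoExtention(link) {
		return filepath.Join(outputDir, "videos", fileName)
	} else if web.HasAudioExtention(link) {
		return filepath.Join(outputDir, "audio", fileName)
	}
	return filepath.Join(outputDir, fileName)
}

func main() {
	// assumes ".png" is listed in web.ImageExtentions
	fmt.Println(destination("output", "example.com_0_cat.png", "https://example.com/cat.png"))
	// output/images/example.com_0_cat.png
}

One observation on the QueryEverything hunk: the savePage = true inside its email branch is redundant with the combined len(contentLinks) > 0 || len(emailAddresses) > 0 check that follows, though it is harmless.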