
Removed links search; Added email search

Branch: master
Commit: d81f732b82
Changed files:
1. README.md (11 lines changed)
2. src/config/config.go (11 lines changed)
3. src/main.go (10 lines changed)
4. src/web/audio.go (6 lines changed)
5. src/web/extentions.go (4 lines changed)
6. src/web/images.go (6 lines changed)
7. src/web/text.go (33 lines changed)
8. src/web/videos.go (6 lines changed)
9. src/worker/worker.go (68 lines changed)

README.md (11 lines changed)

@@ -16,10 +16,11 @@ The parsing starts from `initial_pages` and goes deeper while ignoring the pages
 There are some special `query` values:
 - `links` - tells `wecr` to search for all links there are on the page
+- `email` - tells wecr to scrape email addresses and output to `output_file`
 - `images` - find all images on pages and output to the corresponding directory in `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) load fully)
 - `videos` - find and fetch files that look like videos
 - `audio` - find and fetch files that look like audio
 - `everything` - find and fetch images, audio and video
 When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
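For reference, a minimal `conf.json` sketch that exercises the new `email` query. The key names (`query`, `is_regexp`, `output_dir`, `output_file`, `save_pages`) come from this commit's README and `config.go`; the top-level layout and the sample values are assumptions, not taken from the repository:

```json
{
    "search": {
        "is_regexp": false,
        "query": "email"
    },
    "save": {
        "output_dir": "output",
        "output_file": "found_emails.txt",
        "save_pages": false
    }
}
```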
@@ -29,10 +30,10 @@ By default, if the query is not something of special values all the matches and
 ## TODO
-- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - [x]
-- Search for videos - [x]
-- Search for audio - [x]
-- Search for documents - []
+- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - Done
+- Search for videos - Done
+- Search for audio - Done
+- Search for documents
 ## License
 AGPLv3

src/config/config.go (11 lines changed)

@@ -25,10 +25,11 @@ import (
 )
 const (
     QueryLinks      string = "links"
     QueryImages     string = "images"
     QueryVideos     string = "videos"
     QueryAudio      string = "audio"
+    QueryEmail      string = "email"
     QueryEverything string = "everything"
 )
 const (
@@ -45,7 +46,7 @@ type Search struct {
 type Save struct {
     OutputDir  string `json:"output_dir"`
-    OutputFile string `json:"save_file"`
+    OutputFile string `json:"output_file"`
     SavePages  bool   `json:"save_pages"`
 }

src/main.go (10 lines changed)

@@ -36,7 +36,7 @@ import (
     "unbewohnte/wecr/worker"
 )
-const version = "v0.2.0"
+const version = "v0.2.1"
 const (
     defaultConfigFile string = "conf.json"
@@ -275,14 +275,16 @@ func main() {
     }
     switch conf.Search.Query {
     case config.QueryLinks:
         logger.Info("Looking for links")
+    case config.QueryEmail:
+        logger.Info("Looking for emails")
     case config.QueryImages:
         logger.Info("Looking for images (%+s)", web.ImageExtentions)
     case config.QueryVideos:
         logger.Info("Looking for videos (%+s)", web.VideoExtentions)
     case config.QueryAudio:
         logger.Info("Looking for audio (%+s)", web.AudioExtentions)
     case config.QueryEverything:
         logger.Info("Looking for emails, images, videos and audio (%+s - %+s - %+s)", web.ImageExtentions, web.VideoExtentions, web.AudioExtentions)
     default:
         if conf.Search.IsRegexp {
             logger.Info("Looking for RegExp matches (%s)", conf.Search.Query)
@@ -359,7 +361,7 @@ func main() {
         }()
     }
-    // get results and write them to the output file
+    // get text results and write them to the output file (files are handled by each worker separately)
     for {
         result, ok := <-results
         if !ok {

src/web/audio.go (6 lines changed)

@@ -23,7 +23,7 @@ import (
     "strings"
 )
-func hasAudioExtention(url string) bool {
+func HasAudioExtention(url string) bool {
     for _, extention := range AudioExtentions {
         if strings.HasSuffix(url, extention) {
             return true
@@ -70,7 +70,7 @@ func FindPageAudio(pageBody []byte, from *url.URL) []string {
         }
         linkResolved := ResolveLink(link, from.Host)
-        if hasAudioExtention(linkResolved) {
+        if HasAudioExtention(linkResolved) {
             urls = append(urls, linkResolved)
         }
     }
@@ -108,7 +108,7 @@ func FindPageAudio(pageBody []byte, from *url.URL) []string {
         }
         linkResolved := ResolveLink(link, from.Host)
-        if hasAudioExtention(linkResolved) {
+        if HasAudioExtention(linkResolved) {
             urls = append(urls, linkResolved)
         }
     }

src/web/extentions.go (4 lines changed)

@@ -82,7 +82,3 @@ var VideoExtentions = []string{
     ".vob",
     ".ogv",
 }
-var DocumentExtentions = []string{
-    "",
-}

src/web/images.go (6 lines changed)

@@ -23,7 +23,7 @@ import (
     "strings"
 )
-func hasImageExtention(url string) bool {
+func HasImageExtention(url string) bool {
     for _, extention := range ImageExtentions {
         if strings.HasSuffix(url, extention) {
             return true
@@ -70,7 +70,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
         }
         linkResolved := ResolveLink(link, from.Host)
-        if hasImageExtention(linkResolved) {
+        if HasImageExtention(linkResolved) {
             urls = append(urls, linkResolved)
         }
     }
@@ -108,7 +108,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
         }
         linkResolved := ResolveLink(link, from.Host)
-        if hasImageExtention(linkResolved) {
+        if HasImageExtention(linkResolved) {
             urls = append(urls, linkResolved)
         }
     }

src/web/text.go (33 lines changed)

@@ -21,6 +21,7 @@ package web
 import (
     "bufio"
     "bytes"
+    "net/mail"
     "net/url"
     "regexp"
     "strings"
@@ -32,6 +33,8 @@ var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|'
 // matches src="link" or even something along the lines of SrC = 'link'
 var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)
+
+var emailRegexp *regexp.Regexp = regexp.MustCompile(`[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}`)
 
 // Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
 func ResolveLink(url *url.URL, fromHost string) string {
     if !url.IsAbs() {
@@ -115,3 +118,33 @@ func IsTextOnPage(text string, ignoreCase bool, pageBody []byte) bool {
 func FindPageRegexp(re *regexp.Regexp, pageBody []byte) []string {
     return re.FindAllString(string(pageBody), -1)
 }
+
+// Extract clear email addresses on the page
+func FindPageEmails(pageBody []byte) []string {
+    var emailAddresses []string
+    var skip bool
+    for _, email := range emailRegexp.FindAllString(string(pageBody), -1) {
+        skip = false
+
+        _, err := mail.ParseAddress(email)
+        if err != nil {
+            continue
+        }
+
+        for _, visitedEmail := range emailAddresses {
+            if email == visitedEmail {
+                skip = true
+                break
+            }
+        }
+        if skip {
+            continue
+        }
+
+        emailAddresses = append(emailAddresses, email)
+    }
+
+    return emailAddresses
+}
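A minimal usage sketch for the new helper (the `unbewohnte/wecr/web` import path follows the module imports visible in `main.go`; the page body here is fabricated):

```go
package main

import (
    "fmt"

    "unbewohnte/wecr/web"
)

func main() {
    // fabricated page body with a duplicated address
    pageBody := []byte(`<a href="mailto:admin@example.com">mail us</a>
        <footer>admin@example.com, support@example.org</footer>`)

    // FindPageEmails keeps only matches that pass mail.ParseAddress
    // and skips duplicates, so admin@example.com is printed once
    for _, email := range web.FindPageEmails(pageBody) {
        fmt.Println(email)
    }
}
```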

src/web/videos.go (6 lines changed)

@@ -23,7 +23,7 @@ import (
     "strings"
 )
-func hasVideoExtention(url string) bool {
+func HasVideoExtention(url string) bool {
     for _, extention := range VideoExtentions {
         if strings.HasSuffix(url, extention) {
             return true
@@ -70,7 +70,7 @@ func FindPageVideos(pageBody []byte, from *url.URL) []string {
         }
         linkResolved := ResolveLink(link, from.Host)
-        if hasVideoExtention(linkResolved) {
+        if HasVideoExtention(linkResolved) {
             urls = append(urls, linkResolved)
         }
     }
@@ -108,7 +108,7 @@ func FindPageVideos(pageBody []byte, from *url.URL) []string {
         }
         linkResolved := ResolveLink(link, from.Host)
-        if hasVideoExtention(linkResolved) {
+        if HasVideoExtention(linkResolved) {
             urls = append(urls, linkResolved)
         }
     }

src/worker/worker.go (68 lines changed)

@@ -61,7 +61,7 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
     }
 }
-func (w *Worker) saveContent(contenType string, links []string, pageURL *url.URL) {
+func (w *Worker) saveContent(links []string, pageURL *url.URL) {
     var alreadyProcessedUrls []string
     for count, link := range links {
         // check if this URL has been processed already
@@ -83,14 +83,13 @@ func (w *Worker) saveContent(contenType string, links []string, pageURL *url.URL
         var fileName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(link))
         var filePath string
-        switch contenType {
-        case config.QueryImages:
+        if web.HasImageExtention(link) {
             filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveImagesDir, fileName)
-        case config.QueryVideos:
+        } else if web.HasVideoExtention(link) {
             filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName)
-        case config.QueryAudio:
+        } else if web.HasAudioExtention(link) {
             filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName)
-        default:
+        } else {
             filePath = filepath.Join(w.Conf.Save.OutputDir, fileName)
         }
@@ -229,22 +228,10 @@ func (w *Worker) Work() {
         var savePage bool = false
         switch job.Search.Query {
-        case config.QueryLinks:
-            // simply output links
-            if len(pageLinks) > 0 {
-                w.Results <- web.Result{
-                    PageURL: job.URL,
-                    Search:  job.Search,
-                    Data:    pageLinks,
-                }
-                w.stats.MatchesFound += uint64(len(pageLinks))
-                savePage = true
-            }
         case config.QueryImages:
             // find image URLs, output images to the file while not saving already outputted ones
             imageLinks := web.FindPageImages(pageData, pageURL)
-            w.saveContent(config.QueryImages, imageLinks, pageURL)
+            w.saveContent(imageLinks, pageURL)
             if len(imageLinks) > 0 {
                 savePage = true
             }
@@ -253,7 +240,7 @@ func (w *Worker) Work() {
             // search for videos
             // find video URLs, output videos to the files while not saving already outputted ones
             videoLinks := web.FindPageVideos(pageData, pageURL)
-            w.saveContent(config.QueryVideos, videoLinks, pageURL)
+            w.saveContent(videoLinks, pageURL)
             if len(videoLinks) > 0 {
                 savePage = true
             }
@@ -262,11 +249,50 @@ func (w *Worker) Work() {
             // search for audio
             // find audio URLs, output audio to the file while not saving already outputted ones
             audioLinks := web.FindPageAudio(pageData, pageURL)
-            w.saveContent(config.QueryAudio, audioLinks, pageURL)
+            w.saveContent(audioLinks, pageURL)
             if len(audioLinks) > 0 {
                 savePage = true
             }
+        case config.QueryEmail:
+            // search for email
+            emailAddresses := web.FindPageEmails(pageData)
+            if len(emailAddresses) > 0 {
+                w.Results <- web.Result{
+                    PageURL: job.URL,
+                    Search:  job.Search,
+                    Data:    emailAddresses,
+                }
+                w.stats.MatchesFound += uint64(len(emailAddresses))
+                savePage = true
+            }
+        case config.QueryEverything:
+            // search for everything
+
+            // files
+            var contentLinks []string
+            contentLinks = append(contentLinks, web.FindPageImages(pageData, pageURL)...)
+            contentLinks = append(contentLinks, web.FindPageAudio(pageData, pageURL)...)
+            contentLinks = append(contentLinks, web.FindPageVideos(pageData, pageURL)...)
+            w.saveContent(contentLinks, pageURL)
+
+            // email
+            emailAddresses := web.FindPageEmails(pageData)
+            if len(emailAddresses) > 0 {
+                w.Results <- web.Result{
+                    PageURL: job.URL,
+                    Search:  job.Search,
+                    Data:    emailAddresses,
+                }
+                w.stats.MatchesFound += uint64(len(emailAddresses))
+                savePage = true
+            }
+
+            if len(contentLinks) > 0 || len(emailAddresses) > 0 {
+                savePage = true
+            }
         default:
             // text search
             switch job.Search.IsRegexp {
