
Commit: Documents search (parent commit fd484c665e, branch master)

Changed files:
1. README.md (3 lines changed)
2. src/config/config.go (10 lines changed)
3. src/main.go (17 lines changed)
4. src/web/documents.go (100 lines changed, new file)
5. src/web/extentions.go (53 lines changed)
6. src/web/images.go (2 lines changed)
7. src/worker/worker.go (25 lines changed)

README.md (3 lines changed)

@@ -22,7 +22,8 @@ There are some special `query` values:
 - `images` - find all images on pages and output to the corresponding directory in `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) load fully)
 - `videos` - find and fetch files that look like videos
 - `audio` - find and fetch files that look like audio
-- `everything` - find and fetch images, audio and video
+- `documents` - find and fetch files that look like a document
+- `everything` - find and fetch images, audio, video, documents and email addresses

 When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
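For reference, a minimal conf.json using the new query value might look like the sketch below. Only keys named in the README and the code are shown (`query`, `is_regexp`, `output_dir`, `content_fetch_timeout_ms`); the nesting under `search` and `save` is an assumption inferred from the `conf.Search`/`conf.Save` accesses in src/main.go, and the real config has more fields.

{
    "search": {
        "is_regexp": false,
        "query": "documents"
    },
    "save": {
        "output_dir": "output",
        "content_fetch_timeout_ms": 0
    }
}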

src/config/config.go (10 lines changed)

@@ -29,14 +29,16 @@ const (
     QueryVideos     string = "videos"
     QueryAudio      string = "audio"
     QueryEmail      string = "email"
+    QueryDocuments  string = "documents"
     QueryEverything string = "everything"
 )

 const (
-    SavePagesDir  string = "pages"
-    SaveImagesDir string = "images"
-    SaveVideosDir string = "videos"
-    SaveAudioDir  string = "audio"
+    SavePagesDir     string = "pages"
+    SaveImagesDir    string = "images"
+    SaveVideosDir    string = "videos"
+    SaveAudioDir     string = "audio"
+    SaveDocumentsDir string = "documents"
 )

 type Search struct {
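(The four Save*Dir lines are removed and re-added only because gofmt realigns the block around the longer SaveDocumentsDir name.) Together with the os.MkdirAll calls in src/main.go, including the new one below, these constants imply an on-disk layout along these lines; a sketch, with fetched files landing in the subdirectory matching their detected type:

output_dir/
    pages/
    images/
    videos/
    audio/
    documents/    (new in this commit)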

src/main.go (17 lines changed)

@@ -39,7 +39,7 @@ import (
     "unbewohnte/wecr/worker"
 )

-const version = "v0.2.4"
+const version = "v0.2.5"

 const (
     defaultConfigFile string = "conf.json"

@@ -295,6 +295,12 @@ func main() {
         return
     }

+    err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveDocumentsDir), os.ModePerm)
+    if err != nil {
+        logger.Error("Failed to create output directory for documents: %s", err)
+        return
+    }
+
     switch conf.Search.Query {
     case config.QueryEmail:
         logger.Info("Looking for email addresses")

@@ -304,8 +310,15 @@ func main() {
         logger.Info("Looking for videos (%+s)", web.VideoExtentions)
     case config.QueryAudio:
         logger.Info("Looking for audio (%+s)", web.AudioExtentions)
+    case config.QueryDocuments:
+        logger.Info("Looking for documents (%+s)", web.DocumentExtentions)
     case config.QueryEverything:
-        logger.Info("Looking for email addresses, images, videos and audio (%+s - %+s - %+s)", web.ImageExtentions, web.VideoExtentions, web.AudioExtentions)
+        logger.Info("Looking for email addresses, images, videos, audio and various documents (%+s - %+s - %+s - %+s)",
+            web.ImageExtentions,
+            web.VideoExtentions,
+            web.AudioExtentions,
+            web.DocumentExtentions,
+        )
     default:
         if conf.Search.IsRegexp {
             logger.Info("Looking for RegExp matches (%s)", conf.Search.Query)

src/web/documents.go (100 lines changed, new file)

@@ -0,0 +1,100 @@
+package web
+
+import (
+    "net/url"
+    "strings"
+)
+
+func HasDocumentExtention(url string) bool {
+    for _, extention := range DocumentExtentions {
+        if strings.HasSuffix(url, extention) {
+            return true
+        }
+    }
+    return false
+}
+
+// Tries to find docs' URLs on the page
+func FindPageDocuments(pageBody []byte, from *url.URL) []string {
+    var urls []string
+
+    // for every element that has "src" attribute
+    for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
+        var linkStartIndex int
+        var linkEndIndex int
+
+        linkStartIndex = strings.Index(match, "\"")
+        if linkStartIndex == -1 {
+            linkStartIndex = strings.Index(match, "'")
+            if linkStartIndex == -1 {
+                continue
+            }
+
+            linkEndIndex = strings.LastIndex(match, "'")
+            if linkEndIndex == -1 {
+                continue
+            }
+        } else {
+            linkEndIndex = strings.LastIndex(match, "\"")
+            if linkEndIndex == -1 {
+                continue
+            }
+        }
+
+        if linkEndIndex <= linkStartIndex+1 {
+            continue
+        }
+
+        link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+        if err != nil {
+            continue
+        }
+
+        linkResolved := ResolveLink(link, from.Host)
+        if HasDocumentExtention(linkResolved) {
+            urls = append(urls, linkResolved)
+        }
+    }
+
+    // for every "a" element as well
+    for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
+        var linkStartIndex int
+        var linkEndIndex int
+
+        linkStartIndex = strings.Index(match, "\"")
+        if linkStartIndex == -1 {
+            linkStartIndex = strings.Index(match, "'")
+            if linkStartIndex == -1 {
+                continue
+            }
+
+            linkEndIndex = strings.LastIndex(match, "'")
+            if linkEndIndex == -1 {
+                continue
+            }
+        } else {
+            linkEndIndex = strings.LastIndex(match, "\"")
+            if linkEndIndex == -1 {
+                continue
+            }
+        }
+
+        if linkEndIndex <= linkStartIndex+1 {
+            continue
+        }
+
+        link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+        if err != nil {
+            continue
+        }
+
+        linkResolved := ResolveLink(link, from.Host)
+        if HasDocumentExtention(linkResolved) {
+            urls = append(urls, linkResolved)
+        }
+    }
+
+    // return discovered doc urls
+    return urls
+}
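The two loops differ only in the regexp used (tagSrcRegexp vs tagHrefRegexp), so they could later share a helper. As a quick usage sketch, assuming the module path unbewohnte/wecr seen in src/main.go's imports, the finder takes raw page bytes and the page's URL:

package main

import (
    "fmt"
    "net/url"

    "unbewohnte/wecr/web"
)

func main() {
    page := []byte(`<a href="/files/report.pdf">report</a> <img src="/logo.png">`)
    from, err := url.Parse("https://example.com/index.html")
    if err != nil {
        panic(err)
    }
    // should yield the resolved report.pdf URL; logo.png is skipped
    // because .png is not in DocumentExtentions
    for _, docURL := range web.FindPageDocuments(page, from) {
        fmt.Println(docURL)
    }
}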

src/web/extentions.go (53 lines changed)

@@ -82,3 +82,56 @@ var VideoExtentions = []string{
     ".vob",
     ".ogv",
 }
+
+var DocumentExtentions = []string{
+    ".pdf",
+    ".doc",
+    ".docx",
+    ".epub",
+    ".fb2",
+    ".pub",
+    ".ppt",
+    ".pptx",
+    ".txt",
+    ".tex",
+    ".odt",
+    ".bib",
+    ".ps",
+    ".dwg",
+    ".lyx",
+    ".key",
+    ".ott",
+    ".odf",
+    ".odc",
+    ".ppg",
+    ".xlc",
+    ".latex",
+    ".c",
+    ".cpp",
+    ".sh",
+    ".go",
+    ".java",
+    ".cs",
+    ".rs",
+    ".lua",
+    ".php",
+    ".py",
+    ".pl",
+    ".lua",
+    ".kt",
+    ".js",
+    ".rb",
+    ".asm",
+    ".rar",
+    ".tar",
+    ".db",
+    ".7z",
+    ".zip",
+    ".gbr",
+    ".tex",
+    ".ttf",
+    ".ttc",
+    ".woff",
+    ".otf",
+    ".exif",
+}
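Note that ".lua" and ".tex" each appear twice in the committed list, which is harmless for a linear scan. Since HasDocumentExtention is a plain strings.HasSuffix check against this list, matching is case-sensitive and anything after the extension defeats it; a small sketch of that behavior:

package main

import (
    "fmt"

    "unbewohnte/wecr/web"
)

func main() {
    fmt.Println(web.HasDocumentExtention("https://example.com/paper.pdf"))      // true
    fmt.Println(web.HasDocumentExtention("https://example.com/PAPER.PDF"))      // false: suffix check is case-sensitive
    fmt.Println(web.HasDocumentExtention("https://example.com/paper.pdf?dl=1")) // false: query string follows the extension
}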

src/web/images.go (2 lines changed)

@@ -64,7 +64,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
             continue
         }

-        link, err := url.Parse(match)
+        link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
         if err != nil {
             continue
         }

src/worker/worker.go (25 lines changed)

@@ -97,6 +97,8 @@ func (w *Worker) saveContent(links []string, pageURL *url.URL) {
         filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName)
     } else if web.HasAudioExtention(link) {
         filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName)
+    } else if web.HasDocumentExtention(link) {
+        filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveDocumentsDir, fileName)
     } else {
         filePath = filepath.Join(w.Conf.Save.OutputDir, fileName)
     }

@@ -146,11 +148,16 @@ func (w *Worker) Work() {
     if w.Conf.VisitQueue.VisitQueue != nil {
         w.Conf.VisitQueue.Lock.Lock()
         newJob, err := queue.PopLastJob(w.Conf.VisitQueue.VisitQueue)
-        if err != nil || newJob == nil {
+        if err != nil {
+            logger.Error("Failed to get a new job from visit queue: %s", err)
+            w.Conf.VisitQueue.Lock.Unlock()
+            continue
+        }
+        if newJob == nil {
             w.Conf.VisitQueue.Lock.Unlock()
             continue
         }
         job = *newJob
         w.Conf.VisitQueue.Lock.Unlock()
     } else {

@@ -276,8 +283,8 @@ func (w *Worker) Work() {
     case config.QueryImages:
         // find image URLs, output images to the file while not saving already outputted ones
         imageLinks := web.FindPageImages(pageData, pageURL)
-        w.saveContent(imageLinks, pageURL)
         if len(imageLinks) > 0 {
+            w.saveContent(imageLinks, pageURL)
             savePage = true
         }

@@ -285,8 +292,8 @@ func (w *Worker) Work() {
         // search for videos
         // find video URLs, output videos to the files while not saving already outputted ones
         videoLinks := web.FindPageVideos(pageData, pageURL)
-        w.saveContent(videoLinks, pageURL)
         if len(videoLinks) > 0 {
+            w.saveContent(videoLinks, pageURL)
             savePage = true
         }

@@ -294,8 +301,17 @@ func (w *Worker) Work() {
         // search for audio
         // find audio URLs, output audio to the file while not saving already outputted ones
         audioLinks := web.FindPageAudio(pageData, pageURL)
-        w.saveContent(audioLinks, pageURL)
         if len(audioLinks) > 0 {
+            w.saveContent(audioLinks, pageURL)
             savePage = true
         }
+    case config.QueryDocuments:
+        // search for various documents
+        // find documents URLs, output docs to the file while not saving already outputted ones
+        docsLinks := web.FindPageDocuments(pageData, pageURL)
+        if len(docsLinks) > 0 {
+            w.saveContent(docsLinks, pageURL)
+            savePage = true
+        }

@@ -320,6 +336,7 @@ func (w *Worker) Work() {
         contentLinks = append(contentLinks, web.FindPageImages(pageData, pageURL)...)
         contentLinks = append(contentLinks, web.FindPageAudio(pageData, pageURL)...)
         contentLinks = append(contentLinks, web.FindPageVideos(pageData, pageURL)...)
+        contentLinks = append(contentLinks, web.FindPageDocuments(pageData, pageURL)...)
         w.saveContent(contentLinks, pageURL)

         // email
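The visit-queue change separates two conditions the old code conflated: a failed read (err != nil, now logged) and an empty queue (newJob == nil, now skipped silently). A self-contained toy illustration of that contract, assuming from the diff that PopLastJob returns a nil job with a nil error when the queue is empty:

package main

import (
    "errors"
    "fmt"
)

type job struct{ url string }

// pop mimics the assumed PopLastJob contract: (nil, nil) means the
// queue is empty, a non-nil error means the read itself failed.
func pop(q *[]job, fail bool) (*job, error) {
    if fail {
        return nil, errors.New("corrupt queue file")
    }
    if len(*q) == 0 {
        return nil, nil // empty queue: not an error
    }
    j := (*q)[len(*q)-1]
    *q = (*q)[:len(*q)-1]
    return &j, nil
}

func main() {
    q := []job{{url: "https://example.com"}}
    for i := 0; i < 3; i++ {
        j, err := pop(&q, i == 2)
        if err != nil {
            fmt.Println("error:", err) // real failures get logged
            continue
        }
        if j == nil {
            fmt.Println("queue empty, skipping") // no spurious error
            continue
        }
        fmt.Println("processing", j.url)
    }
}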
