diff --git a/README.md b/README.md
index 0c5630d..10d1aaa 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,8 @@ There are some special `query` values:
 - `images` - find all images on pages and output to the corresponding directory in `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) load fully)
 - `videos` - find and fetch files that look like videos
 - `audio` - find and fetch files that look like audio
-- `everything` - find and fetch images, audio and video
+- `documents` - find and fetch files that look like documents
+- `everything` - find and fetch images, audio, video, documents and email addresses
 
 When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
 
diff --git a/src/config/config.go b/src/config/config.go
index ae61715..206cf4f 100644
--- a/src/config/config.go
+++ b/src/config/config.go
@@ -29,14 +29,16 @@ const (
 	QueryVideos     string = "videos"
 	QueryAudio      string = "audio"
 	QueryEmail      string = "email"
+	QueryDocuments  string = "documents"
 	QueryEverything string = "everything"
 )
 
 const (
-	SavePagesDir  string = "pages"
-	SaveImagesDir string = "images"
-	SaveVideosDir string = "videos"
-	SaveAudioDir  string = "audio"
+	SavePagesDir     string = "pages"
+	SaveImagesDir    string = "images"
+	SaveVideosDir    string = "videos"
+	SaveAudioDir     string = "audio"
+	SaveDocumentsDir string = "documents"
 )
 
 type Search struct {
diff --git a/src/main.go b/src/main.go
index 6038a98..3b665d4 100644
--- a/src/main.go
+++ b/src/main.go
@@ -39,7 +39,7 @@ import (
 	"unbewohnte/wecr/worker"
 )
 
-const version = "v0.2.4"
+const version = "v0.2.5"
 
 const (
 	defaultConfigFile string = "conf.json"
@@ -295,6 +295,12 @@ func main() {
 		return
 	}
 
+	err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveDocumentsDir), os.ModePerm)
+	if err != nil {
+		logger.Error("Failed to create output directory for documents: %s", err)
+		return
+	}
+
 	switch conf.Search.Query {
 	case config.QueryEmail:
 		logger.Info("Looking for email addresses")
@@ -304,8 +310,15 @@ func main() {
 		logger.Info("Looking for videos (%+s)", web.VideoExtentions)
 	case config.QueryAudio:
 		logger.Info("Looking for audio (%+s)", web.AudioExtentions)
+	case config.QueryDocuments:
+		logger.Info("Looking for documents (%+s)", web.DocumentExtentions)
 	case config.QueryEverything:
-		logger.Info("Looking for email addresses, images, videos and audio (%+s - %+s - %+s)", web.ImageExtentions, web.VideoExtentions, web.AudioExtentions)
+		logger.Info("Looking for email addresses, images, videos, audio and various documents (%+s - %+s - %+s - %+s)",
+			web.ImageExtentions,
+			web.VideoExtentions,
+			web.AudioExtentions,
+			web.DocumentExtentions,
+		)
 	default:
 		if conf.Search.IsRegexp {
 			logger.Info("Looking for RegExp matches (%s)", conf.Search.Query)
diff --git a/src/web/documents.go b/src/web/documents.go
new file mode 100644
index 0000000..9661704
--- /dev/null
+++ b/src/web/documents.go
@@ -0,0 +1,100 @@
+package web
+
+import (
+	"net/url"
+	"strings"
+)
+
+func HasDocumentExtention(url string) bool {
+	for _, extention := range DocumentExtentions {
+		if strings.HasSuffix(url, extention) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// Tries to find documents' URLs on the page
+func FindPageDocuments(pageBody []byte, from *url.URL) []string {
+	var urls []string
+
+	// for every element that has a "src" attribute
+	for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
+		var linkStartIndex int
+		var linkEndIndex int
+
+		linkStartIndex = strings.Index(match, "\"")
+		if linkStartIndex == -1 {
+			linkStartIndex = strings.Index(match, "'")
+			if linkStartIndex == -1 {
+				continue
+			}
+
+			linkEndIndex = strings.LastIndex(match, "'")
+			if linkEndIndex == -1 {
+				continue
+			}
+		} else {
+			linkEndIndex = strings.LastIndex(match, "\"")
+			if linkEndIndex == -1 {
+				continue
+			}
+		}
+
+		if linkEndIndex <= linkStartIndex+1 {
+			continue
+		}
+
+		link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+		if err != nil {
+			continue
+		}
+
+		linkResolved := ResolveLink(link, from.Host)
+		if HasDocumentExtention(linkResolved) {
+			urls = append(urls, linkResolved)
+		}
+	}
+
+	// for every "a" element as well
+	for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
+		var linkStartIndex int
+		var linkEndIndex int
+
+		linkStartIndex = strings.Index(match, "\"")
+		if linkStartIndex == -1 {
+			linkStartIndex = strings.Index(match, "'")
+			if linkStartIndex == -1 {
+				continue
+			}
+
+			linkEndIndex = strings.LastIndex(match, "'")
+			if linkEndIndex == -1 {
+				continue
+			}
+		} else {
+			linkEndIndex = strings.LastIndex(match, "\"")
+			if linkEndIndex == -1 {
+				continue
+			}
+		}
+
+		if linkEndIndex <= linkStartIndex+1 {
+			continue
+		}
+
+		link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+		if err != nil {
+			continue
+		}
+
+		linkResolved := ResolveLink(link, from.Host)
+		if HasDocumentExtention(linkResolved) {
+			urls = append(urls, linkResolved)
+		}
+	}
+
+	// return discovered document URLs
+	return urls
+}
diff --git a/src/web/extentions.go b/src/web/extentions.go
index 3e06bad..deaf930 100644
--- a/src/web/extentions.go
+++ b/src/web/extentions.go
@@ -82,3 +82,54 @@ var VideoExtentions = []string{
 	".vob",
 	".ogv",
 }
+
+var DocumentExtentions = []string{
+	".pdf",
+	".doc",
+	".docx",
+	".epub",
+	".fb2",
+	".pub",
+	".ppt",
+	".pptx",
+	".txt",
+	".tex",
+	".odt",
+	".bib",
+	".ps",
+	".dwg",
+	".lyx",
+	".key",
+	".ott",
+	".odf",
+	".odc",
+	".ppg",
+	".xlc",
+	".latex",
+	".c",
+	".cpp",
+	".sh",
+	".go",
+	".java",
+	".cs",
+	".rs",
+	".lua",
+	".php",
+	".py",
+	".pl",
+	".kt",
+	".js",
+	".rb",
+	".asm",
+	".rar",
+	".tar",
+	".db",
+	".7z",
+	".zip",
+	".gbr",
+	".ttf",
+	".ttc",
+	".woff",
+	".otf",
+	".exif",
+}
diff --git a/src/web/images.go b/src/web/images.go
index bf22781..cb791c8 100644
--- a/src/web/images.go
+++ b/src/web/images.go
@@ -64,7 +64,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
 			continue
 		}
 
-		link, err := url.Parse(match)
+		link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
 		if err != nil {
 			continue
 		}
diff --git a/src/worker/worker.go b/src/worker/worker.go
index 9c58219..fa9a7f0 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -97,6 +97,8 @@ func (w *Worker) saveContent(links []string, pageURL *url.URL) {
 			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName)
 		} else if web.HasAudioExtention(link) {
 			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName)
+		} else if web.HasDocumentExtention(link) {
+			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveDocumentsDir, fileName)
 		} else {
 			filePath = filepath.Join(w.Conf.Save.OutputDir, fileName)
 		}
@@ -146,11 +148,16 @@ func (w *Worker) Work() {
 		if w.Conf.VisitQueue.VisitQueue != nil {
 			w.Conf.VisitQueue.Lock.Lock()
 			newJob, err := queue.PopLastJob(w.Conf.VisitQueue.VisitQueue)
-			if err != nil || newJob == nil {
+			if err != nil {
 				logger.Error("Failed to get a new job from visit queue: %s", err)
 				w.Conf.VisitQueue.Lock.Unlock()
 				continue
 			}
+			if newJob == nil {
+				w.Conf.VisitQueue.Lock.Unlock()
+				continue
+			}
+
 			job = *newJob
 			w.Conf.VisitQueue.Lock.Unlock()
 		} else {
@@ -276,8 +283,8 @@ func (w *Worker) Work() {
 		case config.QueryImages:
 			// find image URLs, output images to the file while not saving already outputted ones
 			imageLinks := web.FindPageImages(pageData, pageURL)
-			w.saveContent(imageLinks, pageURL)
 			if len(imageLinks) > 0 {
+				w.saveContent(imageLinks, pageURL)
 				savePage = true
 			}
 
@@ -285,8 +292,8 @@ func (w *Worker) Work() {
 		case config.QueryVideos:
 			// search for videos
 			// find video URLs, output videos to the files while not saving already outputted ones
 			videoLinks := web.FindPageVideos(pageData, pageURL)
-			w.saveContent(videoLinks, pageURL)
 			if len(videoLinks) > 0 {
+				w.saveContent(videoLinks, pageURL)
 				savePage = true
 			}
 
@@ -294,8 +301,17 @@ func (w *Worker) Work() {
 		case config.QueryAudio:
 			// search for audio
 			// find audio URLs, output audio to the file while not saving already outputted ones
 			audioLinks := web.FindPageAudio(pageData, pageURL)
-			w.saveContent(audioLinks, pageURL)
 			if len(audioLinks) > 0 {
+				w.saveContent(audioLinks, pageURL)
+				savePage = true
+			}
+
+		case config.QueryDocuments:
+			// search for various documents
+			// find document URLs, output documents to the file while not saving already outputted ones
+			docsLinks := web.FindPageDocuments(pageData, pageURL)
+			if len(docsLinks) > 0 {
+				w.saveContent(docsLinks, pageURL)
 				savePage = true
 			}
 
@@ -320,6 +336,7 @@ func (w *Worker) Work() {
 			contentLinks = append(contentLinks, web.FindPageImages(pageData, pageURL)...)
 			contentLinks = append(contentLinks, web.FindPageAudio(pageData, pageURL)...)
 			contentLinks = append(contentLinks, web.FindPageVideos(pageData, pageURL)...)
+			contentLinks = append(contentLinks, web.FindPageDocuments(pageData, pageURL)...)
 			w.saveContent(contentLinks, pageURL)
 
 			// email
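
For orientation, here is a minimal, hypothetical sketch (not part of the patch) of how the new `web.FindPageDocuments` helper is meant to be used. The `unbewohnte/wecr/web` import path is assumed from the `unbewohnte/wecr/worker` import visible in `src/main.go`, and the page body and URL are invented for illustration:

```go
package main

import (
	"fmt"
	"net/url"

	"unbewohnte/wecr/web" // assumed import path, mirroring src/main.go's imports
)

func main() {
	// Invented page body: one document link and one image link.
	pageBody := []byte(`
		<a href="/papers/report.pdf">annual report</a>
		<img src="/img/logo.png">
	`)

	// The page the body came from; relative links are resolved against its host.
	from, err := url.Parse("https://example.org/index.html")
	if err != nil {
		panic(err)
	}

	// FindPageDocuments scans src/href attributes and keeps only links whose
	// resolved URLs end in one of web.DocumentExtentions, so the .png link
	// is filtered out and only the resolved .pdf URL should be returned.
	for _, link := range web.FindPageDocuments(pageBody, from) {
		fmt.Println(link)
	}
}
```

Note that `HasDocumentExtention` matches by plain URL suffix, so a link such as `report.pdf?v=2` would not be recognized; that is a property of the suffix check in the patch itself, not of this sketch.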