From 722f3fb536a0f7c4e0257cd3b6be5f37c39570a2 Mon Sep 17 00:00:00 2001
From: Unbewohnte
Date: Mon, 27 Feb 2023 18:38:55 +0300
Subject: [PATCH] Fixed emails being saved to the wrong file under
 query=everything; improved the page saving process; fixed pages being saved
 without considering the actual setting; added a non-link-resolving variation
 of FindPageLinks; added query=archive functionality; the working directory is
 now the actual working directory instead of the executable's directory

---
 README.md             |  17 +++--
 src/config/config.go  |   1 +
 src/main.go           |  58 ++++++++-------
 src/web/audio.go      |  90 ++---------------------
 src/web/documents.go  | 107 +++++++--------------------
 src/web/extentions.go |  38 ++++++++++
 src/web/images.go     |  90 ++---------------------
 src/web/text.go       |  84 ++++++++++++++++++---
 src/web/videos.go     |  90 ++---------------------
 src/worker/worker.go  | 167 +++++++++++++++++++++++++++++++-----------
 10 files changed, 324 insertions(+), 418 deletions(-)

diff --git a/README.md b/README.md
index 3a75664..603df52 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,9 @@
 A simple HTML web spider with no dependencies. It is possible to search for pages with a text on them or for the text itself, extract images, video, audio and save pages that satisfy the criteria along the way.
 
-## Configuration
+## Configuration Overview
 
-The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `-wdir` (working directory) flag is set to some other value. To see al available flags run `wecr -h`.
+The flow of work fully depends on the configuration file. By default `conf.json` is used as the configuration file, but the name can be changed via the `-conf` flag. The default configuration is embedded in the program, so on the first launch (or if the file is simply deleted) a new `conf.json` will be created in the working directory, unless the `-wdir` (working directory) flag points to some other location, in which case that location takes precedence. To see all available flags run `wecr -h`.
 
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them.
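As a quick illustration of the flag handling described above (and of the switch from `os.Executable` to `os.Getwd` made in `src/main.go` further down), here is a minimal, hypothetical sketch; it mirrors the behaviour of the `-conf` and `-wdir` flags but is not a copy of wecr's actual `main.go`:

```go
package main

import (
	"flag"
	"fmt"
	"os"
	"path/filepath"
)

func main() {
	// flags mirroring the ones described above; defaults here are illustrative
	confName := flag.String("conf", "conf.json", "configuration file name")
	wDir := flag.String("wdir", "", "working directory")
	flag.Parse()

	// -wdir takes precedence; otherwise fall back to the current working directory
	workingDirectory := *wDir
	if workingDirectory == "" {
		wd, err := os.Getwd()
		if err != nil {
			fmt.Printf("Failed to determine working directory: %s\n", err)
			return
		}
		workingDirectory = wd
	}

	// the configuration file is looked up (or created) in the working directory
	confPath := filepath.Join(workingDirectory, *confName)
	fmt.Printf("Working in %q, configuration at %q\n", workingDirectory, confPath)
}
```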
@@ -18,7 +18,7 @@ You can change search `query` at **runtime** via web dashboard if `launch_dashbo
 
 ### Search query
 
-There are some special `query` values:
+There are some special `query` values to control the flow of work:
 
 - `email` - tells wecr to scrape email addresses and output to `output_file`
 - `images` - find all images on pages and output to the corresponding directory in `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) load fully)
@@ -26,12 +26,13 @@ There are some special `query` values:
 - `audio` - find and fetch files that look like audio
 - `documents` - find and fetch files that look like a document
 - `everything` - find and fetch images, audio, video, documents and email addresses
+- `archive` - perform no text search at all and save every visited page
 
-When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
+When `is_regexp` is enabled, the `query` is treated as a regexp string (in Go "flavor") and pages will be scanned for matches that satisfy it.
 
-### Output
+### Data Output
 
-By default, if the query is not something of special values all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images`, `videos`, `audio`, etc. - the additional contents will be put in the corresponding directories inside `output_dir`, which is neatly created by the executable's side.
+If the query is not one of the special values, all text matches will be written to the `found_text.json` file in `output_dir` as separate continuous JSON objects; if `save_pages` is set to `true` and/or `query` is set to `images`, `videos`, `audio`, etc., the additional contents will also be put in the corresponding directories inside `output_dir`, which is created in the working directory (or in the directory given via the `-wdir` flag). If `output_dir` happens to be empty, contents will be written directly to the working directory.
 
 The output almost certainly contains some duplicates and is not easy to work with programmatically, so you can use `-extractData` with the output JSON file argument (like `found_text.json`, which is the default output file name for simple text searches) to extract the actual data, filter out the duplicates and put each entry on its new line in a new text file.
 
@@ -43,7 +44,7 @@ Otherwise - `go build` in the `src` directory to build `wecr`. No dependencies.
 
 ## Examples
 
-See [page on my website](https://unbewohnte.su/wecr) for some basic examples.
+See [a page on my website](https://unbewohnte.su/wecr) for some basic examples.
 
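Since `is_regexp` queries are interpreted with Go's `regexp` package, a pattern can be sanity-checked against sample text before it goes into `conf.json`. The snippet below is an illustrative, standalone sketch (the pattern and the sample text are made up), not code from wecr itself:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// the same pattern that would go into the "query" field with "is_regexp" enabled
	query := regexp.MustCompile(`price:\s*[0-9]+`)

	pageText := "some page text... price: 42 ...more text... price: 1337"

	// wecr scans page text for matches like this and writes them out as results
	for _, match := range query.FindAllString(pageText, -1) {
		fmt.Println(match)
	}
}
```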
Dump of a basic configuration: @@ -87,4 +88,4 @@ Dump of a basic configuration: ``` ## License -AGPLv3 \ No newline at end of file +wecr is distributed under AGPLv3 license \ No newline at end of file diff --git a/src/config/config.go b/src/config/config.go index d84ad5b..f05e316 100644 --- a/src/config/config.go +++ b/src/config/config.go @@ -31,6 +31,7 @@ const ( QueryEmail string = "email" QueryDocuments string = "documents" QueryEverything string = "everything" + QueryArchive string = "archive" ) const ( diff --git a/src/main.go b/src/main.go index 18fb6e4..61b2ad7 100644 --- a/src/main.go +++ b/src/main.go @@ -39,7 +39,7 @@ import ( "unbewohnte/wecr/worker" ) -const version = "v0.3.4" +const version = "v0.3.5" const ( configFilename string = "conf.json" @@ -107,12 +107,12 @@ func init() { if *wDir != "" { workingDirectory = *wDir } else { - exePath, err := os.Executable() + wdir, err := os.Getwd() if err != nil { - logger.Error("Failed to determine executable's path: %s", err) + logger.Error("Failed to determine working directory path: %s", err) return } - workingDirectory = filepath.Dir(exePath) + workingDirectory = wdir } logger.Info("Working in \"%s\"", workingDirectory) @@ -294,6 +294,8 @@ func main() { logger.Info("Looking for audio (%+s)", web.AudioExtentions) case config.QueryDocuments: logger.Info("Looking for documents (%+s)", web.DocumentExtentions) + case config.QueryArchive: + logger.Info("Archiving every visited page") case config.QueryEverything: logger.Info("Looking for email addresses, images, videos, audio and various documents (%+s - %+s - %+s - %+s)", web.ImageExtentions, @@ -309,30 +311,6 @@ func main() { } } - // create and redirect logs if needed - if conf.Logging.OutputLogs { - if conf.Logging.LogsFile != "" { - // output logs to a file - logFile, err := os.Create(filepath.Join(workingDirectory, conf.Logging.LogsFile)) - if err != nil { - logger.Error("Failed to create logs file: %s", err) - return - } - defer logFile.Close() - - logger.Info("Outputting logs to %s", conf.Logging.LogsFile) - logger.SetOutput(logFile) - } else { - // output logs to stdout - logger.Info("Outputting logs to stdout") - logger.SetOutput(os.Stdout) - } - } else { - // no logging needed - logger.Info("No further logs will be outputted") - logger.SetOutput(nil) - } - // create visit queue file if not turned off var visitQueueFile *os.File = nil if !conf.InMemoryVisitQueue { @@ -401,6 +379,30 @@ func main() { logger.Info("Launched dashboard at http://localhost:%d", conf.Dashboard.Port) } + // create and redirect logs if needed + if conf.Logging.OutputLogs { + if conf.Logging.LogsFile != "" { + // output logs to a file + logFile, err := os.Create(filepath.Join(workingDirectory, conf.Logging.LogsFile)) + if err != nil { + logger.Error("Failed to create logs file: %s", err) + return + } + defer logFile.Close() + + logger.Info("Outputting logs to %s", conf.Logging.LogsFile) + logger.SetOutput(logFile) + } else { + // output logs to stdout + logger.Info("Outputting logs to stdout") + logger.SetOutput(os.Stdout) + } + } else { + // no logging needed + logger.Info("No further logs will be outputted") + logger.SetOutput(nil) + } + // launch concurrent scraping ! 
workerPool.Work() logger.Info("Started scraping...") diff --git a/src/web/audio.go b/src/web/audio.go index 3462b61..258e9e2 100644 --- a/src/web/audio.go +++ b/src/web/audio.go @@ -20,99 +20,25 @@ package web import ( "net/url" - "strings" ) -func HasAudioExtention(url string) bool { - for _, extention := range AudioExtentions { - if strings.HasSuffix(url, extention) { - return true - } - } - - return false -} - // Tries to find audio URLs on the page -func FindPageAudio(pageBody []byte, from *url.URL) []string { - var urls []string +func FindPageAudio(pageBody []byte, from url.URL) []url.URL { + var urls []url.URL // for every element that has "src" attribute - for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { - var linkStartIndex int - var linkEndIndex int - - linkStartIndex = strings.Index(match, "\"") - if linkStartIndex == -1 { - linkStartIndex = strings.Index(match, "'") - if linkStartIndex == -1 { - continue - } - - linkEndIndex = strings.LastIndex(match, "'") - if linkEndIndex == -1 { - continue - } - } else { - linkEndIndex = strings.LastIndex(match, "\"") - if linkEndIndex == -1 { - continue - } - } - - if linkEndIndex <= linkStartIndex+1 { - continue - } - - link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) - if err != nil { - continue - } - - linkResolved := ResolveLink(link, from.Host) - if HasAudioExtention(linkResolved) { - urls = append(urls, linkResolved) + for _, link := range FindPageSrcLinks(pageBody, from) { + if HasAudioExtention(link.EscapedPath()) { + urls = append(urls, link) } } // for every "a" element as well - for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { - var linkStartIndex int - var linkEndIndex int - - linkStartIndex = strings.Index(match, "\"") - if linkStartIndex == -1 { - linkStartIndex = strings.Index(match, "'") - if linkStartIndex == -1 { - continue - } - - linkEndIndex = strings.LastIndex(match, "'") - if linkEndIndex == -1 { - continue - } - } else { - linkEndIndex = strings.LastIndex(match, "\"") - if linkEndIndex == -1 { - continue - } - } - - if linkEndIndex <= linkStartIndex+1 { - continue - } - - link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) - if err != nil { - continue - } - - linkResolved := ResolveLink(link, from.Host) - if HasAudioExtention(linkResolved) { - urls = append(urls, linkResolved) + for _, link := range FindPageLinks(pageBody, from) { + if HasAudioExtention(link.EscapedPath()) { + urls = append(urls, link) } } - // return discovered mutual video urls return urls } diff --git a/src/web/documents.go b/src/web/documents.go index 9661704..8f3af23 100644 --- a/src/web/documents.go +++ b/src/web/documents.go @@ -1,97 +1,42 @@ +/* + Wecr - crawl the web for data + Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+*/ + package web import ( "net/url" - "strings" ) -func HasDocumentExtention(url string) bool { - for _, extention := range DocumentExtentions { - if strings.HasSuffix(url, extention) { - return true - } - } - - return false -} - // Tries to find docs' URLs on the page -func FindPageDocuments(pageBody []byte, from *url.URL) []string { - var urls []string +func FindPageDocuments(pageBody []byte, from url.URL) []url.URL { + var urls []url.URL // for every element that has "src" attribute - for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { - var linkStartIndex int - var linkEndIndex int - - linkStartIndex = strings.Index(match, "\"") - if linkStartIndex == -1 { - linkStartIndex = strings.Index(match, "'") - if linkStartIndex == -1 { - continue - } - - linkEndIndex = strings.LastIndex(match, "'") - if linkEndIndex == -1 { - continue - } - } else { - linkEndIndex = strings.LastIndex(match, "\"") - if linkEndIndex == -1 { - continue - } - } - - if linkEndIndex <= linkStartIndex+1 { - continue - } - - link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) - if err != nil { - continue - } - - linkResolved := ResolveLink(link, from.Host) - if HasDocumentExtention(linkResolved) { - urls = append(urls, linkResolved) + for _, link := range FindPageSrcLinks(pageBody, from) { + if HasDocumentExtention(link.EscapedPath()) { + urls = append(urls, link) } } // for every "a" element as well - for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { - var linkStartIndex int - var linkEndIndex int - - linkStartIndex = strings.Index(match, "\"") - if linkStartIndex == -1 { - linkStartIndex = strings.Index(match, "'") - if linkStartIndex == -1 { - continue - } - - linkEndIndex = strings.LastIndex(match, "'") - if linkEndIndex == -1 { - continue - } - } else { - linkEndIndex = strings.LastIndex(match, "\"") - if linkEndIndex == -1 { - continue - } - } - - if linkEndIndex <= linkStartIndex+1 { - continue - } - - link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) - if err != nil { - continue - } - - linkResolved := ResolveLink(link, from.Host) - if HasDocumentExtention(linkResolved) { - urls = append(urls, linkResolved) + for _, link := range FindPageLinks(pageBody, from) { + if HasDocumentExtention(link.EscapedPath()) { + urls = append(urls, link) } } diff --git a/src/web/extentions.go b/src/web/extentions.go index a5f71b2..950db2b 100644 --- a/src/web/extentions.go +++ b/src/web/extentions.go @@ -18,6 +18,8 @@ package web +import "strings" + var AudioExtentions = []string{ ".3gp", ".aa", @@ -134,3 +136,39 @@ var DocumentExtentions = []string{ ".otf", ".exif", } + +func HasImageExtention(urlPath string) bool { + for _, extention := range ImageExtentions { + if strings.HasSuffix(urlPath, extention) { + return true + } + } + return false +} + +func HasDocumentExtention(urlPath string) bool { + for _, extention := range DocumentExtentions { + if strings.HasSuffix(urlPath, extention) { + return true + } + } + return false +} + +func HasVideoExtention(urlPath string) bool { + for _, extention := range VideoExtentions { + if strings.HasSuffix(urlPath, extention) { + return true + } + } + return false +} + +func HasAudioExtention(urlPath string) bool { + for _, extention := range AudioExtentions { + if strings.HasSuffix(urlPath, extention) { + return true + } + } + return false +} diff --git a/src/web/images.go b/src/web/images.go index cb791c8..cbc79f2 100644 --- a/src/web/images.go +++ b/src/web/images.go @@ -20,99 +20,25 @@ package web import ( "net/url" - 
"strings" ) -func HasImageExtention(url string) bool { - for _, extention := range ImageExtentions { - if strings.HasSuffix(url, extention) { - return true - } - } - - return false -} - // Tries to find images' URLs on the page -func FindPageImages(pageBody []byte, from *url.URL) []string { - var urls []string +func FindPageImages(pageBody []byte, from url.URL) []url.URL { + var urls []url.URL // for every element that has "src" attribute - for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { - var linkStartIndex int - var linkEndIndex int - - linkStartIndex = strings.Index(match, "\"") - if linkStartIndex == -1 { - linkStartIndex = strings.Index(match, "'") - if linkStartIndex == -1 { - continue - } - - linkEndIndex = strings.LastIndex(match, "'") - if linkEndIndex == -1 { - continue - } - } else { - linkEndIndex = strings.LastIndex(match, "\"") - if linkEndIndex == -1 { - continue - } - } - - if linkEndIndex <= linkStartIndex+1 { - continue - } - - link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) - if err != nil { - continue - } - - linkResolved := ResolveLink(link, from.Host) - if HasImageExtention(linkResolved) { - urls = append(urls, linkResolved) + for _, link := range FindPageSrcLinks(pageBody, from) { + if HasImageExtention(link.EscapedPath()) { + urls = append(urls, link) } } // for every "a" element as well - for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { - var linkStartIndex int - var linkEndIndex int - - linkStartIndex = strings.Index(match, "\"") - if linkStartIndex == -1 { - linkStartIndex = strings.Index(match, "'") - if linkStartIndex == -1 { - continue - } - - linkEndIndex = strings.LastIndex(match, "'") - if linkEndIndex == -1 { - continue - } - } else { - linkEndIndex = strings.LastIndex(match, "\"") - if linkEndIndex == -1 { - continue - } - } - - if linkEndIndex <= linkStartIndex+1 { - continue - } - - link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) - if err != nil { - continue - } - - linkResolved := ResolveLink(link, from.Host) - if HasImageExtention(linkResolved) { - urls = append(urls, linkResolved) + for _, link := range FindPageLinks(pageBody, from) { + if HasImageExtention(link.EscapedPath()) { + urls = append(urls, link) } } - // return discovered mutual image urls from and tags return urls } diff --git a/src/web/text.go b/src/web/text.go index 594d28e..73e8447 100644 --- a/src/web/text.go +++ b/src/web/text.go @@ -37,25 +37,27 @@ var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')( var emailRegexp *regexp.Regexp = regexp.MustCompile(`[A-Za-z0-9._%+\-!%&?~^#$]+@[A-Za-z0-9.\-]+\.[a-zA-Z]{2,4}`) // Fix relative link and construct an absolute one. Does nothing if the URL already looks alright -func ResolveLink(url *url.URL, fromHost string) string { - if !url.IsAbs() { - if url.Scheme == "" { +func ResolveLink(link url.URL, fromHost string) url.URL { + var resolvedURL url.URL = link + + if !resolvedURL.IsAbs() { + if resolvedURL.Scheme == "" { // add scheme - url.Scheme = "http" + resolvedURL.Scheme = "https" } - if url.Host == "" { + if resolvedURL.Host == "" { // add host - url.Host = fromHost + resolvedURL.Host = fromHost } } - return url.String() + return resolvedURL } -// Find all links on page that are specified in tag -func FindPageLinks(pageBody []byte, from *url.URL) []string { - var urls []string +// Find all links on page that are specified in href attribute. Do not resolve links. 
Return URLs as they are on the page +func FindPageLinksDontResolve(pageBody []byte) []url.URL { + var urls []url.URL for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { var linkStartIndex int @@ -88,12 +90,72 @@ func FindPageLinks(pageBody []byte, from *url.URL) []string { continue } - urls = append(urls, ResolveLink(link, from.Host)) + urls = append(urls, *link) + } + + return urls +} + +// Find all links on page that are specified in href attribute +func FindPageLinks(pageBody []byte, from url.URL) []url.URL { + urls := FindPageLinksDontResolve(pageBody) + for index := 0; index < len(urls); index++ { + urls[index] = ResolveLink(urls[index], from.Host) + } + + return urls +} + +// Find all links on page that are specified in "src" attribute. Do not resolve ULRs, return them as they are on page +func FindPageSrcLinksDontResolve(pageBody []byte) []url.URL { + var urls []url.URL + + for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int + + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { + continue + } + + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue + } + } + + if linkEndIndex <= linkStartIndex+1 { + continue + } + + link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) + if err != nil { + continue + } + + urls = append(urls, *link) } return urls } +// Find all links on page that are specified in "src" attribute +func FindPageSrcLinks(pageBody []byte, from url.URL) []url.URL { + urls := FindPageSrcLinksDontResolve(pageBody) + for index := 0; index < len(urls); index++ { + urls[index] = ResolveLink(urls[index], from.Host) + } + return urls +} + // Tries to find a certain string in page. 
Returns true if such string has been found func IsTextOnPage(text string, ignoreCase bool, pageBody []byte) bool { scanner := bufio.NewScanner(bytes.NewReader(pageBody)) diff --git a/src/web/videos.go b/src/web/videos.go index b0e6b6e..cc32deb 100644 --- a/src/web/videos.go +++ b/src/web/videos.go @@ -20,99 +20,25 @@ package web import ( "net/url" - "strings" ) -func HasVideoExtention(url string) bool { - for _, extention := range VideoExtentions { - if strings.HasSuffix(url, extention) { - return true - } - } - - return false -} - // Tries to find videos' URLs on the page -func FindPageVideos(pageBody []byte, from *url.URL) []string { - var urls []string +func FindPageVideos(pageBody []byte, from url.URL) []url.URL { + var urls []url.URL // for every element that has "src" attribute - for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { - var linkStartIndex int - var linkEndIndex int - - linkStartIndex = strings.Index(match, "\"") - if linkStartIndex == -1 { - linkStartIndex = strings.Index(match, "'") - if linkStartIndex == -1 { - continue - } - - linkEndIndex = strings.LastIndex(match, "'") - if linkEndIndex == -1 { - continue - } - } else { - linkEndIndex = strings.LastIndex(match, "\"") - if linkEndIndex == -1 { - continue - } - } - - if linkEndIndex <= linkStartIndex+1 { - continue - } - - link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) - if err != nil { - continue - } - - linkResolved := ResolveLink(link, from.Host) - if HasVideoExtention(linkResolved) { - urls = append(urls, linkResolved) + for _, link := range FindPageSrcLinks(pageBody, from) { + if HasVideoExtention(link.EscapedPath()) { + urls = append(urls, link) } } // for every "a" element as well - for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { - var linkStartIndex int - var linkEndIndex int - - linkStartIndex = strings.Index(match, "\"") - if linkStartIndex == -1 { - linkStartIndex = strings.Index(match, "'") - if linkStartIndex == -1 { - continue - } - - linkEndIndex = strings.LastIndex(match, "'") - if linkEndIndex == -1 { - continue - } - } else { - linkEndIndex = strings.LastIndex(match, "\"") - if linkEndIndex == -1 { - continue - } - } - - if linkEndIndex <= linkStartIndex+1 { - continue - } - - link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) - if err != nil { - continue - } - - linkResolved := ResolveLink(link, from.Host) - if HasVideoExtention(linkResolved) { - urls = append(urls, linkResolved) + for _, link := range FindPageLinks(pageBody, from) { + if HasVideoExtention(link.EscapedPath()) { + urls = append(urls, link) } } - // return discovered mutual video urls return urls } diff --git a/src/worker/worker.go b/src/worker/worker.go index c48dc33..97ca54c 100644 --- a/src/worker/worker.go +++ b/src/worker/worker.go @@ -19,6 +19,7 @@ package worker import ( + "bytes" "encoding/json" "fmt" "io" @@ -27,6 +28,7 @@ import ( "path" "path/filepath" "regexp" + "strings" "sync" "time" "unbewohnte/wecr/config" @@ -72,8 +74,8 @@ func NewWorker(jobs chan web.Job, conf *WorkerConf, visited *visited, stats *Sta } } -func (w *Worker) saveContent(links []string, pageURL *url.URL) { - var alreadyProcessedUrls []string +func (w *Worker) saveContent(links []url.URL, pageURL *url.URL) { + var alreadyProcessedUrls []url.URL for count, link := range links { // check if this URL has been processed already var skip bool = false @@ -91,29 +93,29 @@ func (w *Worker) saveContent(links []string, pageURL *url.URL) { } alreadyProcessedUrls = 
append(alreadyProcessedUrls, link) - var fileName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(link)) + var fileName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(link.Path)) var filePath string - if web.HasImageExtention(link) { + if web.HasImageExtention(link.Path) { filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveImagesDir, fileName) - } else if web.HasVideoExtention(link) { + } else if web.HasVideoExtention(link.Path) { filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName) - } else if web.HasAudioExtention(link) { + } else if web.HasAudioExtention(link.Path) { filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName) - } else if web.HasDocumentExtention(link) { + } else if web.HasDocumentExtention(link.Path) { filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveDocumentsDir, fileName) } else { filePath = filepath.Join(w.Conf.Save.OutputDir, fileName) } err := web.FetchFile( - link, + link.String(), w.Conf.Requests.UserAgent, w.Conf.Requests.ContentFetchTimeoutMs, filePath, ) if err != nil { - logger.Error("Failed to fetch file at %s: %s", link, err) + logger.Error("Failed to fetch file located at %s: %s", link.String(), err) return } @@ -122,31 +124,105 @@ func (w *Worker) saveContent(links []string, pageURL *url.URL) { } } -// Save page to the disk with a corresponding name -func (w *Worker) savePage(baseURL *url.URL, pageData []byte) { - if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" { - var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String())) - pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, config.SavePagesDir, pageName)) +// Save page to the disk with a corresponding name; Download any src files, stylesheets and JS along the way +func (w *Worker) savePage(baseURL url.URL, pageData []byte) { + var findPageFileContentURLs func([]byte) []url.URL = func(pageBody []byte) []url.URL { + var urls []url.URL + + for _, link := range web.FindPageLinksDontResolve(pageData) { + if strings.Contains(link.Path, ".css") || + strings.Contains(link.Path, ".scss") || + strings.Contains(link.Path, ".js") || + strings.Contains(link.Path, ".mjs") { + urls = append(urls, link) + } + } + urls = append(urls, web.FindPageSrcLinksDontResolve(pageBody)...) 
+ + return urls + } + + var cleanLink func(url.URL, url.URL) url.URL = func(link url.URL, from url.URL) url.URL { + resolvedLink := web.ResolveLink(link, from.Host) + cleanLink, err := url.Parse(resolvedLink.Scheme + "://" + resolvedLink.Host + resolvedLink.Path) if err != nil { - logger.Error("Failed to create page of \"%s\": %s", baseURL.String(), err) - return + return resolvedLink } - defer pageFile.Close() + return *cleanLink + } + + // Create directory with all file content on the page + var pageFilesDirectoryName string = fmt.Sprintf( + "%s_%s_files", + baseURL.Host, + strings.ReplaceAll(baseURL.Path, "/", "_"), + ) + err := os.MkdirAll(filepath.Join(w.Conf.Save.OutputDir, config.SavePagesDir, pageFilesDirectoryName), os.ModePerm) + if err != nil { + logger.Error("Failed to create directory to store file contents of %s: %s", baseURL.String(), err) + return + } + + // Save files on page + srcLinks := findPageFileContentURLs(pageData) + for _, srcLink := range srcLinks { + web.FetchFile(srcLink.String(), + w.Conf.Requests.UserAgent, + w.Conf.Requests.ContentFetchTimeoutMs, + filepath.Join( + w.Conf.Save.OutputDir, + config.SavePagesDir, + pageFilesDirectoryName, + path.Base(srcLink.String()), + ), + ) + } - pageFile.Write(pageData) + // Redirect old content URLs to local files + for _, srcLink := range srcLinks { + cleanLink := cleanLink(srcLink, baseURL) + pageData = bytes.ReplaceAll( + pageData, + []byte(srcLink.String()), + []byte("./"+filepath.Join(pageFilesDirectoryName, path.Base(cleanLink.String()))), + ) + } - logger.Info("Saved \"%s\"", pageName) - w.stats.PagesSaved++ + // Create page output file + pageName := fmt.Sprintf( + "%s_%s.html", + baseURL.Host, + strings.ReplaceAll(baseURL.Path, "/", "_"), + ) + outfile, err := os.Create(filepath.Join( + filepath.Join(w.Conf.Save.OutputDir, config.SavePagesDir), + pageName, + )) + if err != nil { + fmt.Printf("Failed to create output file: %s\n", err) } + defer outfile.Close() + + outfile.Write(pageData) + + logger.Info("Saved \"%s\"", pageName) + w.stats.PagesSaved++ } -func (w *Worker) saveResult(result web.Result) { - // write result to the output file +const ( + textTypeMatch = iota + textTypeEmail = iota +) +// Save text result to an appropriate file +func (w *Worker) saveResult(result web.Result, textType int) { + // write result to the output file var output io.Writer - if result.Search.Query == config.QueryEmail { + switch textType { + case textTypeEmail: output = w.Conf.EmailsOutput - } else { + + default: output = w.Conf.TextOutput } @@ -257,7 +333,7 @@ func (w *Worker) Work() { } // find links - pageLinks := web.FindPageLinks(pageData, pageURL) + pageLinks := web.FindPageLinks(pageData, *pageURL) go func() { if job.Depth > 1 { // decrement depth and add new jobs @@ -267,9 +343,9 @@ func (w *Worker) Work() { // add to the visit queue w.Conf.VisitQueue.Lock.Lock() for _, link := range pageLinks { - if link != job.URL { + if link.String() != job.URL { err = queue.InsertNewJob(w.Conf.VisitQueue.VisitQueue, web.Job{ - URL: link, + URL: link.String(), Search: *w.Conf.Search, Depth: job.Depth, }) @@ -283,9 +359,9 @@ func (w *Worker) Work() { } else { // add to the in-memory channel for _, link := range pageLinks { - if link != job.URL { + if link.String() != job.URL { w.Jobs <- web.Job{ - URL: link, + URL: link.String(), Search: *w.Conf.Search, Depth: job.Depth, } @@ -301,9 +377,12 @@ func (w *Worker) Work() { var savePage bool = false switch job.Search.Query { + case config.QueryArchive: + savePage = true + case 
config.QueryImages: // find image URLs, output images to the file while not saving already outputted ones - imageLinks := web.FindPageImages(pageData, pageURL) + imageLinks := web.FindPageImages(pageData, *pageURL) if len(imageLinks) > 0 { w.saveContent(imageLinks, pageURL) savePage = true @@ -312,7 +391,7 @@ func (w *Worker) Work() { case config.QueryVideos: // search for videos // find video URLs, output videos to the files while not saving already outputted ones - videoLinks := web.FindPageVideos(pageData, pageURL) + videoLinks := web.FindPageVideos(pageData, *pageURL) if len(videoLinks) > 0 { w.saveContent(videoLinks, pageURL) savePage = true @@ -321,7 +400,7 @@ func (w *Worker) Work() { case config.QueryAudio: // search for audio // find audio URLs, output audio to the file while not saving already outputted ones - audioLinks := web.FindPageAudio(pageData, pageURL) + audioLinks := web.FindPageAudio(pageData, *pageURL) if len(audioLinks) > 0 { w.saveContent(audioLinks, pageURL) savePage = true @@ -330,7 +409,7 @@ func (w *Worker) Work() { case config.QueryDocuments: // search for various documents // find documents URLs, output docs to the file while not saving already outputted ones - docsLinks := web.FindPageDocuments(pageData, pageURL) + docsLinks := web.FindPageDocuments(pageData, *pageURL) if len(docsLinks) > 0 { w.saveContent(docsLinks, pageURL) savePage = true @@ -344,7 +423,7 @@ func (w *Worker) Work() { PageURL: job.URL, Search: job.Search, Data: emailAddresses, - }) + }, textTypeEmail) w.stats.MatchesFound += uint64(len(emailAddresses)) savePage = true } @@ -353,11 +432,11 @@ func (w *Worker) Work() { // search for everything // files - var contentLinks []string - contentLinks = append(contentLinks, web.FindPageImages(pageData, pageURL)...) - contentLinks = append(contentLinks, web.FindPageAudio(pageData, pageURL)...) - contentLinks = append(contentLinks, web.FindPageVideos(pageData, pageURL)...) - contentLinks = append(contentLinks, web.FindPageDocuments(pageData, pageURL)...) + var contentLinks []url.URL + contentLinks = append(contentLinks, web.FindPageImages(pageData, *pageURL)...) + contentLinks = append(contentLinks, web.FindPageAudio(pageData, *pageURL)...) + contentLinks = append(contentLinks, web.FindPageVideos(pageData, *pageURL)...) + contentLinks = append(contentLinks, web.FindPageDocuments(pageData, *pageURL)...) w.saveContent(contentLinks, pageURL) if len(contentLinks) > 0 { @@ -371,7 +450,7 @@ func (w *Worker) Work() { PageURL: job.URL, Search: job.Search, Data: emailAddresses, - }) + }, textTypeEmail) w.stats.MatchesFound += uint64(len(emailAddresses)) savePage = true } @@ -393,7 +472,7 @@ func (w *Worker) Work() { PageURL: job.URL, Search: job.Search, Data: matches, - }) + }, textTypeMatch) logger.Info("Found matches: %+v", matches) w.stats.MatchesFound += uint64(len(matches)) savePage = true @@ -405,7 +484,7 @@ func (w *Worker) Work() { PageURL: job.URL, Search: job.Search, Data: []string{job.Search.Query}, - }) + }, textTypeMatch) logger.Info("Found \"%s\" on page", job.Search.Query) w.stats.MatchesFound++ savePage = true @@ -414,8 +493,8 @@ func (w *Worker) Work() { } // save page - if savePage { - w.savePage(pageURL, pageData) + if savePage && w.Conf.Save.SavePages { + w.savePage(*pageURL, pageData) } pageData = nil pageURL = nil
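For reference, the refactored helpers in the `web` package (which now take and return `url.URL` values rather than strings) can be exercised on their own roughly as follows. This is a sketch, not part of the patch: the target URL is a placeholder and the snippet assumes the `unbewohnte/wecr/web` import path used by the worker package.

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"net/url"

	"unbewohnte/wecr/web"
)

func main() {
	// placeholder page to scan
	pageURL, err := url.Parse("https://example.com/")
	if err != nil {
		panic(err)
	}

	resp, err := http.Get(pageURL.String())
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	pageData, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}

	// href links, resolved against the page's host (relative links become absolute)
	for _, link := range web.FindPageLinks(pageData, *pageURL) {
		fmt.Println("link:", link.String())
	}

	// "src" attribute links filtered down to images by extension
	for _, link := range web.FindPageSrcLinks(pageData, *pageURL) {
		if web.HasImageExtention(link.EscapedPath()) {
			fmt.Println("image:", link.String())
		}
	}
}
```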