diff --git a/README.md b/README.md
index 3a75664..603df52 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,9 @@
A simple HTML web spider with no dependencies. It is possible to search for pages with a text on them or for the text itself, extract images, video, audio and save pages that satisfy the criteria along the way.
-## Configuration
+## Configuration Overview
-The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `-wdir` (working directory) flag is set to some other value. To see al available flags run `wecr -h`.
+The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the working directory unless the `-wdir` (working directory) flag is set to some other value, in which case it takes precedence. To see all available flags run `wecr -h`.
The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them.
@@ -18,7 +18,7 @@ You can change search `query` at **runtime** via web dashboard if `launch_dashbo
### Search query
-There are some special `query` values:
+There are some special `query` values to control the flow of work:
- `email` - tells wecr to scrape email addresses and output to `output_file`
- `images` - find all images on pages and output to the corresponding directory in `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) load fully)
@@ -26,12 +26,13 @@ There are some special `query` values:
- `audio` - find and fetch files that look like audio
- `documents` - find and fetch files that look like a document
- `everything` - find and fetch images, audio, video, documents and email addresses
+- `archive` - no text to be searched, save every visited page
-When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
+When `is_regexp` is enabled, the `query` is treated as a regexp string (in Go "flavor") and pages will be scanned for matches that satisfy it.
-### Output
+### Data Output
-By default, if the query is not something of special values all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images`, `videos`, `audio`, etc. - the additional contents will be put in the corresponding directories inside `output_dir`, which is neatly created by the executable's side.
+If the query is not something of special value, all text matches will be outputted to `found_text.json` file as separate continuous JSON objects in `output_dir`; if `save_pages` is set to `true` and|or `query` is set to `images`, `videos`, `audio`, etc. - the additional contents will be also put in the corresponding directories inside `output_dir`, which is neatly created in the working directory or, if `-wdir` flag is set - there. If `output_dir` happens to be empty - contents will be outputted directly to the working directory.
The output almost certainly contains some duplicates and is not easy to work with programmatically, so you can use `-extractData` with the output JSON file argument (like `found_text.json`, which is the default output file name for simple text searches) to extract the actual data, filter out the duplicates and put each entry on its new line in a new text file.
@@ -43,7 +44,7 @@ Otherwise - `go build` in the `src` directory to build `wecr`. No dependencies.
## Examples
-See [page on my website](https://unbewohnte.su/wecr) for some basic examples.
+See [a page on my website](https://unbewohnte.su/wecr) for some basic examples.
Dump of a basic configuration:
@@ -87,4 +88,4 @@ Dump of a basic configuration:
```
## License
-AGPLv3
\ No newline at end of file
+wecr is distributed under the AGPLv3 license
\ No newline at end of file
diff --git a/src/config/config.go b/src/config/config.go
index d84ad5b..f05e316 100644
--- a/src/config/config.go
+++ b/src/config/config.go
@@ -31,6 +31,7 @@ const (
QueryEmail string = "email"
QueryDocuments string = "documents"
QueryEverything string = "everything"
+ QueryArchive string = "archive"
)
const (
diff --git a/src/main.go b/src/main.go
index 18fb6e4..61b2ad7 100644
--- a/src/main.go
+++ b/src/main.go
@@ -39,7 +39,7 @@ import (
"unbewohnte/wecr/worker"
)
-const version = "v0.3.4"
+const version = "v0.3.5"
const (
configFilename string = "conf.json"
@@ -107,12 +107,12 @@ func init() {
if *wDir != "" {
workingDirectory = *wDir
} else {
- exePath, err := os.Executable()
+ wdir, err := os.Getwd()
if err != nil {
- logger.Error("Failed to determine executable's path: %s", err)
+ logger.Error("Failed to determine working directory path: %s", err)
return
}
- workingDirectory = filepath.Dir(exePath)
+ workingDirectory = wdir
}
logger.Info("Working in \"%s\"", workingDirectory)
@@ -294,6 +294,8 @@ func main() {
logger.Info("Looking for audio (%+s)", web.AudioExtentions)
case config.QueryDocuments:
logger.Info("Looking for documents (%+s)", web.DocumentExtentions)
+ case config.QueryArchive:
+ logger.Info("Archiving every visited page")
case config.QueryEverything:
logger.Info("Looking for email addresses, images, videos, audio and various documents (%+s - %+s - %+s - %+s)",
web.ImageExtentions,
@@ -309,30 +311,6 @@ func main() {
}
}
- // create and redirect logs if needed
- if conf.Logging.OutputLogs {
- if conf.Logging.LogsFile != "" {
- // output logs to a file
- logFile, err := os.Create(filepath.Join(workingDirectory, conf.Logging.LogsFile))
- if err != nil {
- logger.Error("Failed to create logs file: %s", err)
- return
- }
- defer logFile.Close()
-
- logger.Info("Outputting logs to %s", conf.Logging.LogsFile)
- logger.SetOutput(logFile)
- } else {
- // output logs to stdout
- logger.Info("Outputting logs to stdout")
- logger.SetOutput(os.Stdout)
- }
- } else {
- // no logging needed
- logger.Info("No further logs will be outputted")
- logger.SetOutput(nil)
- }
-
// create visit queue file if not turned off
var visitQueueFile *os.File = nil
if !conf.InMemoryVisitQueue {
@@ -401,6 +379,30 @@ func main() {
logger.Info("Launched dashboard at http://localhost:%d", conf.Dashboard.Port)
}
+ // create and redirect logs if needed
+ if conf.Logging.OutputLogs {
+ if conf.Logging.LogsFile != "" {
+ // output logs to a file
+ logFile, err := os.Create(filepath.Join(workingDirectory, conf.Logging.LogsFile))
+ if err != nil {
+ logger.Error("Failed to create logs file: %s", err)
+ return
+ }
+ defer logFile.Close()
+
+ logger.Info("Outputting logs to %s", conf.Logging.LogsFile)
+ logger.SetOutput(logFile)
+ } else {
+ // output logs to stdout
+ logger.Info("Outputting logs to stdout")
+ logger.SetOutput(os.Stdout)
+ }
+ } else {
+ // no logging needed
+ logger.Info("No further logs will be outputted")
+ logger.SetOutput(nil)
+ }
+
// launch concurrent scraping !
workerPool.Work()
logger.Info("Started scraping...")
diff --git a/src/web/audio.go b/src/web/audio.go
index 3462b61..258e9e2 100644
--- a/src/web/audio.go
+++ b/src/web/audio.go
@@ -20,99 +20,25 @@ package web
import (
"net/url"
- "strings"
)
-func HasAudioExtention(url string) bool {
- for _, extention := range AudioExtentions {
- if strings.HasSuffix(url, extention) {
- return true
- }
- }
-
- return false
-}
-
// Tries to find audio URLs on the page
-func FindPageAudio(pageBody []byte, from *url.URL) []string {
- var urls []string
+func FindPageAudio(pageBody []byte, from url.URL) []url.URL {
+ var urls []url.URL
// for every element that has "src" attribute
- for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
- var linkStartIndex int
- var linkEndIndex int
-
- linkStartIndex = strings.Index(match, "\"")
- if linkStartIndex == -1 {
- linkStartIndex = strings.Index(match, "'")
- if linkStartIndex == -1 {
- continue
- }
-
- linkEndIndex = strings.LastIndex(match, "'")
- if linkEndIndex == -1 {
- continue
- }
- } else {
- linkEndIndex = strings.LastIndex(match, "\"")
- if linkEndIndex == -1 {
- continue
- }
- }
-
- if linkEndIndex <= linkStartIndex+1 {
- continue
- }
-
- link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
- if err != nil {
- continue
- }
-
- linkResolved := ResolveLink(link, from.Host)
- if HasAudioExtention(linkResolved) {
- urls = append(urls, linkResolved)
+ for _, link := range FindPageSrcLinks(pageBody, from) {
+ if HasAudioExtention(link.EscapedPath()) {
+ urls = append(urls, link)
}
}
// for every "a" element as well
- for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
- var linkStartIndex int
- var linkEndIndex int
-
- linkStartIndex = strings.Index(match, "\"")
- if linkStartIndex == -1 {
- linkStartIndex = strings.Index(match, "'")
- if linkStartIndex == -1 {
- continue
- }
-
- linkEndIndex = strings.LastIndex(match, "'")
- if linkEndIndex == -1 {
- continue
- }
- } else {
- linkEndIndex = strings.LastIndex(match, "\"")
- if linkEndIndex == -1 {
- continue
- }
- }
-
- if linkEndIndex <= linkStartIndex+1 {
- continue
- }
-
- link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
- if err != nil {
- continue
- }
-
- linkResolved := ResolveLink(link, from.Host)
- if HasAudioExtention(linkResolved) {
- urls = append(urls, linkResolved)
+ for _, link := range FindPageLinks(pageBody, from) {
+ if HasAudioExtention(link.EscapedPath()) {
+ urls = append(urls, link)
}
}
- // return discovered mutual video urls
return urls
}
diff --git a/src/web/documents.go b/src/web/documents.go
index 9661704..8f3af23 100644
--- a/src/web/documents.go
+++ b/src/web/documents.go
@@ -1,97 +1,42 @@
+/*
+ Wecr - crawl the web for data
+ Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
package web
import (
"net/url"
- "strings"
)
-func HasDocumentExtention(url string) bool {
- for _, extention := range DocumentExtentions {
- if strings.HasSuffix(url, extention) {
- return true
- }
- }
-
- return false
-}
-
// Tries to find docs' URLs on the page
-func FindPageDocuments(pageBody []byte, from *url.URL) []string {
- var urls []string
+func FindPageDocuments(pageBody []byte, from url.URL) []url.URL {
+ var urls []url.URL
// for every element that has "src" attribute
- for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
- var linkStartIndex int
- var linkEndIndex int
-
- linkStartIndex = strings.Index(match, "\"")
- if linkStartIndex == -1 {
- linkStartIndex = strings.Index(match, "'")
- if linkStartIndex == -1 {
- continue
- }
-
- linkEndIndex = strings.LastIndex(match, "'")
- if linkEndIndex == -1 {
- continue
- }
- } else {
- linkEndIndex = strings.LastIndex(match, "\"")
- if linkEndIndex == -1 {
- continue
- }
- }
-
- if linkEndIndex <= linkStartIndex+1 {
- continue
- }
-
- link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
- if err != nil {
- continue
- }
-
- linkResolved := ResolveLink(link, from.Host)
- if HasDocumentExtention(linkResolved) {
- urls = append(urls, linkResolved)
+ for _, link := range FindPageSrcLinks(pageBody, from) {
+ if HasDocumentExtention(link.EscapedPath()) {
+ urls = append(urls, link)
}
}
// for every "a" element as well
- for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
- var linkStartIndex int
- var linkEndIndex int
-
- linkStartIndex = strings.Index(match, "\"")
- if linkStartIndex == -1 {
- linkStartIndex = strings.Index(match, "'")
- if linkStartIndex == -1 {
- continue
- }
-
- linkEndIndex = strings.LastIndex(match, "'")
- if linkEndIndex == -1 {
- continue
- }
- } else {
- linkEndIndex = strings.LastIndex(match, "\"")
- if linkEndIndex == -1 {
- continue
- }
- }
-
- if linkEndIndex <= linkStartIndex+1 {
- continue
- }
-
- link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
- if err != nil {
- continue
- }
-
- linkResolved := ResolveLink(link, from.Host)
- if HasDocumentExtention(linkResolved) {
- urls = append(urls, linkResolved)
+ for _, link := range FindPageLinks(pageBody, from) {
+ if HasDocumentExtention(link.EscapedPath()) {
+ urls = append(urls, link)
}
}
diff --git a/src/web/extentions.go b/src/web/extentions.go
index a5f71b2..950db2b 100644
--- a/src/web/extentions.go
+++ b/src/web/extentions.go
@@ -18,6 +18,8 @@
package web
+import "strings"
+
var AudioExtentions = []string{
".3gp",
".aa",
@@ -134,3 +136,39 @@ var DocumentExtentions = []string{
".otf",
".exif",
}
+
+func HasImageExtention(urlPath string) bool {
+ for _, extention := range ImageExtentions {
+ if strings.HasSuffix(urlPath, extention) {
+ return true
+ }
+ }
+ return false
+}
+
+func HasDocumentExtention(urlPath string) bool {
+ for _, extention := range DocumentExtentions {
+ if strings.HasSuffix(urlPath, extention) {
+ return true
+ }
+ }
+ return false
+}
+
+func HasVideoExtention(urlPath string) bool {
+ for _, extention := range VideoExtentions {
+ if strings.HasSuffix(urlPath, extention) {
+ return true
+ }
+ }
+ return false
+}
+
+func HasAudioExtention(urlPath string) bool {
+ for _, extention := range AudioExtentions {
+ if strings.HasSuffix(urlPath, extention) {
+ return true
+ }
+ }
+ return false
+}
diff --git a/src/web/images.go b/src/web/images.go
index cb791c8..cbc79f2 100644
--- a/src/web/images.go
+++ b/src/web/images.go
@@ -20,99 +20,25 @@ package web
import (
"net/url"
- "strings"
)
-func HasImageExtention(url string) bool {
- for _, extention := range ImageExtentions {
- if strings.HasSuffix(url, extention) {
- return true
- }
- }
-
- return false
-}
-
// Tries to find images' URLs on the page
-func FindPageImages(pageBody []byte, from *url.URL) []string {
- var urls []string
+func FindPageImages(pageBody []byte, from url.URL) []url.URL {
+ var urls []url.URL
// for every element that has "src" attribute
- for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
- var linkStartIndex int
- var linkEndIndex int
-
- linkStartIndex = strings.Index(match, "\"")
- if linkStartIndex == -1 {
- linkStartIndex = strings.Index(match, "'")
- if linkStartIndex == -1 {
- continue
- }
-
- linkEndIndex = strings.LastIndex(match, "'")
- if linkEndIndex == -1 {
- continue
- }
- } else {
- linkEndIndex = strings.LastIndex(match, "\"")
- if linkEndIndex == -1 {
- continue
- }
- }
-
- if linkEndIndex <= linkStartIndex+1 {
- continue
- }
-
- link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
- if err != nil {
- continue
- }
-
- linkResolved := ResolveLink(link, from.Host)
- if HasImageExtention(linkResolved) {
- urls = append(urls, linkResolved)
+ for _, link := range FindPageSrcLinks(pageBody, from) {
+ if HasImageExtention(link.EscapedPath()) {
+ urls = append(urls, link)
}
}
// for every "a" element as well
- for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
- var linkStartIndex int
- var linkEndIndex int
-
- linkStartIndex = strings.Index(match, "\"")
- if linkStartIndex == -1 {
- linkStartIndex = strings.Index(match, "'")
- if linkStartIndex == -1 {
- continue
- }
-
- linkEndIndex = strings.LastIndex(match, "'")
- if linkEndIndex == -1 {
- continue
- }
- } else {
- linkEndIndex = strings.LastIndex(match, "\"")
- if linkEndIndex == -1 {
- continue
- }
- }
-
- if linkEndIndex <= linkStartIndex+1 {
- continue
- }
-
- link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
- if err != nil {
- continue
- }
-
- linkResolved := ResolveLink(link, from.Host)
- if HasImageExtention(linkResolved) {
- urls = append(urls, linkResolved)
+ for _, link := range FindPageLinks(pageBody, from) {
+ if HasImageExtention(link.EscapedPath()) {
+ urls = append(urls, link)
}
}
- // return discovered mutual image urls from <img> and <a> tags
return urls
}
diff --git a/src/web/text.go b/src/web/text.go
index 594d28e..73e8447 100644
--- a/src/web/text.go
+++ b/src/web/text.go
@@ -37,25 +37,27 @@ var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(
var emailRegexp *regexp.Regexp = regexp.MustCompile(`[A-Za-z0-9._%+\-!%&?~^#$]+@[A-Za-z0-9.\-]+\.[a-zA-Z]{2,4}`)
// Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
-func ResolveLink(url *url.URL, fromHost string) string {
- if !url.IsAbs() {
- if url.Scheme == "" {
+func ResolveLink(link url.URL, fromHost string) url.URL {
+ var resolvedURL url.URL = link
+
+ if !resolvedURL.IsAbs() {
+ if resolvedURL.Scheme == "" {
// add scheme
- url.Scheme = "http"
+ resolvedURL.Scheme = "https"
}
- if url.Host == "" {
+ if resolvedURL.Host == "" {
// add host
- url.Host = fromHost
+ resolvedURL.Host = fromHost
}
}
- return url.String()
+ return resolvedURL
}
-// Find all links on page that are specified in tag
-func FindPageLinks(pageBody []byte, from *url.URL) []string {
- var urls []string
+// Find all links on page that are specified in href attribute. Do not resolve links. Return URLs as they are on the page
+func FindPageLinksDontResolve(pageBody []byte) []url.URL {
+ var urls []url.URL
for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
var linkStartIndex int
@@ -88,12 +90,72 @@ func FindPageLinks(pageBody []byte, from *url.URL) []string {
continue
}
- urls = append(urls, ResolveLink(link, from.Host))
+ urls = append(urls, *link)
+ }
+
+ return urls
+}
+
+// Find all links on page that are specified in href attribute
+func FindPageLinks(pageBody []byte, from url.URL) []url.URL {
+ urls := FindPageLinksDontResolve(pageBody)
+ for index := 0; index < len(urls); index++ {
+ urls[index] = ResolveLink(urls[index], from.Host)
+ }
+
+ return urls
+}
+
+// Find all links on page that are specified in "src" attribute. Do not resolve URLs, return them as they are on the page
+func FindPageSrcLinksDontResolve(pageBody []byte) []url.URL {
+ var urls []url.URL
+
+ for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
+ var linkStartIndex int
+ var linkEndIndex int
+
+ linkStartIndex = strings.Index(match, "\"")
+ if linkStartIndex == -1 {
+ linkStartIndex = strings.Index(match, "'")
+ if linkStartIndex == -1 {
+ continue
+ }
+
+ linkEndIndex = strings.LastIndex(match, "'")
+ if linkEndIndex == -1 {
+ continue
+ }
+ } else {
+ linkEndIndex = strings.LastIndex(match, "\"")
+ if linkEndIndex == -1 {
+ continue
+ }
+ }
+
+ if linkEndIndex <= linkStartIndex+1 {
+ continue
+ }
+
+ link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+ if err != nil {
+ continue
+ }
+
+ urls = append(urls, *link)
}
return urls
}
+// Find all links on page that are specified in "src" attribute
+func FindPageSrcLinks(pageBody []byte, from url.URL) []url.URL {
+ urls := FindPageSrcLinksDontResolve(pageBody)
+ for index := 0; index < len(urls); index++ {
+ urls[index] = ResolveLink(urls[index], from.Host)
+ }
+ return urls
+}
+
// Tries to find a certain string in page. Returns true if such string has been found
func IsTextOnPage(text string, ignoreCase bool, pageBody []byte) bool {
scanner := bufio.NewScanner(bytes.NewReader(pageBody))
diff --git a/src/web/videos.go b/src/web/videos.go
index b0e6b6e..cc32deb 100644
--- a/src/web/videos.go
+++ b/src/web/videos.go
@@ -20,99 +20,25 @@ package web
import (
"net/url"
- "strings"
)
-func HasVideoExtention(url string) bool {
- for _, extention := range VideoExtentions {
- if strings.HasSuffix(url, extention) {
- return true
- }
- }
-
- return false
-}
-
// Tries to find videos' URLs on the page
-func FindPageVideos(pageBody []byte, from *url.URL) []string {
- var urls []string
+func FindPageVideos(pageBody []byte, from url.URL) []url.URL {
+ var urls []url.URL
// for every element that has "src" attribute
- for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
- var linkStartIndex int
- var linkEndIndex int
-
- linkStartIndex = strings.Index(match, "\"")
- if linkStartIndex == -1 {
- linkStartIndex = strings.Index(match, "'")
- if linkStartIndex == -1 {
- continue
- }
-
- linkEndIndex = strings.LastIndex(match, "'")
- if linkEndIndex == -1 {
- continue
- }
- } else {
- linkEndIndex = strings.LastIndex(match, "\"")
- if linkEndIndex == -1 {
- continue
- }
- }
-
- if linkEndIndex <= linkStartIndex+1 {
- continue
- }
-
- link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
- if err != nil {
- continue
- }
-
- linkResolved := ResolveLink(link, from.Host)
- if HasVideoExtention(linkResolved) {
- urls = append(urls, linkResolved)
+ for _, link := range FindPageSrcLinks(pageBody, from) {
+ if HasVideoExtention(link.EscapedPath()) {
+ urls = append(urls, link)
}
}
// for every "a" element as well
- for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
- var linkStartIndex int
- var linkEndIndex int
-
- linkStartIndex = strings.Index(match, "\"")
- if linkStartIndex == -1 {
- linkStartIndex = strings.Index(match, "'")
- if linkStartIndex == -1 {
- continue
- }
-
- linkEndIndex = strings.LastIndex(match, "'")
- if linkEndIndex == -1 {
- continue
- }
- } else {
- linkEndIndex = strings.LastIndex(match, "\"")
- if linkEndIndex == -1 {
- continue
- }
- }
-
- if linkEndIndex <= linkStartIndex+1 {
- continue
- }
-
- link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
- if err != nil {
- continue
- }
-
- linkResolved := ResolveLink(link, from.Host)
- if HasVideoExtention(linkResolved) {
- urls = append(urls, linkResolved)
+ for _, link := range FindPageLinks(pageBody, from) {
+ if HasVideoExtention(link.EscapedPath()) {
+ urls = append(urls, link)
}
}
- // return discovered mutual video urls
return urls
}
diff --git a/src/worker/worker.go b/src/worker/worker.go
index c48dc33..97ca54c 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -19,6 +19,7 @@
package worker
import (
+ "bytes"
"encoding/json"
"fmt"
"io"
@@ -27,6 +28,7 @@ import (
"path"
"path/filepath"
"regexp"
+ "strings"
"sync"
"time"
"unbewohnte/wecr/config"
@@ -72,8 +74,8 @@ func NewWorker(jobs chan web.Job, conf *WorkerConf, visited *visited, stats *Sta
}
}
-func (w *Worker) saveContent(links []string, pageURL *url.URL) {
- var alreadyProcessedUrls []string
+func (w *Worker) saveContent(links []url.URL, pageURL *url.URL) {
+ var alreadyProcessedUrls []url.URL
for count, link := range links {
// check if this URL has been processed already
var skip bool = false
@@ -91,29 +93,29 @@ func (w *Worker) saveContent(links []string, pageURL *url.URL) {
}
alreadyProcessedUrls = append(alreadyProcessedUrls, link)
- var fileName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(link))
+ var fileName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(link.Path))
var filePath string
- if web.HasImageExtention(link) {
+ if web.HasImageExtention(link.Path) {
filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveImagesDir, fileName)
- } else if web.HasVideoExtention(link) {
+ } else if web.HasVideoExtention(link.Path) {
filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName)
- } else if web.HasAudioExtention(link) {
+ } else if web.HasAudioExtention(link.Path) {
filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName)
- } else if web.HasDocumentExtention(link) {
+ } else if web.HasDocumentExtention(link.Path) {
filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveDocumentsDir, fileName)
} else {
filePath = filepath.Join(w.Conf.Save.OutputDir, fileName)
}
err := web.FetchFile(
- link,
+ link.String(),
w.Conf.Requests.UserAgent,
w.Conf.Requests.ContentFetchTimeoutMs,
filePath,
)
if err != nil {
- logger.Error("Failed to fetch file at %s: %s", link, err)
+ logger.Error("Failed to fetch file located at %s: %s", link.String(), err)
return
}
@@ -122,31 +124,105 @@ func (w *Worker) saveContent(links []string, pageURL *url.URL) {
}
}
-// Save page to the disk with a corresponding name
-func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
- if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
- var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String()))
- pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, config.SavePagesDir, pageName))
+// Save page to the disk with a corresponding name; Download any src files, stylesheets and JS along the way
+func (w *Worker) savePage(baseURL url.URL, pageData []byte) {
+ var findPageFileContentURLs func([]byte) []url.URL = func(pageBody []byte) []url.URL {
+ var urls []url.URL
+
+ for _, link := range web.FindPageLinksDontResolve(pageData) {
+ if strings.Contains(link.Path, ".css") ||
+ strings.Contains(link.Path, ".scss") ||
+ strings.Contains(link.Path, ".js") ||
+ strings.Contains(link.Path, ".mjs") {
+ urls = append(urls, link)
+ }
+ }
+ urls = append(urls, web.FindPageSrcLinksDontResolve(pageBody)...)
+
+ return urls
+ }
+
+ var cleanLink func(url.URL, url.URL) url.URL = func(link url.URL, from url.URL) url.URL {
+ resolvedLink := web.ResolveLink(link, from.Host)
+ cleanLink, err := url.Parse(resolvedLink.Scheme + "://" + resolvedLink.Host + resolvedLink.Path)
if err != nil {
- logger.Error("Failed to create page of \"%s\": %s", baseURL.String(), err)
- return
+ return resolvedLink
}
- defer pageFile.Close()
+ return *cleanLink
+ }
+
+ // Create directory with all file content on the page
+ var pageFilesDirectoryName string = fmt.Sprintf(
+ "%s_%s_files",
+ baseURL.Host,
+ strings.ReplaceAll(baseURL.Path, "/", "_"),
+ )
+ err := os.MkdirAll(filepath.Join(w.Conf.Save.OutputDir, config.SavePagesDir, pageFilesDirectoryName), os.ModePerm)
+ if err != nil {
+ logger.Error("Failed to create directory to store file contents of %s: %s", baseURL.String(), err)
+ return
+ }
+
+ // Save files on page
+ srcLinks := findPageFileContentURLs(pageData)
+ for _, srcLink := range srcLinks {
+ web.FetchFile(srcLink.String(),
+ w.Conf.Requests.UserAgent,
+ w.Conf.Requests.ContentFetchTimeoutMs,
+ filepath.Join(
+ w.Conf.Save.OutputDir,
+ config.SavePagesDir,
+ pageFilesDirectoryName,
+ path.Base(srcLink.String()),
+ ),
+ )
+ }
- pageFile.Write(pageData)
+ // Redirect old content URLs to local files
+ for _, srcLink := range srcLinks {
+ cleanLink := cleanLink(srcLink, baseURL)
+ pageData = bytes.ReplaceAll(
+ pageData,
+ []byte(srcLink.String()),
+ []byte("./"+filepath.Join(pageFilesDirectoryName, path.Base(cleanLink.String()))),
+ )
+ }
- logger.Info("Saved \"%s\"", pageName)
- w.stats.PagesSaved++
+ // Create page output file
+ pageName := fmt.Sprintf(
+ "%s_%s.html",
+ baseURL.Host,
+ strings.ReplaceAll(baseURL.Path, "/", "_"),
+ )
+ outfile, err := os.Create(filepath.Join(
+ filepath.Join(w.Conf.Save.OutputDir, config.SavePagesDir),
+ pageName,
+ ))
+ if err != nil {
+ fmt.Printf("Failed to create output file: %s\n", err)
}
+ defer outfile.Close()
+
+ outfile.Write(pageData)
+
+ logger.Info("Saved \"%s\"", pageName)
+ w.stats.PagesSaved++
}
-func (w *Worker) saveResult(result web.Result) {
- // write result to the output file
+const (
+ textTypeMatch = iota
+ textTypeEmail = iota
+)
+// Save text result to an appropriate file
+func (w *Worker) saveResult(result web.Result, textType int) {
+ // write result to the output file
var output io.Writer
- if result.Search.Query == config.QueryEmail {
+ switch textType {
+ case textTypeEmail:
output = w.Conf.EmailsOutput
- } else {
+
+ default:
output = w.Conf.TextOutput
}
@@ -257,7 +333,7 @@ func (w *Worker) Work() {
}
// find links
- pageLinks := web.FindPageLinks(pageData, pageURL)
+ pageLinks := web.FindPageLinks(pageData, *pageURL)
go func() {
if job.Depth > 1 {
// decrement depth and add new jobs
@@ -267,9 +343,9 @@ func (w *Worker) Work() {
// add to the visit queue
w.Conf.VisitQueue.Lock.Lock()
for _, link := range pageLinks {
- if link != job.URL {
+ if link.String() != job.URL {
err = queue.InsertNewJob(w.Conf.VisitQueue.VisitQueue, web.Job{
- URL: link,
+ URL: link.String(),
Search: *w.Conf.Search,
Depth: job.Depth,
})
@@ -283,9 +359,9 @@ func (w *Worker) Work() {
} else {
// add to the in-memory channel
for _, link := range pageLinks {
- if link != job.URL {
+ if link.String() != job.URL {
w.Jobs <- web.Job{
- URL: link,
+ URL: link.String(),
Search: *w.Conf.Search,
Depth: job.Depth,
}
@@ -301,9 +377,12 @@ func (w *Worker) Work() {
var savePage bool = false
switch job.Search.Query {
+ case config.QueryArchive:
+ savePage = true
+
case config.QueryImages:
// find image URLs, output images to the file while not saving already outputted ones
- imageLinks := web.FindPageImages(pageData, pageURL)
+ imageLinks := web.FindPageImages(pageData, *pageURL)
if len(imageLinks) > 0 {
w.saveContent(imageLinks, pageURL)
savePage = true
@@ -312,7 +391,7 @@ func (w *Worker) Work() {
case config.QueryVideos:
// search for videos
// find video URLs, output videos to the files while not saving already outputted ones
- videoLinks := web.FindPageVideos(pageData, pageURL)
+ videoLinks := web.FindPageVideos(pageData, *pageURL)
if len(videoLinks) > 0 {
w.saveContent(videoLinks, pageURL)
savePage = true
@@ -321,7 +400,7 @@ func (w *Worker) Work() {
case config.QueryAudio:
// search for audio
// find audio URLs, output audio to the file while not saving already outputted ones
- audioLinks := web.FindPageAudio(pageData, pageURL)
+ audioLinks := web.FindPageAudio(pageData, *pageURL)
if len(audioLinks) > 0 {
w.saveContent(audioLinks, pageURL)
savePage = true
@@ -330,7 +409,7 @@ func (w *Worker) Work() {
case config.QueryDocuments:
// search for various documents
// find documents URLs, output docs to the file while not saving already outputted ones
- docsLinks := web.FindPageDocuments(pageData, pageURL)
+ docsLinks := web.FindPageDocuments(pageData, *pageURL)
if len(docsLinks) > 0 {
w.saveContent(docsLinks, pageURL)
savePage = true
@@ -344,7 +423,7 @@ func (w *Worker) Work() {
PageURL: job.URL,
Search: job.Search,
Data: emailAddresses,
- })
+ }, textTypeEmail)
w.stats.MatchesFound += uint64(len(emailAddresses))
savePage = true
}
@@ -353,11 +432,11 @@ func (w *Worker) Work() {
// search for everything
// files
- var contentLinks []string
- contentLinks = append(contentLinks, web.FindPageImages(pageData, pageURL)...)
- contentLinks = append(contentLinks, web.FindPageAudio(pageData, pageURL)...)
- contentLinks = append(contentLinks, web.FindPageVideos(pageData, pageURL)...)
- contentLinks = append(contentLinks, web.FindPageDocuments(pageData, pageURL)...)
+ var contentLinks []url.URL
+ contentLinks = append(contentLinks, web.FindPageImages(pageData, *pageURL)...)
+ contentLinks = append(contentLinks, web.FindPageAudio(pageData, *pageURL)...)
+ contentLinks = append(contentLinks, web.FindPageVideos(pageData, *pageURL)...)
+ contentLinks = append(contentLinks, web.FindPageDocuments(pageData, *pageURL)...)
w.saveContent(contentLinks, pageURL)
if len(contentLinks) > 0 {
@@ -371,7 +450,7 @@ func (w *Worker) Work() {
PageURL: job.URL,
Search: job.Search,
Data: emailAddresses,
- })
+ }, textTypeEmail)
w.stats.MatchesFound += uint64(len(emailAddresses))
savePage = true
}
@@ -393,7 +472,7 @@ func (w *Worker) Work() {
PageURL: job.URL,
Search: job.Search,
Data: matches,
- })
+ }, textTypeMatch)
logger.Info("Found matches: %+v", matches)
w.stats.MatchesFound += uint64(len(matches))
savePage = true
@@ -405,7 +484,7 @@ func (w *Worker) Work() {
PageURL: job.URL,
Search: job.Search,
Data: []string{job.Search.Query},
- })
+ }, textTypeMatch)
logger.Info("Found \"%s\" on page", job.Search.Query)
w.stats.MatchesFound++
savePage = true
@@ -414,8 +493,8 @@ func (w *Worker) Work() {
}
// save page
- if savePage {
- w.savePage(pageURL, pageData)
+ if savePage && w.Conf.Save.SavePages {
+ w.savePage(*pageURL, pageData)
}
pageData = nil
pageURL = nil