
NO DEPENDENCIES!; audio and video search; separate timeout for file fetching

branch: master
commit: 91112b89ba
13 changed files:

1. Makefile (1 line changed)
2. README.md (15 lines changed)
3. src/config/config.go (19 lines changed)
4. src/go.mod (2 lines changed)
5. src/go.sum (2 lines changed)
6. src/main.go (43 lines changed)
7. src/web/audio.go (118 lines changed)
8. src/web/extentions.go (88 lines changed)
9. src/web/images.go (103 lines changed)
10. src/web/requests.go (40 lines changed)
11. src/web/text.go (50 lines changed)
12. src/web/videos.go (118 lines changed)
13. src/worker/worker.go (120 lines changed)

Makefile (1 line changed)

@@ -11,7 +11,6 @@ DARWINDIR:=$(EXE)_darwin
 LINUXDIR32:=$(LINUXDIR)_x32
 WINDIR32:=$(WINDIR)_x32
-DARWINDIR32:=$(DARWINDIR)_x32
 LINUXDIR64:=$(LINUXDIR)_x64
 WINDIR64:=$(WINDIR)_x64

README.md (15 lines changed)

@@ -6,7 +6,7 @@ Just a simple HTML web spider with minimal dependencies. It is possible to searc
 
 ## Configuration
-The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `wDir` (working directory) flag is set to some other value.
+The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `-wDir` (working directory) flag is set to some other value.
 
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them.

@@ -14,20 +14,25 @@ The parsing starts from `initial_pages` and goes deeper while ignoring the pages
 
 ### Search query
-if `is_regexp` is `false`, then `query` is the text to be searched for, but there are some special values:
+There are some special `query` values:
 
 - `links` - tells `wecr` to search for all links there are on the page
-- `images` - find all image links and output to the `output_dir` (**IMPORTANT**: set `wait_timeout_ms` to `0` so the images load fully)
+- `images` - find all images on pages and output to the corresponding directory in `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) load fully)
+- `videos` - find and fetch files that look like videos
+- `audio` - find and fetch files that look like audio
 
 When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
 
 ### Output
-By default, if the query is not `images` all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images` - the additional contents will be put in the `output_dir` directory neatly created by the executable's side.
+By default, if the query is not something of special values all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images`, `videos`, `audio`, etc. - the additional contents will be put in the corresponding directories inside `output_dir`, which is neatly created by the executable's side.
 
 ## TODO
-- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)**
+- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - [x]
+- Search for videos - [x]
+- Search for audio - [x]
+- Search for documents - []
 
 ## License
 AGPLv3
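The README above refers to the renamed and newly added request settings by their JSON keys. A quick way to see the exact keys and defaults is to dump the embedded default configuration; a minimal sketch, assuming the `config.Default` and `Conf.WriteTo` helpers from `src/config/config.go` below and an import path of `unbewohnte/wecr/config`:

```go
package main

import (
	"os"

	"unbewohnte/wecr/config"
)

func main() {
	// print the built-in default configuration as JSON; the output should
	// contain "request_wait_timeout_ms" and the new "content_fetch_timeout_ms"
	conf := config.Default()
	if err := conf.WriteTo(os.Stdout); err != nil {
		panic(err)
	}
}
```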

src/config/config.go (19 lines changed)

@@ -1,6 +1,6 @@
 /*
 Wecr - crawl the web for data
-Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by

@@ -27,6 +27,15 @@ import (
 const (
 	QueryLinks  string = "links"
 	QueryImages string = "images"
+	QueryVideos string = "videos"
+	QueryAudio  string = "audio"
+)
+
+const (
+	SavePagesDir  string = "pages"
+	SaveImagesDir string = "images"
+	SaveVideosDir string = "videos"
+	SaveAudioDir  string = "audio"
 )
 
 type Search struct {

@@ -41,8 +50,9 @@ type Save struct {
 }
 
 type Requests struct {
-	WaitTimeoutMs         uint64 `json:"wait_timeout_ms"`
+	RequestWaitTimeoutMs  uint64 `json:"request_wait_timeout_ms"`
 	RequestPauseMs        uint64 `json:"request_pause_ms"`
+	ContentFetchTimeoutMs uint64 `json:"content_fetch_timeout_ms"`
 	UserAgent             string `json:"user_agent"`
 }
 

@@ -78,8 +88,9 @@ func Default() *Conf {
 		},
 		Requests: Requests{
 			UserAgent:      "",
-			WaitTimeoutMs:  1500,
+			RequestWaitTimeoutMs:  1500,
 			RequestPauseMs: 100,
+			ContentFetchTimeoutMs: 0,
 		},
 		InitialPages: []string{""},
 		Depth:        5,

@@ -95,7 +106,7 @@ func Default() *Conf {
 
 // Write current configuration to w
 func (c *Conf) WriteTo(w io.Writer) error {
-	jsonData, err := json.MarshalIndent(c, "", " ")
+	jsonData, err := json.MarshalIndent(c, " ", "\t")
 	if err != nil {
 		return err
 	}

src/go.mod (2 lines changed)

@@ -1,5 +1,3 @@
 module unbewohnte/wecr
 
 go 1.18
-
-require golang.org/x/net v0.4.0

src/go.sum (2 lines changed)

@@ -1,2 +0,0 @@
-golang.org/x/net v0.4.0 h1:Q5QPcMlvfxFTAPV0+07Xz/MpK9NTXu2VDUuy0FeMfaU=
-golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE=

src/main.go (43 lines changed)

@@ -1,6 +1,6 @@
 /*
 Wecr - crawl the web for data
-Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by

@@ -36,7 +36,7 @@ import (
 	"unbewohnte/wecr/worker"
 )
 
-const version = "v0.1.4"
+const version = "v0.2.0"
 
 const (
 	defaultConfigFile string = "conf.json"

@@ -82,7 +82,7 @@ func init() {
 	if *printVersion {
 		fmt.Printf(
-			"Wecr %s - crawl the web for data\n(c) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n",
+			"Wecr %s - crawl the web for data\n(c) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n",
 			version,
 		)
 		os.Exit(0)

@@ -97,7 +97,7 @@ func init() {
 		`),
 	)
-	logger.GetOutput().Write([]byte(version + "\n\n"))
+	logger.GetOutput().Write([]byte(version + " by Unbewohnte\n\n"))
 
 	// work out working directory path
 	if *wDir != "" {

@@ -240,6 +240,7 @@ func main() {
 		logger.Warning("User agent is not set. Forced to \"%s\"", conf.Requests.UserAgent)
 	}
 
+	// create output directories and corresponding specialized ones
 	if !filepath.IsAbs(conf.Save.OutputDir) {
 		conf.Save.OutputDir = filepath.Join(workingDirectory, conf.Save.OutputDir)
 	}

@@ -249,11 +250,39 @@
 		return
 	}
 
+	err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SavePagesDir), os.ModePerm)
+	if err != nil {
+		logger.Error("Failed to create output directory for pages: %s", err)
+		return
+	}
+
+	err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveImagesDir), os.ModePerm)
+	if err != nil {
+		logger.Error("Failed to create output directory for images: %s", err)
+		return
+	}
+
+	err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveVideosDir), os.ModePerm)
+	if err != nil {
+		logger.Error("Failed to create output directory for video: %s", err)
+		return
+	}
+
+	err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveAudioDir), os.ModePerm)
+	if err != nil {
+		logger.Error("Failed to create output directory for audio: %s", err)
+		return
+	}
+
 	switch conf.Search.Query {
 	case config.QueryLinks:
 		logger.Info("Looking for links")
 	case config.QueryImages:
-		logger.Info("Looking for images")
+		logger.Info("Looking for images (%+s)", web.ImageExtentions)
+	case config.QueryVideos:
+		logger.Info("Looking for videos (%+s)", web.VideoExtentions)
+	case config.QueryAudio:
+		logger.Info("Looking for audio (%+s)", web.AudioExtentions)
 	default:
 		if conf.Search.IsRegexp {
 			logger.Info("Looking for RegExp matches (%s)", conf.Search.Query)

@@ -319,7 +348,7 @@ func main() {
 			timeSince := time.Since(workerPool.Stats.StartTime).Round(time.Second)
 
-			fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d saved; %d matches (%d pages/sec)",
+			fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d pages saved; %d matches (%d pages/sec)",
 				timeSince.String(),
 				workerPool.Stats.PagesVisited,
 				workerPool.Stats.PagesSaved,

@@ -338,7 +367,7 @@
 		}
 
 		// each entry in output file is a self-standing JSON object
-		entryBytes, err := json.MarshalIndent(result, "", " ")
+		entryBytes, err := json.MarshalIndent(result, " ", "\t")
 		if err != nil {
 			continue
 		}
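The four `os.MkdirAll` calls above could also be written as one loop over the new `config.Save*Dir` constants; a hypothetical consolidation, not part of this commit:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"

	"unbewohnte/wecr/config"
)

// createOutputDirs is a hypothetical helper that creates every specialized
// output directory introduced by this commit under outputDir
func createOutputDirs(outputDir string) error {
	for _, subDir := range []string{
		config.SavePagesDir,
		config.SaveImagesDir,
		config.SaveVideosDir,
		config.SaveAudioDir,
	} {
		if err := os.MkdirAll(filepath.Join(outputDir, subDir), os.ModePerm); err != nil {
			return fmt.Errorf("failed to create output directory %q: %w", subDir, err)
		}
	}
	return nil
}

func main() {
	if err := createOutputDirs("output"); err != nil {
		fmt.Println(err)
	}
}
```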

src/web/audio.go (new file, 118 lines)

@@ -0,0 +1,118 @@
/*
Wecr - crawl the web for data
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package web
import (
"net/url"
"strings"
)
func hasAudioExtention(url string) bool {
for _, extention := range AudioExtentions {
if strings.HasSuffix(url, extention) {
return true
}
}
return false
}
// Tries to find audio URLs on the page
func FindPageAudio(pageBody []byte, from *url.URL) []string {
var urls []string
// for every element that has "src" attribute
for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
var linkStartIndex int
var linkEndIndex int
linkStartIndex = strings.Index(match, "\"")
if linkStartIndex == -1 {
linkStartIndex = strings.Index(match, "'")
if linkStartIndex == -1 {
continue
}
linkEndIndex = strings.LastIndex(match, "'")
if linkEndIndex == -1 {
continue
}
} else {
linkEndIndex = strings.LastIndex(match, "\"")
if linkEndIndex == -1 {
continue
}
}
if linkEndIndex <= linkStartIndex+1 {
continue
}
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
if err != nil {
continue
}
linkResolved := ResolveLink(link, from.Host)
if hasAudioExtention(linkResolved) {
urls = append(urls, linkResolved)
}
}
// for every "a" element as well
for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
var linkStartIndex int
var linkEndIndex int
linkStartIndex = strings.Index(match, "\"")
if linkStartIndex == -1 {
linkStartIndex = strings.Index(match, "'")
if linkStartIndex == -1 {
continue
}
linkEndIndex = strings.LastIndex(match, "'")
if linkEndIndex == -1 {
continue
}
} else {
linkEndIndex = strings.LastIndex(match, "\"")
if linkEndIndex == -1 {
continue
}
}
if linkEndIndex <= linkStartIndex+1 {
continue
}
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
if err != nil {
continue
}
linkResolved := ResolveLink(link, from.Host)
if hasAudioExtention(linkResolved) {
urls = append(urls, linkResolved)
}
}
// return discovered mutual audio urls
return urls
}
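`FindPageAudio` scans both `src` and `href` attributes and keeps only links whose suffix appears in `AudioExtentions`. A small usage sketch, assuming the package is importable as `unbewohnte/wecr/web`:

```go
package main

import (
	"fmt"
	"net/url"

	"unbewohnte/wecr/web"
)

func main() {
	// one double-quoted src attribute and one single-quoted href,
	// both pointing at extensions listed in web.AudioExtentions
	page := []byte(`<html><body>
		<audio src="https://example.org/sound.ogg"></audio>
		<a href='/files/track.mp3'>listen</a>
	</body></html>`)

	from, _ := url.Parse("https://example.org/music")

	// absolute links come back unchanged; relative ones are resolved
	// against the page host by ResolveLink
	for _, link := range web.FindPageAudio(page, from) {
		fmt.Println(link)
	}
}
```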

src/web/extentions.go (new file, 88 lines)

@@ -0,0 +1,88 @@
/*
Wecr - crawl the web for data
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package web
var AudioExtentions = []string{
".3gp",
".aa",
".aac",
".aax",
".act",
".aiff",
".alac",
".amr",
".ape",
".au",
".flac",
".m4a",
".mp3",
".mpc",
".msv",
".ogg",
".oga",
".mogg",
".opus",
".tta",
".wav",
".cda",
}
var ImageExtentions = []string{
".jpeg",
".jpg",
".jpe",
".jfif",
".png",
".ppm",
".svg",
".gif",
".tiff",
".bmp",
".webp",
".ico",
".kra",
".bpg",
".drw",
".tga",
".kra",
}
var VideoExtentions = []string{
".webm",
".mkv",
".flv",
".wmv",
".avi",
".yuv",
".mp2",
".mp4",
".mpeg",
".mpg",
".mpv",
".m4v",
".3gp",
".3g2",
".nsv",
".vob",
".ogv",
}
var DocumentExtentions = []string{
"",
}
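Each media type gets its own `has*Extention` helper (in `audio.go`, `videos.go`, `images.go`) that loops over one of these slices. A single generic check could cover all three; a hypothetical sketch:

```go
package web

import "strings"

// hasAnyOfExtentions reports whether url ends with one of the given
// extensions; hasImageExtention, hasVideoExtention and hasAudioExtention
// could all delegate to it
func hasAnyOfExtentions(url string, extentions []string) bool {
	for _, extention := range extentions {
		if strings.HasSuffix(url, extention) {
			return true
		}
	}
	return false
}
```

For example, `hasAnyOfExtentions(link, AudioExtentions)` would replace `hasAudioExtention(link)`.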

src/web/images.go (103 lines changed)

@@ -1,6 +1,6 @@
 /*
 Wecr - crawl the web for data
-Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by

@@ -19,30 +19,12 @@
 package web
 
 import (
-	"bytes"
 	"net/url"
 	"strings"
-
-	"golang.org/x/net/html"
 )
 
 func hasImageExtention(url string) bool {
-	var extentions []string = []string{
-		".jpeg",
-		".jpg",
-		".jpe",
-		".jfif",
-		".png",
-		".ppm",
-		".svg",
-		".gif",
-		".tiff",
-		".bmp",
-		".webp",
-		".ico",
-	}
-
-	for _, extention := range extentions {
+	for _, extention := range ImageExtentions {
 		if strings.HasSuffix(url, extention) {
 			return true
 		}

@@ -55,43 +37,82 @@ func hasImageExtention(url string) bool {
 func FindPageImages(pageBody []byte, from *url.URL) []string {
 	var urls []string
 
-	tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
-	for {
-		tokenType := tokenizer.Next()
-
-		switch tokenType {
-		case html.ErrorToken:
-			return urls
-
-		case html.StartTagToken:
-			token := tokenizer.Token()
-
-			if token.Data != "img" && token.Data != "a" {
-				continue
-			}
-
-			for _, attribute := range token.Attr {
-				if attribute.Key != "src" && attribute.Key != "href" {
-					continue
-				}
-
-				imageURL, err := url.Parse(attribute.Val)
-				if err != nil {
-					break
-				}
-
-				imageURLString := ResolveLink(imageURL, from.Host)
-
-				if token.Data == "img" {
-					// <img> tag -> don't check
-					urls = append(urls, imageURLString)
-				} else {
-					// <a> tag -> check for image extention
-					if hasImageExtention(imageURLString) {
-						urls = append(urls, imageURLString)
-					}
-				}
-			}
-		}
-	}
+	// for every element that has "src" attribute
+	for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
+		var linkStartIndex int
+		var linkEndIndex int
+
+		linkStartIndex = strings.Index(match, "\"")
+		if linkStartIndex == -1 {
+			linkStartIndex = strings.Index(match, "'")
+			if linkStartIndex == -1 {
+				continue
+			}
+
+			linkEndIndex = strings.LastIndex(match, "'")
+			if linkEndIndex == -1 {
+				continue
+			}
+		} else {
+			linkEndIndex = strings.LastIndex(match, "\"")
+			if linkEndIndex == -1 {
+				continue
+			}
+		}
+
+		if linkEndIndex <= linkStartIndex+1 {
+			continue
+		}
+
+		link, err := url.Parse(match)
+		if err != nil {
+			continue
+		}
+
+		linkResolved := ResolveLink(link, from.Host)
+		if hasImageExtention(linkResolved) {
+			urls = append(urls, linkResolved)
+		}
+	}
+
+	// for every "a" element as well
+	for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
+		var linkStartIndex int
+		var linkEndIndex int
+
+		linkStartIndex = strings.Index(match, "\"")
+		if linkStartIndex == -1 {
+			linkStartIndex = strings.Index(match, "'")
+			if linkStartIndex == -1 {
+				continue
+			}
+
+			linkEndIndex = strings.LastIndex(match, "'")
+			if linkEndIndex == -1 {
+				continue
+			}
+		} else {
+			linkEndIndex = strings.LastIndex(match, "\"")
+			if linkEndIndex == -1 {
+				continue
+			}
+		}
+
+		if linkEndIndex <= linkStartIndex+1 {
+			continue
+		}
+
+		link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+		if err != nil {
+			continue
+		}
+
+		linkResolved := ResolveLink(link, from.Host)
+		if hasImageExtention(linkResolved) {
+			urls = append(urls, linkResolved)
+		}
+	}
+
+	// return discovered mutual image urls from <img> and <a> tags
 	return urls
 }
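The quote-locating block (`strings.Index` for the opening `"` or `'`, `strings.LastIndex` for the closing one) is now repeated in `FindPageImages`, `FindPageVideos`, `FindPageAudio` and `FindPageLinks`. It could be factored into one helper; a hypothetical sketch of that shared step:

```go
package web

import "strings"

// extractAttributeValue returns the text between the quotes of a regexp match
// such as src="http://host/pic.png" or HrEf = 'pic.png'; false means the
// match should be skipped, mirroring the continue branches in the finders
func extractAttributeValue(match string) (string, bool) {
	quote := "\""
	start := strings.Index(match, quote)
	if start == -1 {
		quote = "'"
		start = strings.Index(match, quote)
		if start == -1 {
			return "", false
		}
	}

	end := strings.LastIndex(match, quote)
	if end == -1 || end <= start+1 {
		return "", false
	}

	return match[start+1 : end], true
}
```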

src/web/requests.go (40 lines changed)

@@ -1,6 +1,6 @@
 /*
 Wecr - crawl the web for data
-Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by

@@ -21,18 +21,24 @@ package web
 
 import (
 	"io"
 	"net/http"
+	"os"
 	"time"
 )
 
 // Get page data coming from url with optional user agent and timeout
 func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
+	// client := &http.Client{}
+	// client.CheckRedirect = http.DefaultClient.CheckRedirect
+	// client.Transport = http.DefaultClient.Transport
+	// client.Timeout = time.Duration(timeOutMs)
+
 	req, err := http.NewRequest("GET", url, nil)
 	if err != nil {
 		return nil, err
 	}
 	req.Header.Set("User-Agent", userAgent)
 
-	http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
+	// response, err := client.Do(req)
 	response, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return nil, err

@@ -46,3 +52,33 @@ func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
 
 	return responseBody, nil
 }
+
+// Fetch file from url and save to file at filePath
+func FetchFile(url string, userAgent string, timeOutMs uint64, filePath string) error {
+	client := http.Client{}
+	client.Timeout = time.Duration(timeOutMs)
+	client.CheckRedirect = http.DefaultClient.CheckRedirect
+	client.Transport = http.DefaultClient.Transport
+
+	req, err := http.NewRequest("GET", url, nil)
+	if err != nil {
+		return err
+	}
+	req.Header.Set("User-Agent", userAgent)
+
+	response, err := client.Do(req)
+	if err != nil {
+		return nil
+	}
+	defer response.Body.Close()
+
+	file, err := os.Create(filePath)
+	if err != nil {
+		return err
+	}
+	defer file.Close()
+
+	_, _ = io.Copy(file, response.Body)
+
+	return nil
+}
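Two details of the new `FetchFile` differ from `GetPage`: the timeout is assigned as `time.Duration(timeOutMs)`, which `time.Duration` interprets as nanoseconds rather than milliseconds, and a failed `client.Do` returns `nil` instead of the error. A sketch of a variant that mirrors `GetPage`'s millisecond conversion and propagates errors (hypothetical, not part of the commit):

```go
package web

import (
	"io"
	"net/http"
	"os"
	"time"
)

// FetchFileStrict is a hypothetical variant of FetchFile: millisecond
// timeout and no swallowed errors
func FetchFileStrict(url string, userAgent string, timeOutMs uint64, filePath string) error {
	client := http.Client{
		Timeout: time.Duration(timeOutMs * uint64(time.Millisecond)),
	}

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	req.Header.Set("User-Agent", userAgent)

	response, err := client.Do(req)
	if err != nil {
		return err
	}
	defer response.Body.Close()

	file, err := os.Create(filePath)
	if err != nil {
		return err
	}
	defer file.Close()

	_, err = io.Copy(file, response.Body)
	return err
}
```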

src/web/text.go (50 lines changed)

@@ -1,6 +1,6 @@
 /*
 Wecr - crawl the web for data
-Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by

@@ -24,10 +24,14 @@ import (
 	"net/url"
 	"regexp"
 	"strings"
-
-	"golang.org/x/net/html"
 )
 
+// matches href="link" or something down bad like hReF = 'link'
+var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|')(.*?)("|')`)
+
+// matches src="link" or even something along the lines of SrC = 'link'
+var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)
+
 // Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
 func ResolveLink(url *url.URL, fromHost string) string {

@@ -39,7 +43,6 @@ func ResolveLink(url *url.URL, fromHost string) string {
 		if url.Host == "" {
 			// add host
 			url.Host = fromHost
-
 		}
 	}

@@ -50,36 +53,41 @@
 func FindPageLinks(pageBody []byte, from *url.URL) []string {
 	var urls []string
 
-	tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
-	for {
-		tokenType := tokenizer.Next()
-
-		switch tokenType {
-		case html.ErrorToken:
-			return urls
-
-		case html.StartTagToken:
-			token := tokenizer.Token()
-
-			if token.Data != "a" {
-				continue
-			}
-
-			// recheck
-			for _, attribute := range token.Attr {
-				if attribute.Key != "href" {
-					continue
-				}
-
-				link, err := url.Parse(attribute.Val)
-				if err != nil {
-					break
-				}
-
-				urls = append(urls, ResolveLink(link, from.Host))
-			}
-		}
-	}
+	for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
+		var linkStartIndex int
+		var linkEndIndex int
+
+		linkStartIndex = strings.Index(match, "\"")
+		if linkStartIndex == -1 {
+			linkStartIndex = strings.Index(match, "'")
+			if linkStartIndex == -1 {
+				continue
+			}
+
+			linkEndIndex = strings.LastIndex(match, "'")
+			if linkEndIndex == -1 {
+				continue
+			}
+		} else {
+			linkEndIndex = strings.LastIndex(match, "\"")
+			if linkEndIndex == -1 {
+				continue
+			}
+		}
+
+		if linkEndIndex <= linkStartIndex+1 {
+			continue
+		}
+
+		link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+		if err != nil {
+			continue
+		}
+
+		urls = append(urls, ResolveLink(link, from.Host))
+	}
+
+	return urls
 }
 
 // Tries to find a certain string in page. Returns true if such string has been found
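To see what these patterns actually return, here is a standalone sketch using the same two regular expressions; the whole attribute (name, `=`, quotes) is part of the match, which is why the finder functions then locate the quote indices before parsing the URL:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// the same case-insensitive patterns compiled in text.go
	tagHrefRegexp := regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|')(.*?)("|')`)
	tagSrcRegexp := regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)

	page := `<a HrEf = 'https://example.org/page'>link</a> <img src="/cat.png">`

	fmt.Println(tagHrefRegexp.FindAllString(page, -1)) // [HrEf = 'https://example.org/page']
	fmt.Println(tagSrcRegexp.FindAllString(page, -1))  // [src="/cat.png"]
}
```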

src/web/videos.go (new file, 118 lines)

@@ -0,0 +1,118 @@
/*
Wecr - crawl the web for data
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package web
import (
"net/url"
"strings"
)
func hasVideoExtention(url string) bool {
for _, extention := range VideoExtentions {
if strings.HasSuffix(url, extention) {
return true
}
}
return false
}
// Tries to find videos' URLs on the page
func FindPageVideos(pageBody []byte, from *url.URL) []string {
var urls []string
// for every element that has "src" attribute
for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
var linkStartIndex int
var linkEndIndex int
linkStartIndex = strings.Index(match, "\"")
if linkStartIndex == -1 {
linkStartIndex = strings.Index(match, "'")
if linkStartIndex == -1 {
continue
}
linkEndIndex = strings.LastIndex(match, "'")
if linkEndIndex == -1 {
continue
}
} else {
linkEndIndex = strings.LastIndex(match, "\"")
if linkEndIndex == -1 {
continue
}
}
if linkEndIndex <= linkStartIndex+1 {
continue
}
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
if err != nil {
continue
}
linkResolved := ResolveLink(link, from.Host)
if hasVideoExtention(linkResolved) {
urls = append(urls, linkResolved)
}
}
// for every "a" element as well
for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
var linkStartIndex int
var linkEndIndex int
linkStartIndex = strings.Index(match, "\"")
if linkStartIndex == -1 {
linkStartIndex = strings.Index(match, "'")
if linkStartIndex == -1 {
continue
}
linkEndIndex = strings.LastIndex(match, "'")
if linkEndIndex == -1 {
continue
}
} else {
linkEndIndex = strings.LastIndex(match, "\"")
if linkEndIndex == -1 {
continue
}
}
if linkEndIndex <= linkStartIndex+1 {
continue
}
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
if err != nil {
continue
}
linkResolved := ResolveLink(link, from.Host)
if hasVideoExtention(linkResolved) {
urls = append(urls, linkResolved)
}
}
// return discovered mutual video urls
return urls
}

src/worker/worker.go (120 lines changed)

@@ -1,6 +1,6 @@
 /*
 Wecr - crawl the web for data
-Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by

@@ -20,8 +20,6 @@ package worker
 
 import (
 	"fmt"
-	"io"
-	"net/http"
 	"net/url"
 	"os"
 	"path"

@@ -63,20 +61,70 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
 	}
 }
 
+func (w *Worker) saveContent(contenType string, links []string, pageURL *url.URL) {
+	var alreadyProcessedUrls []string
+	for count, link := range links {
+		// check if this URL has been processed already
+		var skip bool = false
+
+		for _, processedURL := range alreadyProcessedUrls {
+			if link == processedURL {
+				skip = true
+				break
+			}
+		}
+
+		if skip {
+			skip = false
+			continue
+		}
+		alreadyProcessedUrls = append(alreadyProcessedUrls, link)
+
+		var fileName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(link))
+
+		var filePath string
+		switch contenType {
+		case config.QueryImages:
+			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveImagesDir, fileName)
+		case config.QueryVideos:
+			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName)
+		case config.QueryAudio:
+			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName)
+		default:
+			filePath = filepath.Join(w.Conf.Save.OutputDir, fileName)
+		}
+
+		err := web.FetchFile(
+			link,
+			w.Conf.Requests.UserAgent,
+			w.Conf.Requests.ContentFetchTimeoutMs,
+			filePath,
+		)
+		if err != nil {
+			logger.Error("Failed to fetch file at %s: %s", link, err)
+			return
+		}
+
+		logger.Info("Outputted \"%s\"", fileName)
+		w.stats.MatchesFound++
+	}
+}
+
 // Save page to the disk with a corresponding name
 func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
 	if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
 		var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String()))
-		pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, pageName))
+		pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, config.SavePagesDir, pageName))
 		if err != nil {
 			logger.Error("Failed to create page of \"%s\": %s", baseURL.String(), err)
-		} else {
-			pageFile.Write(pageData)
+			return
 		}
+		defer pageFile.Close()
 
-		pageFile.Close()
+		pageFile.Write(pageData)
 
 		logger.Info("Saved \"%s\"", pageName)
+		w.stats.PagesSaved++
 	}
 }

@@ -151,7 +199,7 @@ func (w *Worker) Work() {
 
 			// get page
 			logger.Info("Visiting %s", job.URL)
-			pageData, err := web.GetPage(job.URL, w.Conf.Requests.UserAgent, w.Conf.Requests.WaitTimeoutMs)
+			pageData, err := web.GetPage(job.URL, w.Conf.Requests.UserAgent, w.Conf.Requests.RequestWaitTimeoutMs)
			if err != nil {
 				logger.Error("Failed to get \"%s\": %s", job.URL, err)
 				continue

@@ -196,49 +244,26 @@
 			case config.QueryImages:
 				// find image URLs, output images to the file while not saving already outputted ones
 				imageLinks := web.FindPageImages(pageData, pageURL)
-
-				var alreadyProcessedImgUrls []string
-				for count, imageLink := range imageLinks {
-					// check if this URL has been processed already
-					var skipImage bool = false
-
-					for _, processedURL := range alreadyProcessedImgUrls {
-						if imageLink == processedURL {
-							skipImage = true
-							break
-						}
-					}
-
-					if skipImage {
-						skipImage = false
-						continue
-					}
-					alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
-
-					var imageName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(imageLink))
-
-					response, err := http.Get(imageLink)
-					if err != nil {
-						logger.Error("Failed to get image %s", imageLink)
-						continue
-					}
-
-					imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
-					if err != nil {
-						logger.Error("Failed to create image file \"%s\": %s", imageName, err)
-						continue
-					}
-
-					_, _ = io.Copy(imageFile, response.Body)
-
-					response.Body.Close()
-					imageFile.Close()
-
-					logger.Info("Outputted \"%s\"", imageName)
-					w.stats.MatchesFound++
-				}
-
+				w.saveContent(config.QueryImages, imageLinks, pageURL)
 				if len(imageLinks) > 0 {
 					savePage = true
 				}
+			case config.QueryVideos:
+				// search for videos
+				// find video URLs, output videos to the files while not saving already outputted ones
+				videoLinks := web.FindPageVideos(pageData, pageURL)
+				w.saveContent(config.QueryVideos, videoLinks, pageURL)
+				if len(videoLinks) > 0 {
+					savePage = true
+				}
+			case config.QueryAudio:
+				// search for audio
+				// find audio URLs, output audio to the file while not saving already outputted ones
+				audioLinks := web.FindPageAudio(pageData, pageURL)
+				w.saveContent(config.QueryAudio, audioLinks, pageURL)
+				if len(audioLinks) > 0 {
+					savePage = true
+				}

@@ -284,7 +309,6 @@ func (w *Worker) Work() {
 			// save page
 			if savePage {
 				w.savePage(pageURL, pageData)
-				w.stats.PagesSaved++
 			}
 
 			// sleep before the next request
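`saveContent` deduplicates links with a linear scan over `alreadyProcessedUrls`; for pages with many links a set gives O(1) membership checks. A hypothetical alternative, not part of the commit:

```go
package main

import "fmt"

// dedupLinks keeps the first occurrence of every link, using a map as a set
// instead of rescanning a slice for each element
func dedupLinks(links []string) []string {
	seen := make(map[string]struct{}, len(links))
	unique := make([]string, 0, len(links))
	for _, link := range links {
		if _, ok := seen[link]; ok {
			continue
		}
		seen[link] = struct{}{}
		unique = append(unique, link)
	}
	return unique
}

func main() {
	fmt.Println(dedupLinks([]string{"a.mp3", "b.mp3", "a.mp3"})) // [a.mp3 b.mp3]
}
```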
