From 91112b89ba37b87766c2f369b1f4e1e72b372464 Mon Sep 17 00:00:00 2001 From: Unbewohnte Date: Sat, 14 Jan 2023 20:30:28 +0300 Subject: [PATCH] NO DEPENDENCIES !; Audio, and video search; separate timeout for file fetching --- Makefile | 1 - README.md | 15 ++++-- src/config/config.go | 27 +++++++--- src/go.mod | 2 - src/go.sum | 2 - src/main.go | 43 ++++++++++++--- src/web/audio.go | 118 ++++++++++++++++++++++++++++++++++++++++ src/web/extentions.go | 88 ++++++++++++++++++++++++++++++ src/web/images.go | 121 ++++++++++++++++++++++++----------------- src/web/requests.go | 40 +++++++++++++- src/web/text.go | 60 ++++++++++++--------- src/web/videos.go | 118 ++++++++++++++++++++++++++++++++++++++++ src/worker/worker.go | 122 +++++++++++++++++++++++++----------------- 13 files changed, 605 insertions(+), 152 deletions(-) create mode 100644 src/web/audio.go create mode 100644 src/web/extentions.go create mode 100644 src/web/videos.go diff --git a/Makefile b/Makefile index a794f1d..f232d8f 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,6 @@ DARWINDIR:=$(EXE)_darwin LINUXDIR32:=$(LINUXDIR)_x32 WINDIR32:=$(WINDIR)_x32 -DARWINDIR32:=$(DARWINDIR)_x32 LINUXDIR64:=$(LINUXDIR)_x64 WINDIR64:=$(WINDIR)_x64 diff --git a/README.md b/README.md index cad5fb5..88dff72 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Just a simple HTML web spider with minimal dependencies. It is possible to searc ## Configuration -The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `wDir` (working directory) flag is set to some other value. +The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `-wDir` (working directory) flag is set to some other value. The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them. 
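For illustration, a minimal `conf.json` sketch assembled from the branch and field names mentioned in this patch (the concrete values are placeholders, and any key not visible in the diff or the README is an assumption rather than the definitive schema):

```json
{
  "search": { "is_regexp": false, "query": "images" },
  "requests": {
    "request_wait_timeout_ms": 1500,
    "request_pause_ms": 100,
    "content_fetch_timeout_ms": 0,
    "user_agent": ""
  },
  "save": { "output_dir": "output", "output_file": "output.json", "save_pages": false },
  "initial_pages": ["https://example.com/"],
  "depth": 5,
  "workers": 20
}
```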
@@ -14,20 +14,25 @@ The parsing starts from `initial_pages` and goes deeper while ignoring the pages ### Search query -if `is_regexp` is `false`, then `query` is the text to be searched for, but there are some special values: +There are some special `query` values: - `links` - tells `wecr` to search for all links there are on the page -- `images` - find all image links and output to the `output_dir` (**IMPORTANT**: set `wait_timeout_ms` to `0` so the images load fully) +- `images` - find all images on pages and output to the corresponding directory in `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) load fully) +- `videos` - find and fetch files that look like videos +- `audio` - find and fetch files that look like audio When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it. ### Output -By default, if the query is not `images` all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images` - the additional contents will be put in the `output_dir` directory neatly created by the executable's side. +By default, if the query is not something of special values all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images`, `videos`, `audio`, etc. - the additional contents will be put in the corresponding directories inside `output_dir`, which is neatly created by the executable's side. ## TODO -- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** +- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - [x] +- Search for videos - [x] +- Search for audio - [x] +- Search for documents - [] ## License AGPLv3 \ No newline at end of file diff --git a/src/config/config.go b/src/config/config.go index 1f0eed3..3fdca70 100644 --- a/src/config/config.go +++ b/src/config/config.go @@ -1,6 +1,6 @@ /* Wecr - crawl the web for data - Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte) + Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -27,6 +27,15 @@ import ( const ( QueryLinks string = "links" QueryImages string = "images" + QueryVideos string = "videos" + QueryAudio string = "audio" +) + +const ( + SavePagesDir string = "pages" + SaveImagesDir string = "images" + SaveVideosDir string = "videos" + SaveAudioDir string = "audio" ) type Search struct { @@ -41,9 +50,10 @@ type Save struct { } type Requests struct { - WaitTimeoutMs uint64 `json:"wait_timeout_ms"` - RequestPauseMs uint64 `json:"request_pause_ms"` - UserAgent string `json:"user_agent"` + RequestWaitTimeoutMs uint64 `json:"request_wait_timeout_ms"` + RequestPauseMs uint64 `json:"request_pause_ms"` + ContentFetchTimeoutMs uint64 `json:"content_fetch_timeout_ms"` + UserAgent string `json:"user_agent"` } type Logging struct { @@ -77,9 +87,10 @@ func Default() *Conf { OutputFile: "scraped.json", }, Requests: Requests{ - UserAgent: "", - WaitTimeoutMs: 1500, - RequestPauseMs: 100, + UserAgent: "", + RequestWaitTimeoutMs: 1500, + RequestPauseMs: 100, + ContentFetchTimeoutMs: 0, }, InitialPages: []string{""}, Depth: 5, @@ -95,7 +106,7 @@ func Default() *Conf { // Write current configuration to w func (c *Conf) WriteTo(w io.Writer) error { - 
jsonData, err := json.MarshalIndent(c, "", " ") + jsonData, err := json.MarshalIndent(c, " ", "\t") if err != nil { return err } diff --git a/src/go.mod b/src/go.mod index dc03b88..fcda8f0 100644 --- a/src/go.mod +++ b/src/go.mod @@ -1,5 +1,3 @@ module unbewohnte/wecr go 1.18 - -require golang.org/x/net v0.4.0 diff --git a/src/go.sum b/src/go.sum index 276f46f..e69de29 100644 --- a/src/go.sum +++ b/src/go.sum @@ -1,2 +0,0 @@ -golang.org/x/net v0.4.0 h1:Q5QPcMlvfxFTAPV0+07Xz/MpK9NTXu2VDUuy0FeMfaU= -golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= diff --git a/src/main.go b/src/main.go index 31d22a9..4ea4404 100644 --- a/src/main.go +++ b/src/main.go @@ -1,6 +1,6 @@ /* Wecr - crawl the web for data - Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte) + Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -36,7 +36,7 @@ import ( "unbewohnte/wecr/worker" ) -const version = "v0.1.4" +const version = "v0.2.0" const ( defaultConfigFile string = "conf.json" @@ -82,7 +82,7 @@ func init() { if *printVersion { fmt.Printf( - "Wecr %s - crawl the web for data\n(c) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n", + "Wecr %s - crawl the web for data\n(c) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n", version, ) os.Exit(0) @@ -97,7 +97,7 @@ func init() { ╚███╔███╔╝███████╗╚██████╗██║ ██║ ╚══╝╚══╝ ╚══════╝ ╚═════╝╚═╝ ╚═╝`), ) - logger.GetOutput().Write([]byte(version + "\n\n")) + logger.GetOutput().Write([]byte(version + " by Unbewohnte\n\n")) // work out working directory path if *wDir != "" { @@ -240,6 +240,7 @@ func main() { logger.Warning("User agent is not set. 
Forced to \"%s\"", conf.Requests.UserAgent) } + // create output directories and corresponding specialized ones if !filepath.IsAbs(conf.Save.OutputDir) { conf.Save.OutputDir = filepath.Join(workingDirectory, conf.Save.OutputDir) } @@ -249,11 +250,39 @@ func main() { return } + err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SavePagesDir), os.ModePerm) + if err != nil { + logger.Error("Failed to create output directory for pages: %s", err) + return + } + + err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveImagesDir), os.ModePerm) + if err != nil { + logger.Error("Failed to create output directory for images: %s", err) + return + } + + err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveVideosDir), os.ModePerm) + if err != nil { + logger.Error("Failed to create output directory for video: %s", err) + return + } + + err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveAudioDir), os.ModePerm) + if err != nil { + logger.Error("Failed to create output directory for audio: %s", err) + return + } + switch conf.Search.Query { case config.QueryLinks: logger.Info("Looking for links") case config.QueryImages: - logger.Info("Looking for images") + logger.Info("Looking for images (%+s)", web.ImageExtentions) + case config.QueryVideos: + logger.Info("Looking for videos (%+s)", web.VideoExtentions) + case config.QueryAudio: + logger.Info("Looking for audio (%+s)", web.AudioExtentions) default: if conf.Search.IsRegexp { logger.Info("Looking for RegExp matches (%s)", conf.Search.Query) @@ -319,7 +348,7 @@ func main() { timeSince := time.Since(workerPool.Stats.StartTime).Round(time.Second) - fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d saved; %d matches (%d pages/sec)", + fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d pages saved; %d matches (%d pages/sec)", timeSince.String(), workerPool.Stats.PagesVisited, workerPool.Stats.PagesSaved, @@ -338,7 +367,7 @@ func main() { } // each entry in output file is a self-standing JSON object - entryBytes, err := json.MarshalIndent(result, "", " ") + entryBytes, err := json.MarshalIndent(result, " ", "\t") if err != nil { continue } diff --git a/src/web/audio.go b/src/web/audio.go new file mode 100644 index 0000000..c673c55 --- /dev/null +++ b/src/web/audio.go @@ -0,0 +1,118 @@ +/* + Wecr - crawl the web for data + Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+*/ + +package web + +import ( + "net/url" + "strings" +) + +func hasAudioExtention(url string) bool { + for _, extention := range AudioExtentions { + if strings.HasSuffix(url, extention) { + return true + } + } + + return false +} + +// Tries to find audio URLs on the page +func FindPageAudio(pageBody []byte, from *url.URL) []string { + var urls []string + + // for every element that has "src" attribute + for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int + + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { + continue + } + + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue + } + } + + if linkEndIndex <= linkStartIndex+1 { + continue + } + + link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) + if err != nil { + continue + } + + linkResolved := ResolveLink(link, from.Host) + if hasAudioExtention(linkResolved) { + urls = append(urls, linkResolved) + } + } + + // for every "a" element as well + for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int + + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { + continue + } + + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue + } + } + + if linkEndIndex <= linkStartIndex+1 { + continue + } + + link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) + if err != nil { + continue + } + + linkResolved := ResolveLink(link, from.Host) + if hasAudioExtention(linkResolved) { + urls = append(urls, linkResolved) + } + } + + // return discovered mutual video urls + return urls +} diff --git a/src/web/extentions.go b/src/web/extentions.go new file mode 100644 index 0000000..08cb815 --- /dev/null +++ b/src/web/extentions.go @@ -0,0 +1,88 @@ +/* + Wecr - crawl the web for data + Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+*/ + +package web + +var AudioExtentions = []string{ + ".3gp", + ".aa", + ".aac", + ".aax", + ".act", + ".aiff", + ".alac", + ".amr", + ".ape", + ".au", + ".flac", + ".m4a", + ".mp3", + ".mpc", + ".msv", + ".ogg", + ".oga", + ".mogg", + ".opus", + ".tta", + ".wav", + ".cda", +} + +var ImageExtentions = []string{ + ".jpeg", + ".jpg", + ".jpe", + ".jfif", + ".png", + ".ppm", + ".svg", + ".gif", + ".tiff", + ".bmp", + ".webp", + ".ico", + ".kra", + ".bpg", + ".drw", + ".tga", + ".kra", +} + +var VideoExtentions = []string{ + ".webm", + ".mkv", + ".flv", + ".wmv", + ".avi", + ".yuv", + ".mp2", + ".mp4", + ".mpeg", + ".mpg", + ".mpv", + ".m4v", + ".3gp", + ".3g2", + ".nsv", + ".vob", + ".ogv", +} + +var DocumentExtentions = []string{ + "", +} diff --git a/src/web/images.go b/src/web/images.go index a6aad61..b092638 100644 --- a/src/web/images.go +++ b/src/web/images.go @@ -1,6 +1,6 @@ /* Wecr - crawl the web for data - Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte) + Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -19,30 +19,12 @@ package web import ( - "bytes" "net/url" "strings" - - "golang.org/x/net/html" ) func hasImageExtention(url string) bool { - var extentions []string = []string{ - ".jpeg", - ".jpg", - ".jpe", - ".jfif", - ".png", - ".ppm", - ".svg", - ".gif", - ".tiff", - ".bmp", - ".webp", - ".ico", - } - - for _, extention := range extentions { + for _, extention := range ImageExtentions { if strings.HasSuffix(url, extention) { return true } @@ -55,43 +37,82 @@ func hasImageExtention(url string) bool { func FindPageImages(pageBody []byte, from *url.URL) []string { var urls []string - tokenizer := html.NewTokenizer(bytes.NewReader(pageBody)) - for { - tokenType := tokenizer.Next() + // for every element that has "src" attribute + for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int + + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { + continue + } + + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue + } + } + + if linkEndIndex <= linkStartIndex+1 { + continue + } + + link, err := url.Parse(match) + if err != nil { + continue + } - switch tokenType { - case html.ErrorToken: - return urls + linkResolved := ResolveLink(link, from.Host) + if hasImageExtention(linkResolved) { + urls = append(urls, linkResolved) + } + } - case html.StartTagToken: - token := tokenizer.Token() + // for every "a" element as well + for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int - if token.Data != "img" && token.Data != "a" { + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { continue } - for _, attribute := range token.Attr { - if attribute.Key != "src" && attribute.Key != "href" { - continue - } - - imageURL, err := url.Parse(attribute.Val) - if err != nil { - break - } - - imageURLString := ResolveLink(imageURL, from.Host) - - if token.Data == "img" { - // tag -> don't check - urls = append(urls, imageURLString) - } else { - // tag -> check for image extention - if 
hasImageExtention(imageURLString) { - urls = append(urls, imageURLString) - } - } + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue } } + + if linkEndIndex <= linkStartIndex+1 { + continue + } + + link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) + if err != nil { + continue + } + + linkResolved := ResolveLink(link, from.Host) + if hasImageExtention(linkResolved) { + urls = append(urls, linkResolved) + } } + + // return discovered mutual image urls from and tags + return urls } diff --git a/src/web/requests.go b/src/web/requests.go index 286a43f..abe6e66 100644 --- a/src/web/requests.go +++ b/src/web/requests.go @@ -1,6 +1,6 @@ /* Wecr - crawl the web for data - Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte) + Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -21,18 +21,24 @@ package web import ( "io" "net/http" + "os" "time" ) // Get page data coming from url with optional user agent and timeout func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) { + // client := &http.Client{} + // client.CheckRedirect = http.DefaultClient.CheckRedirect + // client.Transport = http.DefaultClient.Transport + // client.Timeout = time.Duration(timeOutMs) + req, err := http.NewRequest("GET", url, nil) if err != nil { return nil, err } req.Header.Set("User-Agent", userAgent) - http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond)) + // response, err := client.Do(req) response, err := http.DefaultClient.Do(req) if err != nil { return nil, err @@ -46,3 +52,33 @@ func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) { return responseBody, nil } + +// Fetch file from url and save to file at filePath +func FetchFile(url string, userAgent string, timeOutMs uint64, filePath string) error { + client := http.Client{} + client.Timeout = time.Duration(timeOutMs) + client.CheckRedirect = http.DefaultClient.CheckRedirect + client.Transport = http.DefaultClient.Transport + + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return err + } + req.Header.Set("User-Agent", userAgent) + + response, err := client.Do(req) + if err != nil { + return nil + } + defer response.Body.Close() + + file, err := os.Create(filePath) + if err != nil { + return err + } + defer file.Close() + + _, _ = io.Copy(file, response.Body) + + return nil +} diff --git a/src/web/text.go b/src/web/text.go index e2b0659..28ea9bf 100644 --- a/src/web/text.go +++ b/src/web/text.go @@ -1,6 +1,6 @@ /* Wecr - crawl the web for data - Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte) + Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -24,10 +24,14 @@ import ( "net/url" "regexp" "strings" - - "golang.org/x/net/html" ) +// matches href="link" or something down bad like hReF = 'link' +var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|')(.*?)("|')`) + +// matches src="link" or even something along the lines of SrC = 'link' +var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`) + // Fix relative link and construct an 
absolute one. Does nothing if the URL already looks alright func ResolveLink(url *url.URL, fromHost string) string { if !url.IsAbs() { @@ -39,7 +43,6 @@ func ResolveLink(url *url.URL, fromHost string) string { if url.Host == "" { // add host url.Host = fromHost - } } @@ -50,36 +53,41 @@ func ResolveLink(url *url.URL, fromHost string) string { func FindPageLinks(pageBody []byte, from *url.URL) []string { var urls []string - tokenizer := html.NewTokenizer(bytes.NewReader(pageBody)) - for { - tokenType := tokenizer.Next() + for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int - switch tokenType { - case html.ErrorToken: - return urls - - case html.StartTagToken: - token := tokenizer.Token() - - if token.Data != "a" { + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { continue } - // recheck - for _, attribute := range token.Attr { - if attribute.Key != "href" { - continue - } + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue + } + } - link, err := url.Parse(attribute.Val) - if err != nil { - break - } + if linkEndIndex <= linkStartIndex+1 { + continue + } - urls = append(urls, ResolveLink(link, from.Host)) - } + link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) + if err != nil { + continue } + + urls = append(urls, ResolveLink(link, from.Host)) } + + return urls } // Tries to find a certain string in page. Returns true if such string has been found diff --git a/src/web/videos.go b/src/web/videos.go new file mode 100644 index 0000000..8a7ebcb --- /dev/null +++ b/src/web/videos.go @@ -0,0 +1,118 @@ +/* + Wecr - crawl the web for data + Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+*/ + +package web + +import ( + "net/url" + "strings" +) + +func hasVideoExtention(url string) bool { + for _, extention := range VideoExtentions { + if strings.HasSuffix(url, extention) { + return true + } + } + + return false +} + +// Tries to find videos' URLs on the page +func FindPageVideos(pageBody []byte, from *url.URL) []string { + var urls []string + + // for every element that has "src" attribute + for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int + + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { + continue + } + + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue + } + } + + if linkEndIndex <= linkStartIndex+1 { + continue + } + + link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) + if err != nil { + continue + } + + linkResolved := ResolveLink(link, from.Host) + if hasVideoExtention(linkResolved) { + urls = append(urls, linkResolved) + } + } + + // for every "a" element as well + for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int + + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { + continue + } + + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue + } + } + + if linkEndIndex <= linkStartIndex+1 { + continue + } + + link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) + if err != nil { + continue + } + + linkResolved := ResolveLink(link, from.Host) + if hasVideoExtention(linkResolved) { + urls = append(urls, linkResolved) + } + } + + // return discovered mutual video urls + return urls +} diff --git a/src/worker/worker.go b/src/worker/worker.go index 7476728..85e7255 100644 --- a/src/worker/worker.go +++ b/src/worker/worker.go @@ -1,6 +1,6 @@ /* Wecr - crawl the web for data - Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte) + Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -20,8 +20,6 @@ package worker import ( "fmt" - "io" - "net/http" "net/url" "os" "path" @@ -63,20 +61,70 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi } } +func (w *Worker) saveContent(contenType string, links []string, pageURL *url.URL) { + var alreadyProcessedUrls []string + for count, link := range links { + // check if this URL has been processed already + var skip bool = false + + for _, processedURL := range alreadyProcessedUrls { + if link == processedURL { + skip = true + break + } + } + + if skip { + skip = false + continue + } + alreadyProcessedUrls = append(alreadyProcessedUrls, link) + + var fileName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(link)) + + var filePath string + switch contenType { + case config.QueryImages: + filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveImagesDir, fileName) + case config.QueryVideos: + filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName) + case config.QueryAudio: + filePath = 
filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName) + default: + filePath = filepath.Join(w.Conf.Save.OutputDir, fileName) + } + + err := web.FetchFile( + link, + w.Conf.Requests.UserAgent, + w.Conf.Requests.ContentFetchTimeoutMs, + filePath, + ) + if err != nil { + logger.Error("Failed to fetch file at %s: %s", link, err) + return + } + + logger.Info("Outputted \"%s\"", fileName) + w.stats.MatchesFound++ + } +} + // Save page to the disk with a corresponding name func (w *Worker) savePage(baseURL *url.URL, pageData []byte) { if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" { var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String())) - pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, pageName)) + pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, config.SavePagesDir, pageName)) if err != nil { logger.Error("Failed to create page of \"%s\": %s", baseURL.String(), err) - } else { - pageFile.Write(pageData) + return } + defer pageFile.Close() - pageFile.Close() + pageFile.Write(pageData) logger.Info("Saved \"%s\"", pageName) + w.stats.PagesSaved++ } } @@ -151,7 +199,7 @@ func (w *Worker) Work() { // get page logger.Info("Visiting %s", job.URL) - pageData, err := web.GetPage(job.URL, w.Conf.Requests.UserAgent, w.Conf.Requests.WaitTimeoutMs) + pageData, err := web.GetPage(job.URL, w.Conf.Requests.UserAgent, w.Conf.Requests.RequestWaitTimeoutMs) if err != nil { logger.Error("Failed to get \"%s\": %s", job.URL, err) continue @@ -196,49 +244,26 @@ func (w *Worker) Work() { case config.QueryImages: // find image URLs, output images to the file while not saving already outputted ones imageLinks := web.FindPageImages(pageData, pageURL) + w.saveContent(config.QueryImages, imageLinks, pageURL) + if len(imageLinks) > 0 { + savePage = true + } - var alreadyProcessedImgUrls []string - for count, imageLink := range imageLinks { - // check if this URL has been processed already - var skipImage bool = false - - for _, processedURL := range alreadyProcessedImgUrls { - if imageLink == processedURL { - skipImage = true - break - } - } - - if skipImage { - skipImage = false - continue - } - alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink) - - var imageName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(imageLink)) - - response, err := http.Get(imageLink) - if err != nil { - logger.Error("Failed to get image %s", imageLink) - continue - } - - imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName)) - if err != nil { - logger.Error("Failed to create image file \"%s\": %s", imageName, err) - continue - } - - _, _ = io.Copy(imageFile, response.Body) - - response.Body.Close() - imageFile.Close() - - logger.Info("Outputted \"%s\"", imageName) - w.stats.MatchesFound++ + case config.QueryVideos: + // search for videos + // find video URLs, output videos to the files while not saving already outputted ones + videoLinks := web.FindPageVideos(pageData, pageURL) + w.saveContent(config.QueryVideos, videoLinks, pageURL) + if len(videoLinks) > 0 { + savePage = true } - if len(imageLinks) > 0 { + case config.QueryAudio: + // search for audio + // find audio URLs, output audio to the file while not saving already outputted ones + audioLinks := web.FindPageAudio(pageData, pageURL) + w.saveContent(config.QueryAudio, audioLinks, pageURL) + if len(audioLinks) > 0 { savePage = true } @@ -284,7 +309,6 @@ func (w *Worker) Work() { // save page if savePage { w.savePage(pageURL, pageData) - 
w.stats.PagesSaved++ } // sleep before the next request
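As a quick orientation for the new content-fetching path, here is a minimal sketch (not part of the patch) of how the helpers introduced above — `web.GetPage`, `web.FindPageVideos` and `web.FetchFile` — fit together; the example URL, user agent, timeout values and output directory are assumptions chosen for illustration:

```go
// Illustrative sketch only; not part of the patch.
package main

import (
	"fmt"
	"log"
	"net/url"
	"os"
	"path"
	"path/filepath"

	"unbewohnte/wecr/web"
)

func main() {
	pageURL, err := url.Parse("https://example.com/")
	if err != nil {
		log.Fatal(err)
	}

	// fetch the page itself using the request wait timeout (milliseconds)
	pageData, err := web.GetPage(pageURL.String(), "example-agent", 1500)
	if err != nil {
		log.Fatal(err)
	}

	outDir := filepath.Join("output", "videos")
	if err := os.MkdirAll(outDir, os.ModePerm); err != nil {
		log.Fatal(err)
	}

	// find video links on the page and fetch each one with the separate
	// content fetch timeout (0 lets large files download without a deadline)
	for i, link := range web.FindPageVideos(pageData, pageURL) {
		name := fmt.Sprintf("%s_%d_%s", pageURL.Host, i, path.Base(link))
		if err := web.FetchFile(link, "example-agent", 0, filepath.Join(outDir, name)); err != nil {
			log.Printf("failed to fetch %s: %s", link, err)
		}
	}
}
```

In the patch itself this flow is driven by `Worker.saveContent`, which additionally deduplicates already-processed links and derives file names the same way before calling `web.FetchFile`.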