From 91112b89ba37b87766c2f369b1f4e1e72b372464 Mon Sep 17 00:00:00 2001 From: Unbewohnte Date: Sat, 14 Jan 2023 20:30:28 +0300 Subject: [PATCH] NO DEPENDENCIES !; Audio, and video search; separate timeout for file fetching --- Makefile | 1 - README.md | 15 ++++-- src/config/config.go | 27 +++++++--- src/go.mod | 2 - src/go.sum | 2 - src/main.go | 43 ++++++++++++--- src/web/audio.go | 118 ++++++++++++++++++++++++++++++++++++++++ src/web/extentions.go | 88 ++++++++++++++++++++++++++++++ src/web/images.go | 121 ++++++++++++++++++++++++----------------- src/web/requests.go | 40 +++++++++++++- src/web/text.go | 60 ++++++++++++--------- src/web/videos.go | 118 ++++++++++++++++++++++++++++++++++++++++ src/worker/worker.go | 122 +++++++++++++++++++++++++----------------- 13 files changed, 605 insertions(+), 152 deletions(-) create mode 100644 src/web/audio.go create mode 100644 src/web/extentions.go create mode 100644 src/web/videos.go diff --git a/Makefile b/Makefile index a794f1d..f232d8f 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,6 @@ DARWINDIR:=$(EXE)_darwin LINUXDIR32:=$(LINUXDIR)_x32 WINDIR32:=$(WINDIR)_x32 -DARWINDIR32:=$(DARWINDIR)_x32 LINUXDIR64:=$(LINUXDIR)_x64 WINDIR64:=$(WINDIR)_x64 diff --git a/README.md b/README.md index cad5fb5..88dff72 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Just a simple HTML web spider with minimal dependencies. It is possible to searc ## Configuration -The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `wDir` (working directory) flag is set to some other value. +The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `-wDir` (working directory) flag is set to some other value. The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them. 
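For illustration, a minimal `conf.json` sketch assembled from the branch and field names mentioned in this patch (the concrete values are placeholders, and any key not visible in the diff or the README is an assumption rather than the definitive schema):

```json
{
  "search": { "is_regexp": false, "query": "images" },
  "requests": {
    "request_wait_timeout_ms": 1500,
    "request_pause_ms": 100,
    "content_fetch_timeout_ms": 0,
    "user_agent": ""
  },
  "save": { "output_dir": "output", "output_file": "output.json", "save_pages": false },
  "initial_pages": ["https://example.com/"],
  "depth": 5,
  "workers": 20
}
```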
@@ -14,20 +14,25 @@ The parsing starts from `initial_pages` and goes deeper while ignoring the pages ### Search query -if `is_regexp` is `false`, then `query` is the text to be searched for, but there are some special values: +There are some special `query` values: - `links` - tells `wecr` to search for all links there are on the page -- `images` - find all image links and output to the `output_dir` (**IMPORTANT**: set `wait_timeout_ms` to `0` so the images load fully) +- `images` - find all images on pages and output to the corresponding directory in `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) load fully) +- `videos` - find and fetch files that look like videos +- `audio` - find and fetch files that look like audio When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it. ### Output -By default, if the query is not `images` all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images` - the additional contents will be put in the `output_dir` directory neatly created by the executable's side. +By default, if the query is not something of special values all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images`, `videos`, `audio`, etc. - the additional contents will be put in the corresponding directories inside `output_dir`, which is neatly created by the executable's side. ## TODO -- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** +- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - [x] +- Search for videos - [x] +- Search for audio - [x] +- Search for documents - [] ## License AGPLv3 \ No newline at end of file diff --git a/src/config/config.go b/src/config/config.go index 1f0eed3..3fdca70 100644 --- a/src/config/config.go +++ b/src/config/config.go @@ -1,6 +1,6 @@ /* Wecr - crawl the web for data - Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte) + Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -27,6 +27,15 @@ import ( const ( QueryLinks string = "links" QueryImages string = "images" + QueryVideos string = "videos" + QueryAudio string = "audio" +) + +const ( + SavePagesDir string = "pages" + SaveImagesDir string = "images" + SaveVideosDir string = "videos" + SaveAudioDir string = "audio" ) type Search struct { @@ -41,9 +50,10 @@ type Save struct { } type Requests struct { - WaitTimeoutMs uint64 `json:"wait_timeout_ms"` - RequestPauseMs uint64 `json:"request_pause_ms"` - UserAgent string `json:"user_agent"` + RequestWaitTimeoutMs uint64 `json:"request_wait_timeout_ms"` + RequestPauseMs uint64 `json:"request_pause_ms"` + ContentFetchTimeoutMs uint64 `json:"content_fetch_timeout_ms"` + UserAgent string `json:"user_agent"` } type Logging struct { @@ -77,9 +87,10 @@ func Default() *Conf { OutputFile: "scraped.json", }, Requests: Requests{ - UserAgent: "", - WaitTimeoutMs: 1500, - RequestPauseMs: 100, + UserAgent: "", + RequestWaitTimeoutMs: 1500, + RequestPauseMs: 100, + ContentFetchTimeoutMs: 0, }, InitialPages: []string{""}, Depth: 5, @@ -95,7 +106,7 @@ func Default() *Conf { // Write current configuration to w func (c *Conf) WriteTo(w io.Writer) error { - 
jsonData, err := json.MarshalIndent(c, "", " ") + jsonData, err := json.MarshalIndent(c, " ", "\t") if err != nil { return err } diff --git a/src/go.mod b/src/go.mod index dc03b88..fcda8f0 100644 --- a/src/go.mod +++ b/src/go.mod @@ -1,5 +1,3 @@ module unbewohnte/wecr go 1.18 - -require golang.org/x/net v0.4.0 diff --git a/src/go.sum b/src/go.sum index 276f46f..e69de29 100644 --- a/src/go.sum +++ b/src/go.sum @@ -1,2 +0,0 @@ -golang.org/x/net v0.4.0 h1:Q5QPcMlvfxFTAPV0+07Xz/MpK9NTXu2VDUuy0FeMfaU= -golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= diff --git a/src/main.go b/src/main.go index 31d22a9..4ea4404 100644 --- a/src/main.go +++ b/src/main.go @@ -1,6 +1,6 @@ /* Wecr - crawl the web for data - Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte) + Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -36,7 +36,7 @@ import ( "unbewohnte/wecr/worker" ) -const version = "v0.1.4" +const version = "v0.2.0" const ( defaultConfigFile string = "conf.json" @@ -82,7 +82,7 @@ func init() { if *printVersion { fmt.Printf( - "Wecr %s - crawl the web for data\n(c) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n", + "Wecr %s - crawl the web for data\n(c) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n", version, ) os.Exit(0) @@ -97,7 +97,7 @@ func init() { ╚███╔███╔╝███████╗╚██████╗██║ ██║ ╚══╝╚══╝ ╚══════╝ ╚═════╝╚═╝ ╚═╝`), ) - logger.GetOutput().Write([]byte(version + "\n\n")) + logger.GetOutput().Write([]byte(version + " by Unbewohnte\n\n")) // work out working directory path if *wDir != "" { @@ -240,6 +240,7 @@ func main() { logger.Warning("User agent is not set. 
Forced to \"%s\"", conf.Requests.UserAgent) } + // create output directories and corresponding specialized ones if !filepath.IsAbs(conf.Save.OutputDir) { conf.Save.OutputDir = filepath.Join(workingDirectory, conf.Save.OutputDir) } @@ -249,11 +250,39 @@ func main() { return } + err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SavePagesDir), os.ModePerm) + if err != nil { + logger.Error("Failed to create output directory for pages: %s", err) + return + } + + err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveImagesDir), os.ModePerm) + if err != nil { + logger.Error("Failed to create output directory for images: %s", err) + return + } + + err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveVideosDir), os.ModePerm) + if err != nil { + logger.Error("Failed to create output directory for video: %s", err) + return + } + + err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveAudioDir), os.ModePerm) + if err != nil { + logger.Error("Failed to create output directory for audio: %s", err) + return + } + switch conf.Search.Query { case config.QueryLinks: logger.Info("Looking for links") case config.QueryImages: - logger.Info("Looking for images") + logger.Info("Looking for images (%+s)", web.ImageExtentions) + case config.QueryVideos: + logger.Info("Looking for videos (%+s)", web.VideoExtentions) + case config.QueryAudio: + logger.Info("Looking for audio (%+s)", web.AudioExtentions) default: if conf.Search.IsRegexp { logger.Info("Looking for RegExp matches (%s)", conf.Search.Query) @@ -319,7 +348,7 @@ func main() { timeSince := time.Since(workerPool.Stats.StartTime).Round(time.Second) - fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d saved; %d matches (%d pages/sec)", + fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d pages saved; %d matches (%d pages/sec)", timeSince.String(), workerPool.Stats.PagesVisited, workerPool.Stats.PagesSaved, @@ -338,7 +367,7 @@ func main() { } // each entry in output file is a self-standing JSON object - entryBytes, err := json.MarshalIndent(result, "", " ") + entryBytes, err := json.MarshalIndent(result, " ", "\t") if err != nil { continue } diff --git a/src/web/audio.go b/src/web/audio.go new file mode 100644 index 0000000..c673c55 --- /dev/null +++ b/src/web/audio.go @@ -0,0 +1,118 @@ +/* + Wecr - crawl the web for data + Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+*/ + +package web + +import ( + "net/url" + "strings" +) + +func hasAudioExtention(url string) bool { + for _, extention := range AudioExtentions { + if strings.HasSuffix(url, extention) { + return true + } + } + + return false +} + +// Tries to find audio URLs on the page +func FindPageAudio(pageBody []byte, from *url.URL) []string { + var urls []string + + // for every element that has "src" attribute + for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int + + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { + continue + } + + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue + } + } + + if linkEndIndex <= linkStartIndex+1 { + continue + } + + link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) + if err != nil { + continue + } + + linkResolved := ResolveLink(link, from.Host) + if hasAudioExtention(linkResolved) { + urls = append(urls, linkResolved) + } + } + + // for every "a" element as well + for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int + + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { + continue + } + + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue + } + } + + if linkEndIndex <= linkStartIndex+1 { + continue + } + + link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) + if err != nil { + continue + } + + linkResolved := ResolveLink(link, from.Host) + if hasAudioExtention(linkResolved) { + urls = append(urls, linkResolved) + } + } + + // return discovered mutual video urls + return urls +} diff --git a/src/web/extentions.go b/src/web/extentions.go new file mode 100644 index 0000000..08cb815 --- /dev/null +++ b/src/web/extentions.go @@ -0,0 +1,88 @@ +/* + Wecr - crawl the web for data + Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+*/ + +package web + +var AudioExtentions = []string{ + ".3gp", + ".aa", + ".aac", + ".aax", + ".act", + ".aiff", + ".alac", + ".amr", + ".ape", + ".au", + ".flac", + ".m4a", + ".mp3", + ".mpc", + ".msv", + ".ogg", + ".oga", + ".mogg", + ".opus", + ".tta", + ".wav", + ".cda", +} + +var ImageExtentions = []string{ + ".jpeg", + ".jpg", + ".jpe", + ".jfif", + ".png", + ".ppm", + ".svg", + ".gif", + ".tiff", + ".bmp", + ".webp", + ".ico", + ".kra", + ".bpg", + ".drw", + ".tga", + ".kra", +} + +var VideoExtentions = []string{ + ".webm", + ".mkv", + ".flv", + ".wmv", + ".avi", + ".yuv", + ".mp2", + ".mp4", + ".mpeg", + ".mpg", + ".mpv", + ".m4v", + ".3gp", + ".3g2", + ".nsv", + ".vob", + ".ogv", +} + +var DocumentExtentions = []string{ + "", +} diff --git a/src/web/images.go b/src/web/images.go index a6aad61..b092638 100644 --- a/src/web/images.go +++ b/src/web/images.go @@ -1,6 +1,6 @@ /* Wecr - crawl the web for data - Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte) + Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -19,30 +19,12 @@ package web import ( - "bytes" "net/url" "strings" - - "golang.org/x/net/html" ) func hasImageExtention(url string) bool { - var extentions []string = []string{ - ".jpeg", - ".jpg", - ".jpe", - ".jfif", - ".png", - ".ppm", - ".svg", - ".gif", - ".tiff", - ".bmp", - ".webp", - ".ico", - } - - for _, extention := range extentions { + for _, extention := range ImageExtentions { if strings.HasSuffix(url, extention) { return true } @@ -55,43 +37,82 @@ func hasImageExtention(url string) bool { func FindPageImages(pageBody []byte, from *url.URL) []string { var urls []string - tokenizer := html.NewTokenizer(bytes.NewReader(pageBody)) - for { - tokenType := tokenizer.Next() + // for every element that has "src" attribute + for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int + + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { + continue + } + + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue + } + } + + if linkEndIndex <= linkStartIndex+1 { + continue + } + + link, err := url.Parse(match) + if err != nil { + continue + } - switch tokenType { - case html.ErrorToken: - return urls + linkResolved := ResolveLink(link, from.Host) + if hasImageExtention(linkResolved) { + urls = append(urls, linkResolved) + } + } - case html.StartTagToken: - token := tokenizer.Token() + // for every "a" element as well + for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int - if token.Data != "img" && token.Data != "a" { + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { continue } - for _, attribute := range token.Attr { - if attribute.Key != "src" && attribute.Key != "href" { - continue - } - - imageURL, err := url.Parse(attribute.Val) - if err != nil { - break - } - - imageURLString := ResolveLink(imageURL, from.Host) - - if token.Data == "img" { - // tag -> don't check - urls = append(urls, imageURLString) - } else { - // tag -> check for image extention - if 
hasImageExtention(imageURLString) { - urls = append(urls, imageURLString) - } - } + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue } } + + if linkEndIndex <= linkStartIndex+1 { + continue + } + + link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) + if err != nil { + continue + } + + linkResolved := ResolveLink(link, from.Host) + if hasImageExtention(linkResolved) { + urls = append(urls, linkResolved) + } } + + // return discovered mutual image urls from and tags + return urls } diff --git a/src/web/requests.go b/src/web/requests.go index 286a43f..abe6e66 100644 --- a/src/web/requests.go +++ b/src/web/requests.go @@ -1,6 +1,6 @@ /* Wecr - crawl the web for data - Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte) + Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -21,18 +21,24 @@ package web import ( "io" "net/http" + "os" "time" ) // Get page data coming from url with optional user agent and timeout func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) { + // client := &http.Client{} + // client.CheckRedirect = http.DefaultClient.CheckRedirect + // client.Transport = http.DefaultClient.Transport + // client.Timeout = time.Duration(timeOutMs) + req, err := http.NewRequest("GET", url, nil) if err != nil { return nil, err } req.Header.Set("User-Agent", userAgent) - http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond)) + // response, err := client.Do(req) response, err := http.DefaultClient.Do(req) if err != nil { return nil, err @@ -46,3 +52,33 @@ func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) { return responseBody, nil } + +// Fetch file from url and save to file at filePath +func FetchFile(url string, userAgent string, timeOutMs uint64, filePath string) error { + client := http.Client{} + client.Timeout = time.Duration(timeOutMs) + client.CheckRedirect = http.DefaultClient.CheckRedirect + client.Transport = http.DefaultClient.Transport + + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return err + } + req.Header.Set("User-Agent", userAgent) + + response, err := client.Do(req) + if err != nil { + return nil + } + defer response.Body.Close() + + file, err := os.Create(filePath) + if err != nil { + return err + } + defer file.Close() + + _, _ = io.Copy(file, response.Body) + + return nil +} diff --git a/src/web/text.go b/src/web/text.go index e2b0659..28ea9bf 100644 --- a/src/web/text.go +++ b/src/web/text.go @@ -1,6 +1,6 @@ /* Wecr - crawl the web for data - Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte) + Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -24,10 +24,14 @@ import ( "net/url" "regexp" "strings" - - "golang.org/x/net/html" ) +// matches href="link" or something down bad like hReF = 'link' +var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|')(.*?)("|')`) + +// matches src="link" or even something along the lines of SrC = 'link' +var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`) + // Fix relative link and construct an 
absolute one. Does nothing if the URL already looks alright func ResolveLink(url *url.URL, fromHost string) string { if !url.IsAbs() { @@ -39,7 +43,6 @@ func ResolveLink(url *url.URL, fromHost string) string { if url.Host == "" { // add host url.Host = fromHost - } } @@ -50,36 +53,41 @@ func ResolveLink(url *url.URL, fromHost string) string { func FindPageLinks(pageBody []byte, from *url.URL) []string { var urls []string - tokenizer := html.NewTokenizer(bytes.NewReader(pageBody)) - for { - tokenType := tokenizer.Next() + for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int - switch tokenType { - case html.ErrorToken: - return urls - - case html.StartTagToken: - token := tokenizer.Token() - - if token.Data != "a" { + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { continue } - // recheck - for _, attribute := range token.Attr { - if attribute.Key != "href" { - continue - } + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue + } + } - link, err := url.Parse(attribute.Val) - if err != nil { - break - } + if linkEndIndex <= linkStartIndex+1 { + continue + } - urls = append(urls, ResolveLink(link, from.Host)) - } + link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) + if err != nil { + continue } + + urls = append(urls, ResolveLink(link, from.Host)) } + + return urls } // Tries to find a certain string in page. Returns true if such string has been found diff --git a/src/web/videos.go b/src/web/videos.go new file mode 100644 index 0000000..8a7ebcb --- /dev/null +++ b/src/web/videos.go @@ -0,0 +1,118 @@ +/* + Wecr - crawl the web for data + Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+*/ + +package web + +import ( + "net/url" + "strings" +) + +func hasVideoExtention(url string) bool { + for _, extention := range VideoExtentions { + if strings.HasSuffix(url, extention) { + return true + } + } + + return false +} + +// Tries to find videos' URLs on the page +func FindPageVideos(pageBody []byte, from *url.URL) []string { + var urls []string + + // for every element that has "src" attribute + for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int + + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { + continue + } + + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue + } + } + + if linkEndIndex <= linkStartIndex+1 { + continue + } + + link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) + if err != nil { + continue + } + + linkResolved := ResolveLink(link, from.Host) + if hasVideoExtention(linkResolved) { + urls = append(urls, linkResolved) + } + } + + // for every "a" element as well + for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { + var linkStartIndex int + var linkEndIndex int + + linkStartIndex = strings.Index(match, "\"") + if linkStartIndex == -1 { + linkStartIndex = strings.Index(match, "'") + if linkStartIndex == -1 { + continue + } + + linkEndIndex = strings.LastIndex(match, "'") + if linkEndIndex == -1 { + continue + } + } else { + linkEndIndex = strings.LastIndex(match, "\"") + if linkEndIndex == -1 { + continue + } + } + + if linkEndIndex <= linkStartIndex+1 { + continue + } + + link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) + if err != nil { + continue + } + + linkResolved := ResolveLink(link, from.Host) + if hasVideoExtention(linkResolved) { + urls = append(urls, linkResolved) + } + } + + // return discovered mutual video urls + return urls +} diff --git a/src/worker/worker.go b/src/worker/worker.go index 7476728..85e7255 100644 --- a/src/worker/worker.go +++ b/src/worker/worker.go @@ -1,6 +1,6 @@ /* Wecr - crawl the web for data - Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte) + Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -20,8 +20,6 @@ package worker import ( "fmt" - "io" - "net/http" "net/url" "os" "path" @@ -63,20 +61,70 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi } } +func (w *Worker) saveContent(contenType string, links []string, pageURL *url.URL) { + var alreadyProcessedUrls []string + for count, link := range links { + // check if this URL has been processed already + var skip bool = false + + for _, processedURL := range alreadyProcessedUrls { + if link == processedURL { + skip = true + break + } + } + + if skip { + skip = false + continue + } + alreadyProcessedUrls = append(alreadyProcessedUrls, link) + + var fileName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(link)) + + var filePath string + switch contenType { + case config.QueryImages: + filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveImagesDir, fileName) + case config.QueryVideos: + filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName) + case config.QueryAudio: + filePath = 
filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName) + default: + filePath = filepath.Join(w.Conf.Save.OutputDir, fileName) + } + + err := web.FetchFile( + link, + w.Conf.Requests.UserAgent, + w.Conf.Requests.ContentFetchTimeoutMs, + filePath, + ) + if err != nil { + logger.Error("Failed to fetch file at %s: %s", link, err) + return + } + + logger.Info("Outputted \"%s\"", fileName) + w.stats.MatchesFound++ + } +} + // Save page to the disk with a corresponding name func (w *Worker) savePage(baseURL *url.URL, pageData []byte) { if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" { var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String())) - pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, pageName)) + pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, config.SavePagesDir, pageName)) if err != nil { logger.Error("Failed to create page of \"%s\": %s", baseURL.String(), err) - } else { - pageFile.Write(pageData) + return } + defer pageFile.Close() - pageFile.Close() + pageFile.Write(pageData) logger.Info("Saved \"%s\"", pageName) + w.stats.PagesSaved++ } } @@ -151,7 +199,7 @@ func (w *Worker) Work() { // get page logger.Info("Visiting %s", job.URL) - pageData, err := web.GetPage(job.URL, w.Conf.Requests.UserAgent, w.Conf.Requests.WaitTimeoutMs) + pageData, err := web.GetPage(job.URL, w.Conf.Requests.UserAgent, w.Conf.Requests.RequestWaitTimeoutMs) if err != nil { logger.Error("Failed to get \"%s\": %s", job.URL, err) continue @@ -196,49 +244,26 @@ func (w *Worker) Work() { case config.QueryImages: // find image URLs, output images to the file while not saving already outputted ones imageLinks := web.FindPageImages(pageData, pageURL) + w.saveContent(config.QueryImages, imageLinks, pageURL) + if len(imageLinks) > 0 { + savePage = true + } - var alreadyProcessedImgUrls []string - for count, imageLink := range imageLinks { - // check if this URL has been processed already - var skipImage bool = false - - for _, processedURL := range alreadyProcessedImgUrls { - if imageLink == processedURL { - skipImage = true - break - } - } - - if skipImage { - skipImage = false - continue - } - alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink) - - var imageName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(imageLink)) - - response, err := http.Get(imageLink) - if err != nil { - logger.Error("Failed to get image %s", imageLink) - continue - } - - imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName)) - if err != nil { - logger.Error("Failed to create image file \"%s\": %s", imageName, err) - continue - } - - _, _ = io.Copy(imageFile, response.Body) - - response.Body.Close() - imageFile.Close() - - logger.Info("Outputted \"%s\"", imageName) - w.stats.MatchesFound++ + case config.QueryVideos: + // search for videos + // find video URLs, output videos to the files while not saving already outputted ones + videoLinks := web.FindPageVideos(pageData, pageURL) + w.saveContent(config.QueryVideos, videoLinks, pageURL) + if len(videoLinks) > 0 { + savePage = true } - if len(imageLinks) > 0 { + case config.QueryAudio: + // search for audio + // find audio URLs, output audio to the file while not saving already outputted ones + audioLinks := web.FindPageAudio(pageData, pageURL) + w.saveContent(config.QueryAudio, audioLinks, pageURL) + if len(audioLinks) > 0 { savePage = true } @@ -284,7 +309,6 @@ func (w *Worker) Work() { // save page if savePage { w.savePage(pageURL, pageData) - 
w.stats.PagesSaved++ } // sleep before the next request
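As a quick orientation for the new content-fetching path, here is a minimal sketch (not part of the patch) of how the helpers introduced above — `web.GetPage`, `web.FindPageVideos` and `web.FetchFile` — fit together; the example URL, user agent, timeout values and output directory are assumptions chosen for illustration:

```go
// Illustrative sketch only; not part of the patch.
package main

import (
	"fmt"
	"log"
	"net/url"
	"os"
	"path"
	"path/filepath"

	"unbewohnte/wecr/web"
)

func main() {
	pageURL, err := url.Parse("https://example.com/")
	if err != nil {
		log.Fatal(err)
	}

	// fetch the page itself using the request wait timeout (milliseconds)
	pageData, err := web.GetPage(pageURL.String(), "example-agent", 1500)
	if err != nil {
		log.Fatal(err)
	}

	outDir := filepath.Join("output", "videos")
	if err := os.MkdirAll(outDir, os.ModePerm); err != nil {
		log.Fatal(err)
	}

	// find video links on the page and fetch each one with the separate
	// content fetch timeout (0 lets large files download without a deadline)
	for i, link := range web.FindPageVideos(pageData, pageURL) {
		name := fmt.Sprintf("%s_%d_%s", pageURL.Host, i, path.Base(link))
		if err := web.FetchFile(link, "example-agent", 0, filepath.Join(outDir, name)); err != nil {
			log.Printf("failed to fetch %s: %s", link, err)
		}
	}
}
```

In the patch itself this flow is driven by `Worker.saveContent`, which additionally deduplicates already-processed links and derives file names the same way before calling `web.FetchFile`.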