diff --git a/Makefile b/Makefile
index a794f1d..f232d8f 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,6 @@ DARWINDIR:=$(EXE)_darwin
LINUXDIR32:=$(LINUXDIR)_x32
WINDIR32:=$(WINDIR)_x32
-DARWINDIR32:=$(DARWINDIR)_x32
LINUXDIR64:=$(LINUXDIR)_x64
WINDIR64:=$(WINDIR)_x64
diff --git a/README.md b/README.md
index cad5fb5..88dff72 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ Just a simple HTML web spider with minimal dependencies. It is possible to searc
## Configuration
-The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `wDir` (working directory) flag is set to some other value.
+The flow of work fully depends on the configuration file. By default `conf.json` is used as the configuration file, but the name can be changed via the `-conf` flag. The default configuration is embedded in the program, so on the first launch, or if the file is simply deleted, a new `conf.json` will be created in the same directory as the executable itself, unless the `-wDir` (working directory) flag is set to some other value.
The configuration is split into different branches like `requests` (how requests are made, i.e. request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string), each of which contains tweakable parameters. There are global ones as well, such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory, so no attribute-by-attribute explanation is needed for most of them.
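+
+For illustration, a minimal configuration could look something like this (just a sketch - the exact field names and the full set of options live in `src/config/config.go`, and the values below are placeholders):
+
+```json
+{
+    "search": {
+        "is_regexp": false,
+        "query": "links"
+    },
+    "requests": {
+        "request_wait_timeout_ms": 1500,
+        "request_pause_ms": 100,
+        "content_fetch_timeout_ms": 0,
+        "user_agent": ""
+    },
+    "save": {
+        "output_dir": "output",
+        "save_pages": false
+    },
+    "initial_pages": ["https://example.org/"],
+    "depth": 5,
+    "workers": 20
+}
+```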
@@ -14,20 +14,25 @@ The parsing starts from `initial_pages` and goes deeper while ignoring the pages
### Search query
-if `is_regexp` is `false`, then `query` is the text to be searched for, but there are some special values:
+There are some special `query` values:
- `links` - tells `wecr` to search for all links there are on the page
-- `images` - find all image links and output to the `output_dir` (**IMPORTANT**: set `wait_timeout_ms` to `0` so the images load fully)
+- `images` - find all images on pages and save them to the corresponding directory inside `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) can load fully)
+- `videos` - find and fetch files that look like videos
+- `audio` - find and fetch files that look like audio
When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
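+For example, with `is_regexp` set to `true` and `query` set to `(?i)wiki`, every visited page will be checked for case-insensitive occurrences of `wiki`.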
### Output
-By default, if the query is not `images` all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images` - the additional contents will be put in the `output_dir` directory neatly created by the executable's side.
+By default, if the query is not one of the special values, all the matches and other data will be output to the `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images`, `videos`, `audio`, etc., the additional contents will be put in the corresponding directories inside `output_dir`, which are created by the executable.
## TODO
-- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)**
+- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - [x]
+- Search for videos - [x]
+- Search for audio - [x]
+- Search for documents - [ ]
## License
AGPLv3
\ No newline at end of file
diff --git a/src/config/config.go b/src/config/config.go
index 1f0eed3..3fdca70 100644
--- a/src/config/config.go
+++ b/src/config/config.go
@@ -1,6 +1,6 @@
/*
Wecr - crawl the web for data
- Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+ Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
@@ -27,6 +27,15 @@ import (
const (
QueryLinks string = "links"
QueryImages string = "images"
+ QueryVideos string = "videos"
+ QueryAudio string = "audio"
+)
+
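+// Subdirectory names for saved content inside the output directory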
+const (
+ SavePagesDir string = "pages"
+ SaveImagesDir string = "images"
+ SaveVideosDir string = "videos"
+ SaveAudioDir string = "audio"
)
type Search struct {
@@ -41,9 +50,10 @@ type Save struct {
}
type Requests struct {
- WaitTimeoutMs uint64 `json:"wait_timeout_ms"`
- RequestPauseMs uint64 `json:"request_pause_ms"`
- UserAgent string `json:"user_agent"`
+ RequestWaitTimeoutMs uint64 `json:"request_wait_timeout_ms"`
+ RequestPauseMs uint64 `json:"request_pause_ms"`
+ ContentFetchTimeoutMs uint64 `json:"content_fetch_timeout_ms"`
+ UserAgent string `json:"user_agent"`
}
type Logging struct {
@@ -77,9 +87,10 @@ func Default() *Conf {
OutputFile: "scraped.json",
},
Requests: Requests{
- UserAgent: "",
- WaitTimeoutMs: 1500,
- RequestPauseMs: 100,
+ UserAgent: "",
+ RequestWaitTimeoutMs: 1500,
+ RequestPauseMs: 100,
+ ContentFetchTimeoutMs: 0,
},
InitialPages: []string{""},
Depth: 5,
@@ -95,7 +106,7 @@ func Default() *Conf {
// Write current configuration to w
func (c *Conf) WriteTo(w io.Writer) error {
- jsonData, err := json.MarshalIndent(c, "", " ")
+ jsonData, err := json.MarshalIndent(c, "", "\t")
if err != nil {
return err
}
diff --git a/src/go.mod b/src/go.mod
index dc03b88..fcda8f0 100644
--- a/src/go.mod
+++ b/src/go.mod
@@ -1,5 +1,3 @@
module unbewohnte/wecr
go 1.18
-
-require golang.org/x/net v0.4.0
diff --git a/src/go.sum b/src/go.sum
index 276f46f..e69de29 100644
--- a/src/go.sum
+++ b/src/go.sum
@@ -1,2 +0,0 @@
-golang.org/x/net v0.4.0 h1:Q5QPcMlvfxFTAPV0+07Xz/MpK9NTXu2VDUuy0FeMfaU=
-golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE=
diff --git a/src/main.go b/src/main.go
index 31d22a9..4ea4404 100644
--- a/src/main.go
+++ b/src/main.go
@@ -1,6 +1,6 @@
/*
Wecr - crawl the web for data
- Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+ Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
@@ -36,7 +36,7 @@ import (
"unbewohnte/wecr/worker"
)
-const version = "v0.1.4"
+const version = "v0.2.0"
const (
defaultConfigFile string = "conf.json"
@@ -82,7 +82,7 @@ func init() {
if *printVersion {
fmt.Printf(
- "Wecr %s - crawl the web for data\n(c) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n",
+ "Wecr %s - crawl the web for data\n(c) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n",
version,
)
os.Exit(0)
@@ -97,7 +97,7 @@ func init() {
╚███╔███╔╝███████╗╚██████╗██║ ██║
╚══╝╚══╝ ╚══════╝ ╚═════╝╚═╝ ╚═╝`),
)
- logger.GetOutput().Write([]byte(version + "\n\n"))
+ logger.GetOutput().Write([]byte(version + " by Unbewohnte\n\n"))
// work out working directory path
if *wDir != "" {
@@ -240,6 +240,7 @@ func main() {
logger.Warning("User agent is not set. Forced to \"%s\"", conf.Requests.UserAgent)
}
+ // create output directories and corresponding specialized ones
if !filepath.IsAbs(conf.Save.OutputDir) {
conf.Save.OutputDir = filepath.Join(workingDirectory, conf.Save.OutputDir)
}
@@ -249,11 +250,39 @@ func main() {
return
}
+ err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SavePagesDir), os.ModePerm)
+ if err != nil {
+ logger.Error("Failed to create output directory for pages: %s", err)
+ return
+ }
+
+ err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveImagesDir), os.ModePerm)
+ if err != nil {
+ logger.Error("Failed to create output directory for images: %s", err)
+ return
+ }
+
+ err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveVideosDir), os.ModePerm)
+ if err != nil {
+ logger.Error("Failed to create output directory for video: %s", err)
+ return
+ }
+
+ err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveAudioDir), os.ModePerm)
+ if err != nil {
+ logger.Error("Failed to create output directory for audio: %s", err)
+ return
+ }
+
switch conf.Search.Query {
case config.QueryLinks:
logger.Info("Looking for links")
case config.QueryImages:
- logger.Info("Looking for images")
+ logger.Info("Looking for images (%+s)", web.ImageExtentions)
+ case config.QueryVideos:
+ logger.Info("Looking for videos (%+s)", web.VideoExtentions)
+ case config.QueryAudio:
+ logger.Info("Looking for audio (%+s)", web.AudioExtentions)
default:
if conf.Search.IsRegexp {
logger.Info("Looking for RegExp matches (%s)", conf.Search.Query)
@@ -319,7 +348,7 @@ func main() {
timeSince := time.Since(workerPool.Stats.StartTime).Round(time.Second)
- fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d saved; %d matches (%d pages/sec)",
+ fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d pages saved; %d matches (%d pages/sec)",
timeSince.String(),
workerPool.Stats.PagesVisited,
workerPool.Stats.PagesSaved,
@@ -338,7 +367,7 @@ func main() {
}
// each entry in output file is a self-standing JSON object
- entryBytes, err := json.MarshalIndent(result, "", " ")
+ entryBytes, err := json.MarshalIndent(result, "", "\t")
if err != nil {
continue
}
diff --git a/src/web/audio.go b/src/web/audio.go
new file mode 100644
index 0000000..c673c55
--- /dev/null
+++ b/src/web/audio.go
@@ -0,0 +1,118 @@
+/*
+ Wecr - crawl the web for data
+ Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+package web
+
+import (
+ "net/url"
+ "strings"
+)
+
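+// Returns true if the given url has a known audio file extension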
+func hasAudioExtention(url string) bool {
+ for _, extention := range AudioExtentions {
+ if strings.HasSuffix(url, extention) {
+ return true
+ }
+ }
+
+ return false
+}
+
+// Tries to find audio URLs on the page
+func FindPageAudio(pageBody []byte, from *url.URL) []string {
+ var urls []string
+
+ // for every element that has "src" attribute
+ for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
+ var linkStartIndex int
+ var linkEndIndex int
+
+ linkStartIndex = strings.Index(match, "\"")
+ if linkStartIndex == -1 {
+ linkStartIndex = strings.Index(match, "'")
+ if linkStartIndex == -1 {
+ continue
+ }
+
+ linkEndIndex = strings.LastIndex(match, "'")
+ if linkEndIndex == -1 {
+ continue
+ }
+ } else {
+ linkEndIndex = strings.LastIndex(match, "\"")
+ if linkEndIndex == -1 {
+ continue
+ }
+ }
+
+ if linkEndIndex <= linkStartIndex+1 {
+ continue
+ }
+
+ link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+ if err != nil {
+ continue
+ }
+
+ linkResolved := ResolveLink(link, from.Host)
+ if hasAudioExtention(linkResolved) {
+ urls = append(urls, linkResolved)
+ }
+ }
+
+ // for every "a" element as well
+ for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
+ var linkStartIndex int
+ var linkEndIndex int
+
+ linkStartIndex = strings.Index(match, "\"")
+ if linkStartIndex == -1 {
+ linkStartIndex = strings.Index(match, "'")
+ if linkStartIndex == -1 {
+ continue
+ }
+
+ linkEndIndex = strings.LastIndex(match, "'")
+ if linkEndIndex == -1 {
+ continue
+ }
+ } else {
+ linkEndIndex = strings.LastIndex(match, "\"")
+ if linkEndIndex == -1 {
+ continue
+ }
+ }
+
+ if linkEndIndex <= linkStartIndex+1 {
+ continue
+ }
+
+ link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+ if err != nil {
+ continue
+ }
+
+ linkResolved := ResolveLink(link, from.Host)
+ if hasAudioExtention(linkResolved) {
+ urls = append(urls, linkResolved)
+ }
+ }
+
+ // return discovered audio urls
+ return urls
+}
diff --git a/src/web/extentions.go b/src/web/extentions.go
new file mode 100644
index 0000000..08cb815
--- /dev/null
+++ b/src/web/extentions.go
@@ -0,0 +1,88 @@
+/*
+ Wecr - crawl the web for data
+ Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+package web
+
+var AudioExtentions = []string{
+ ".3gp",
+ ".aa",
+ ".aac",
+ ".aax",
+ ".act",
+ ".aiff",
+ ".alac",
+ ".amr",
+ ".ape",
+ ".au",
+ ".flac",
+ ".m4a",
+ ".mp3",
+ ".mpc",
+ ".msv",
+ ".ogg",
+ ".oga",
+ ".mogg",
+ ".opus",
+ ".tta",
+ ".wav",
+ ".cda",
+}
+
+var ImageExtentions = []string{
+ ".jpeg",
+ ".jpg",
+ ".jpe",
+ ".jfif",
+ ".png",
+ ".ppm",
+ ".svg",
+ ".gif",
+ ".tiff",
+ ".bmp",
+ ".webp",
+ ".ico",
+ ".kra",
+ ".bpg",
+ ".drw",
+ ".tga",
+ ".kra",
+}
+
+var VideoExtentions = []string{
+ ".webm",
+ ".mkv",
+ ".flv",
+ ".wmv",
+ ".avi",
+ ".yuv",
+ ".mp2",
+ ".mp4",
+ ".mpeg",
+ ".mpg",
+ ".mpv",
+ ".m4v",
+ ".3gp",
+ ".3g2",
+ ".nsv",
+ ".vob",
+ ".ogv",
+}
+
+// document search is not implemented yet (see the TODO list in the README)
+var DocumentExtentions = []string{}
diff --git a/src/web/images.go b/src/web/images.go
index a6aad61..b092638 100644
--- a/src/web/images.go
+++ b/src/web/images.go
@@ -1,6 +1,6 @@
/*
Wecr - crawl the web for data
- Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+ Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
@@ -19,30 +19,12 @@
package web
import (
- "bytes"
"net/url"
"strings"
-
- "golang.org/x/net/html"
)
func hasImageExtention(url string) bool {
- var extentions []string = []string{
- ".jpeg",
- ".jpg",
- ".jpe",
- ".jfif",
- ".png",
- ".ppm",
- ".svg",
- ".gif",
- ".tiff",
- ".bmp",
- ".webp",
- ".ico",
- }
-
- for _, extention := range extentions {
+ for _, extention := range ImageExtentions {
if strings.HasSuffix(url, extention) {
return true
}
@@ -55,43 +37,82 @@ func hasImageExtention(url string) bool {
func FindPageImages(pageBody []byte, from *url.URL) []string {
var urls []string
- tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
- for {
- tokenType := tokenizer.Next()
+ // for every element that has "src" attribute
+ for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
+ var linkStartIndex int
+ var linkEndIndex int
+
+ linkStartIndex = strings.Index(match, "\"")
+ if linkStartIndex == -1 {
+ linkStartIndex = strings.Index(match, "'")
+ if linkStartIndex == -1 {
+ continue
+ }
+
+ linkEndIndex = strings.LastIndex(match, "'")
+ if linkEndIndex == -1 {
+ continue
+ }
+ } else {
+ linkEndIndex = strings.LastIndex(match, "\"")
+ if linkEndIndex == -1 {
+ continue
+ }
+ }
+
+ if linkEndIndex <= linkStartIndex+1 {
+ continue
+ }
+
+ link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+ if err != nil {
+ continue
+ }
- switch tokenType {
- case html.ErrorToken:
- return urls
+ linkResolved := ResolveLink(link, from.Host)
+ if hasImageExtention(linkResolved) {
+ urls = append(urls, linkResolved)
+ }
+ }
- case html.StartTagToken:
- token := tokenizer.Token()
+ // for every "a" element as well
+ for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
+ var linkStartIndex int
+ var linkEndIndex int
- if token.Data != "img" && token.Data != "a" {
+ linkStartIndex = strings.Index(match, "\"")
+ if linkStartIndex == -1 {
+ linkStartIndex = strings.Index(match, "'")
+ if linkStartIndex == -1 {
continue
}
- for _, attribute := range token.Attr {
- if attribute.Key != "src" && attribute.Key != "href" {
- continue
- }
-
- imageURL, err := url.Parse(attribute.Val)
- if err != nil {
- break
- }
-
- imageURLString := ResolveLink(imageURL, from.Host)
-
- if token.Data == "img" {
- // <img> tag -> don't check
- urls = append(urls, imageURLString)
- } else {
- // <a> tag -> check for image extention
- if hasImageExtention(imageURLString) {
- urls = append(urls, imageURLString)
- }
- }
+ linkEndIndex = strings.LastIndex(match, "'")
+ if linkEndIndex == -1 {
+ continue
+ }
+ } else {
+ linkEndIndex = strings.LastIndex(match, "\"")
+ if linkEndIndex == -1 {
+ continue
}
}
+
+ if linkEndIndex <= linkStartIndex+1 {
+ continue
+ }
+
+ link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+ if err != nil {
+ continue
+ }
+
+ linkResolved := ResolveLink(link, from.Host)
+ if hasImageExtention(linkResolved) {
+ urls = append(urls, linkResolved)
+ }
}
+
+ // return discovered image urls from both "src" and "href" attributes
+ return urls
}
diff --git a/src/web/requests.go b/src/web/requests.go
index 286a43f..abe6e66 100644
--- a/src/web/requests.go
+++ b/src/web/requests.go
@@ -1,6 +1,6 @@
/*
Wecr - crawl the web for data
- Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+ Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
@@ -21,18 +21,24 @@ package web
import (
"io"
"net/http"
+ "os"
"time"
)
// Get page data coming from url with optional user agent and timeout
func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
+ // use a local client so the timeout does not mutate the shared http.DefaultClient
+ client := http.Client{}
+ client.CheckRedirect = http.DefaultClient.CheckRedirect
+ client.Transport = http.DefaultClient.Transport
+ client.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
+
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", userAgent)
- http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
- response, err := http.DefaultClient.Do(req)
+ response, err := client.Do(req)
if err != nil {
return nil, err
@@ -46,3 +52,33 @@ func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
return responseBody, nil
}
+
+// Fetch file from url and save to file at filePath
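+// A timeOutMs of 0 means no timeout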
+func FetchFile(url string, userAgent string, timeOutMs uint64, filePath string) error {
+ client := http.Client{}
+ client.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
+ client.CheckRedirect = http.DefaultClient.CheckRedirect
+ client.Transport = http.DefaultClient.Transport
+
+ req, err := http.NewRequest("GET", url, nil)
+ if err != nil {
+ return err
+ }
+ req.Header.Set("User-Agent", userAgent)
+
+ response, err := client.Do(req)
+ if err != nil {
+ return err
+ }
+ defer response.Body.Close()
+
+ file, err := os.Create(filePath)
+ if err != nil {
+ return err
+ }
+ defer file.Close()
+
+ _, _ = io.Copy(file, response.Body)
+
+ return nil
+}
diff --git a/src/web/text.go b/src/web/text.go
index e2b0659..28ea9bf 100644
--- a/src/web/text.go
+++ b/src/web/text.go
@@ -1,6 +1,6 @@
/*
Wecr - crawl the web for data
- Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+ Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
@@ -24,10 +24,14 @@ import (
"net/url"
"regexp"
"strings"
-
- "golang.org/x/net/html"
)
+// matches href="link" or something down bad like hReF = 'link'
+var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|')(.*?)("|')`)
+
+// matches src="link" or even something along the lines of SrC = 'link'
+var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)
+
// Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
func ResolveLink(url *url.URL, fromHost string) string {
if !url.IsAbs() {
@@ -39,7 +43,6 @@ func ResolveLink(url *url.URL, fromHost string) string {
if url.Host == "" {
// add host
url.Host = fromHost
-
}
}
@@ -50,36 +53,41 @@ func ResolveLink(url *url.URL, fromHost string) string {
func FindPageLinks(pageBody []byte, from *url.URL) []string {
var urls []string
- tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
- for {
- tokenType := tokenizer.Next()
+ for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
+ var linkStartIndex int
+ var linkEndIndex int
- switch tokenType {
- case html.ErrorToken:
- return urls
-
- case html.StartTagToken:
- token := tokenizer.Token()
-
- if token.Data != "a" {
+ linkStartIndex = strings.Index(match, "\"")
+ if linkStartIndex == -1 {
+ linkStartIndex = strings.Index(match, "'")
+ if linkStartIndex == -1 {
continue
}
- // recheck
- for _, attribute := range token.Attr {
- if attribute.Key != "href" {
- continue
- }
+ linkEndIndex = strings.LastIndex(match, "'")
+ if linkEndIndex == -1 {
+ continue
+ }
+ } else {
+ linkEndIndex = strings.LastIndex(match, "\"")
+ if linkEndIndex == -1 {
+ continue
+ }
+ }
- link, err := url.Parse(attribute.Val)
- if err != nil {
- break
- }
+ if linkEndIndex <= linkStartIndex+1 {
+ continue
+ }
- urls = append(urls, ResolveLink(link, from.Host))
- }
+ link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+ if err != nil {
+ continue
}
+
+ urls = append(urls, ResolveLink(link, from.Host))
}
+
+ return urls
}
// Tries to find a certain string in page. Returns true if such string has been found
diff --git a/src/web/videos.go b/src/web/videos.go
new file mode 100644
index 0000000..8a7ebcb
--- /dev/null
+++ b/src/web/videos.go
@@ -0,0 +1,118 @@
+/*
+ Wecr - crawl the web for data
+ Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+package web
+
+import (
+ "net/url"
+ "strings"
+)
+
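+// Returns true if the given url has a known video file extension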
+func hasVideoExtention(url string) bool {
+ for _, extention := range VideoExtentions {
+ if strings.HasSuffix(url, extention) {
+ return true
+ }
+ }
+
+ return false
+}
+
+// Tries to find videos' URLs on the page
+func FindPageVideos(pageBody []byte, from *url.URL) []string {
+ var urls []string
+
+ // for every element that has "src" attribute
+ for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
+ var linkStartIndex int
+ var linkEndIndex int
+
+ linkStartIndex = strings.Index(match, "\"")
+ if linkStartIndex == -1 {
+ linkStartIndex = strings.Index(match, "'")
+ if linkStartIndex == -1 {
+ continue
+ }
+
+ linkEndIndex = strings.LastIndex(match, "'")
+ if linkEndIndex == -1 {
+ continue
+ }
+ } else {
+ linkEndIndex = strings.LastIndex(match, "\"")
+ if linkEndIndex == -1 {
+ continue
+ }
+ }
+
+ if linkEndIndex <= linkStartIndex+1 {
+ continue
+ }
+
+ link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+ if err != nil {
+ continue
+ }
+
+ linkResolved := ResolveLink(link, from.Host)
+ if hasVideoExtention(linkResolved) {
+ urls = append(urls, linkResolved)
+ }
+ }
+
+ // for every "a" element as well
+ for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
+ var linkStartIndex int
+ var linkEndIndex int
+
+ linkStartIndex = strings.Index(match, "\"")
+ if linkStartIndex == -1 {
+ linkStartIndex = strings.Index(match, "'")
+ if linkStartIndex == -1 {
+ continue
+ }
+
+ linkEndIndex = strings.LastIndex(match, "'")
+ if linkEndIndex == -1 {
+ continue
+ }
+ } else {
+ linkEndIndex = strings.LastIndex(match, "\"")
+ if linkEndIndex == -1 {
+ continue
+ }
+ }
+
+ if linkEndIndex <= linkStartIndex+1 {
+ continue
+ }
+
+ link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+ if err != nil {
+ continue
+ }
+
+ linkResolved := ResolveLink(link, from.Host)
+ if hasVideoExtention(linkResolved) {
+ urls = append(urls, linkResolved)
+ }
+ }
+
+ // return discovered video urls
+ return urls
+}
diff --git a/src/worker/worker.go b/src/worker/worker.go
index 7476728..85e7255 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -1,6 +1,6 @@
/*
Wecr - crawl the web for data
- Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+ Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
@@ -20,8 +20,6 @@ package worker
import (
"fmt"
- "io"
- "net/http"
"net/url"
"os"
"path"
@@ -63,20 +61,70 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
}
}
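+// Fetches each link and saves the file into the subdirectory of the output directory that corresponds to contentType (images, videos or audio), skipping links that were already processed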
+func (w *Worker) saveContent(contentType string, links []string, pageURL *url.URL) {
+ var alreadyProcessedUrls []string
+ for count, link := range links {
+ // check if this URL has been processed already
+ var skip bool = false
+
+ for _, processedURL := range alreadyProcessedUrls {
+ if link == processedURL {
+ skip = true
+ break
+ }
+ }
+
+ if skip {
+ continue
+ }
+ alreadyProcessedUrls = append(alreadyProcessedUrls, link)
+
+ var fileName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(link))
+
+ var filePath string
+ switch contentType {
+ case config.QueryImages:
+ filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveImagesDir, fileName)
+ case config.QueryVideos:
+ filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName)
+ case config.QueryAudio:
+ filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName)
+ default:
+ filePath = filepath.Join(w.Conf.Save.OutputDir, fileName)
+ }
+
+ err := web.FetchFile(
+ link,
+ w.Conf.Requests.UserAgent,
+ w.Conf.Requests.ContentFetchTimeoutMs,
+ filePath,
+ )
+ if err != nil {
+ logger.Error("Failed to fetch file at %s: %s", link, err)
+ continue
+ }
+
+ logger.Info("Outputted \"%s\"", fileName)
+ w.stats.MatchesFound++
+ }
+}
+
// Save page to the disk with a corresponding name
func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String()))
- pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, pageName))
+ pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, config.SavePagesDir, pageName))
if err != nil {
logger.Error("Failed to create page of \"%s\": %s", baseURL.String(), err)
- } else {
- pageFile.Write(pageData)
+ return
}
+ defer pageFile.Close()
- pageFile.Close()
+ pageFile.Write(pageData)
logger.Info("Saved \"%s\"", pageName)
+ w.stats.PagesSaved++
}
}
@@ -151,7 +199,7 @@ func (w *Worker) Work() {
// get page
logger.Info("Visiting %s", job.URL)
- pageData, err := web.GetPage(job.URL, w.Conf.Requests.UserAgent, w.Conf.Requests.WaitTimeoutMs)
+ pageData, err := web.GetPage(job.URL, w.Conf.Requests.UserAgent, w.Conf.Requests.RequestWaitTimeoutMs)
if err != nil {
logger.Error("Failed to get \"%s\": %s", job.URL, err)
continue
@@ -196,49 +244,26 @@ func (w *Worker) Work() {
case config.QueryImages:
// find image URLs, output images to the file while not saving already outputted ones
imageLinks := web.FindPageImages(pageData, pageURL)
+ w.saveContent(config.QueryImages, imageLinks, pageURL)
+ if len(imageLinks) > 0 {
+ savePage = true
+ }
- var alreadyProcessedImgUrls []string
- for count, imageLink := range imageLinks {
- // check if this URL has been processed already
- var skipImage bool = false
-
- for _, processedURL := range alreadyProcessedImgUrls {
- if imageLink == processedURL {
- skipImage = true
- break
- }
- }
-
- if skipImage {
- skipImage = false
- continue
- }
- alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
-
- var imageName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(imageLink))
-
- response, err := http.Get(imageLink)
- if err != nil {
- logger.Error("Failed to get image %s", imageLink)
- continue
- }
-
- imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
- if err != nil {
- logger.Error("Failed to create image file \"%s\": %s", imageName, err)
- continue
- }
-
- _, _ = io.Copy(imageFile, response.Body)
-
- response.Body.Close()
- imageFile.Close()
-
- logger.Info("Outputted \"%s\"", imageName)
- w.stats.MatchesFound++
+ case config.QueryVideos:
+ // find video URLs and save the videos, skipping already processed ones
+ videoLinks := web.FindPageVideos(pageData, pageURL)
+ w.saveContent(config.QueryVideos, videoLinks, pageURL)
+ if len(videoLinks) > 0 {
+ savePage = true
}
- if len(imageLinks) > 0 {
+ case config.QueryAudio:
+ // find audio URLs and save the audio files, skipping already processed ones
+ audioLinks := web.FindPageAudio(pageData, pageURL)
+ w.saveContent(config.QueryAudio, audioLinks, pageURL)
+ if len(audioLinks) > 0 {
savePage = true
}
@@ -284,7 +309,6 @@ func (w *Worker) Work() {
// save page
if savePage {
w.savePage(pageURL, pageData)
- w.stats.PagesSaved++
}
// sleep before the next request