
NO DEPENDENCIES!; audio and video search; separate timeout for file fetching

branch: master
commit: 91112b89ba
13 changed files:

1. Makefile (1 line changed)
2. README.md (15 lines changed)
3. src/config/config.go (19 lines changed)
4. src/go.mod (2 lines changed)
5. src/go.sum (2 lines changed)
6. src/main.go (43 lines changed)
7. src/web/audio.go (118 lines changed)
8. src/web/extentions.go (88 lines changed)
9. src/web/images.go (103 lines changed)
10. src/web/requests.go (40 lines changed)
11. src/web/text.go (50 lines changed)
12. src/web/videos.go (118 lines changed)
13. src/worker/worker.go (120 lines changed)

Makefile (1 line changed)

@@ -11,7 +11,6 @@ DARWINDIR:=$(EXE)_darwin
 LINUXDIR32:=$(LINUXDIR)_x32
 WINDIR32:=$(WINDIR)_x32
-DARWINDIR32:=$(DARWINDIR)_x32
 LINUXDIR64:=$(LINUXDIR)_x64
 WINDIR64:=$(WINDIR)_x64

README.md (15 lines changed)

@@ -6,7 +6,7 @@ Just a simple HTML web spider with minimal dependencies. It is possible to searc
 
 ## Configuration
-The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `wDir` (working directory) flag is set to some other value.
+The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `-wDir` (working directory) flag is set to some other value.
 
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them.

@@ -14,20 +14,25 @@ The parsing starts from `initial_pages` and goes deeper while ignoring the pages
 
 ### Search query
-if `is_regexp` is `false`, then `query` is the text to be searched for, but there are some special values:
+There are some special `query` values:
 
 - `links` - tells `wecr` to search for all links there are on the page
-- `images` - find all image links and output to the `output_dir` (**IMPORTANT**: set `wait_timeout_ms` to `0` so the images load fully)
+- `images` - find all images on pages and output to the corresponding directory in `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) load fully)
+- `videos` - find and fetch files that look like videos
+- `audio` - find and fetch files that look like audio
 
 When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
 
 ### Output
-By default, if the query is not `images` all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images` - the additional contents will be put in the `output_dir` directory neatly created by the executable's side.
+By default, if the query is not something of special values all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images`, `videos`, `audio`, etc. - the additional contents will be put in the corresponding directories inside `output_dir`, which is neatly created by the executable's side.
 
 ## TODO
-- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)**
+- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - [x]
+- Search for videos - [x]
+- Search for audio - [x]
+- Search for documents - []
 
 ## License
 AGPLv3
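The README above refers to the renamed and newly added request settings by their JSON keys. A quick way to see the exact keys and defaults is to dump the embedded default configuration; a minimal sketch, assuming the `config.Default` and `Conf.WriteTo` helpers from `src/config/config.go` below and an import path of `unbewohnte/wecr/config`:

```go
package main

import (
	"os"

	"unbewohnte/wecr/config"
)

func main() {
	// print the built-in default configuration as JSON; the output should
	// contain "request_wait_timeout_ms" and the new "content_fetch_timeout_ms"
	conf := config.Default()
	if err := conf.WriteTo(os.Stdout); err != nil {
		panic(err)
	}
}
```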

src/config/config.go (19 lines changed)

@@ -1,6 +1,6 @@
 /*
 Wecr - crawl the web for data
-Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by

@@ -27,6 +27,15 @@ import (
 const (
 	QueryLinks  string = "links"
 	QueryImages string = "images"
+	QueryVideos string = "videos"
+	QueryAudio  string = "audio"
+)
+
+const (
+	SavePagesDir  string = "pages"
+	SaveImagesDir string = "images"
+	SaveVideosDir string = "videos"
+	SaveAudioDir  string = "audio"
 )
 
 type Search struct {

@@ -41,8 +50,9 @@ type Save struct {
 }
 
 type Requests struct {
-	WaitTimeoutMs         uint64 `json:"wait_timeout_ms"`
+	RequestWaitTimeoutMs  uint64 `json:"request_wait_timeout_ms"`
 	RequestPauseMs        uint64 `json:"request_pause_ms"`
+	ContentFetchTimeoutMs uint64 `json:"content_fetch_timeout_ms"`
 	UserAgent             string `json:"user_agent"`
 }
 

@@ -78,8 +88,9 @@ func Default() *Conf {
 		},
 		Requests: Requests{
 			UserAgent:      "",
-			WaitTimeoutMs:  1500,
+			RequestWaitTimeoutMs:  1500,
 			RequestPauseMs: 100,
+			ContentFetchTimeoutMs: 0,
 		},
 		InitialPages: []string{""},
 		Depth:        5,

@@ -95,7 +106,7 @@ func Default() *Conf {
 
 // Write current configuration to w
 func (c *Conf) WriteTo(w io.Writer) error {
-	jsonData, err := json.MarshalIndent(c, "", " ")
+	jsonData, err := json.MarshalIndent(c, " ", "\t")
 	if err != nil {
 		return err
 	}

src/go.mod (2 lines changed)

@@ -1,5 +1,3 @@
 module unbewohnte/wecr
 
 go 1.18
-
-require golang.org/x/net v0.4.0

src/go.sum (2 lines changed)

@@ -1,2 +0,0 @@
-golang.org/x/net v0.4.0 h1:Q5QPcMlvfxFTAPV0+07Xz/MpK9NTXu2VDUuy0FeMfaU=
-golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE=

src/main.go (43 lines changed)

@@ -1,6 +1,6 @@
 /*
 Wecr - crawl the web for data
-Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by

@@ -36,7 +36,7 @@ import (
 	"unbewohnte/wecr/worker"
 )
 
-const version = "v0.1.4"
+const version = "v0.2.0"
 
 const (
 	defaultConfigFile string = "conf.json"

@@ -82,7 +82,7 @@ func init() {
 	if *printVersion {
 		fmt.Printf(
-			"Wecr %s - crawl the web for data\n(c) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n",
+			"Wecr %s - crawl the web for data\n(c) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n",
 			version,
 		)
 		os.Exit(0)

@@ -97,7 +97,7 @@ func init() {
 		`),
 	)
-	logger.GetOutput().Write([]byte(version + "\n\n"))
+	logger.GetOutput().Write([]byte(version + " by Unbewohnte\n\n"))
 
 	// work out working directory path
 	if *wDir != "" {

@@ -240,6 +240,7 @@ func main() {
 		logger.Warning("User agent is not set. Forced to \"%s\"", conf.Requests.UserAgent)
 	}
 
+	// create output directories and corresponding specialized ones
 	if !filepath.IsAbs(conf.Save.OutputDir) {
 		conf.Save.OutputDir = filepath.Join(workingDirectory, conf.Save.OutputDir)
 	}

@@ -249,11 +250,39 @@
 		return
 	}
 
+	err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SavePagesDir), os.ModePerm)
+	if err != nil {
+		logger.Error("Failed to create output directory for pages: %s", err)
+		return
+	}
+
+	err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveImagesDir), os.ModePerm)
+	if err != nil {
+		logger.Error("Failed to create output directory for images: %s", err)
+		return
+	}
+
+	err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveVideosDir), os.ModePerm)
+	if err != nil {
+		logger.Error("Failed to create output directory for video: %s", err)
+		return
+	}
+
+	err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveAudioDir), os.ModePerm)
+	if err != nil {
+		logger.Error("Failed to create output directory for audio: %s", err)
+		return
+	}
+
 	switch conf.Search.Query {
 	case config.QueryLinks:
 		logger.Info("Looking for links")
 	case config.QueryImages:
-		logger.Info("Looking for images")
+		logger.Info("Looking for images (%+s)", web.ImageExtentions)
+	case config.QueryVideos:
+		logger.Info("Looking for videos (%+s)", web.VideoExtentions)
+	case config.QueryAudio:
+		logger.Info("Looking for audio (%+s)", web.AudioExtentions)
 	default:
 		if conf.Search.IsRegexp {
 			logger.Info("Looking for RegExp matches (%s)", conf.Search.Query)

@@ -319,7 +348,7 @@ func main() {
 			timeSince := time.Since(workerPool.Stats.StartTime).Round(time.Second)
 
-			fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d saved; %d matches (%d pages/sec)",
+			fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d pages saved; %d matches (%d pages/sec)",
 				timeSince.String(),
 				workerPool.Stats.PagesVisited,
 				workerPool.Stats.PagesSaved,

@@ -338,7 +367,7 @@
 		}
 
 		// each entry in output file is a self-standing JSON object
-		entryBytes, err := json.MarshalIndent(result, "", " ")
+		entryBytes, err := json.MarshalIndent(result, " ", "\t")
 		if err != nil {
 			continue
 		}
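The four `os.MkdirAll` calls above could also be written as one loop over the new `config.Save*Dir` constants; a hypothetical consolidation, not part of this commit:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"

	"unbewohnte/wecr/config"
)

// createOutputDirs is a hypothetical helper that creates every specialized
// output directory introduced by this commit under outputDir
func createOutputDirs(outputDir string) error {
	for _, subDir := range []string{
		config.SavePagesDir,
		config.SaveImagesDir,
		config.SaveVideosDir,
		config.SaveAudioDir,
	} {
		if err := os.MkdirAll(filepath.Join(outputDir, subDir), os.ModePerm); err != nil {
			return fmt.Errorf("failed to create output directory %q: %w", subDir, err)
		}
	}
	return nil
}

func main() {
	if err := createOutputDirs("output"); err != nil {
		fmt.Println(err)
	}
}
```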

src/web/audio.go (new file, 118 lines)

@@ -0,0 +1,118 @@
/*
Wecr - crawl the web for data
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package web
import (
"net/url"
"strings"
)
func hasAudioExtention(url string) bool {
for _, extention := range AudioExtentions {
if strings.HasSuffix(url, extention) {
return true
}
}
return false
}
// Tries to find audio URLs on the page
func FindPageAudio(pageBody []byte, from *url.URL) []string {
var urls []string
// for every element that has "src" attribute
for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
var linkStartIndex int
var linkEndIndex int
linkStartIndex = strings.Index(match, "\"")
if linkStartIndex == -1 {
linkStartIndex = strings.Index(match, "'")
if linkStartIndex == -1 {
continue
}
linkEndIndex = strings.LastIndex(match, "'")
if linkEndIndex == -1 {
continue
}
} else {
linkEndIndex = strings.LastIndex(match, "\"")
if linkEndIndex == -1 {
continue
}
}
if linkEndIndex <= linkStartIndex+1 {
continue
}
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
if err != nil {
continue
}
linkResolved := ResolveLink(link, from.Host)
if hasAudioExtention(linkResolved) {
urls = append(urls, linkResolved)
}
}
// for every "a" element as well
for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
var linkStartIndex int
var linkEndIndex int
linkStartIndex = strings.Index(match, "\"")
if linkStartIndex == -1 {
linkStartIndex = strings.Index(match, "'")
if linkStartIndex == -1 {
continue
}
linkEndIndex = strings.LastIndex(match, "'")
if linkEndIndex == -1 {
continue
}
} else {
linkEndIndex = strings.LastIndex(match, "\"")
if linkEndIndex == -1 {
continue
}
}
if linkEndIndex <= linkStartIndex+1 {
continue
}
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
if err != nil {
continue
}
linkResolved := ResolveLink(link, from.Host)
if hasAudioExtention(linkResolved) {
urls = append(urls, linkResolved)
}
}
// return discovered mutual audio urls
return urls
}
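`FindPageAudio` scans both `src` and `href` attributes and keeps only links whose suffix appears in `AudioExtentions`. A small usage sketch, assuming the package is importable as `unbewohnte/wecr/web`:

```go
package main

import (
	"fmt"
	"net/url"

	"unbewohnte/wecr/web"
)

func main() {
	// one double-quoted src attribute and one single-quoted href,
	// both pointing at extensions listed in web.AudioExtentions
	page := []byte(`<html><body>
		<audio src="https://example.org/sound.ogg"></audio>
		<a href='/files/track.mp3'>listen</a>
	</body></html>`)

	from, _ := url.Parse("https://example.org/music")

	// absolute links come back unchanged; relative ones are resolved
	// against the page host by ResolveLink
	for _, link := range web.FindPageAudio(page, from) {
		fmt.Println(link)
	}
}
```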

src/web/extentions.go (new file, 88 lines)

@@ -0,0 +1,88 @@
/*
Wecr - crawl the web for data
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package web
var AudioExtentions = []string{
".3gp",
".aa",
".aac",
".aax",
".act",
".aiff",
".alac",
".amr",
".ape",
".au",
".flac",
".m4a",
".mp3",
".mpc",
".msv",
".ogg",
".oga",
".mogg",
".opus",
".tta",
".wav",
".cda",
}
var ImageExtentions = []string{
".jpeg",
".jpg",
".jpe",
".jfif",
".png",
".ppm",
".svg",
".gif",
".tiff",
".bmp",
".webp",
".ico",
".kra",
".bpg",
".drw",
".tga",
".kra",
}
var VideoExtentions = []string{
".webm",
".mkv",
".flv",
".wmv",
".avi",
".yuv",
".mp2",
".mp4",
".mpeg",
".mpg",
".mpv",
".m4v",
".3gp",
".3g2",
".nsv",
".vob",
".ogv",
}
var DocumentExtentions = []string{
"",
}
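Each media type gets its own `has*Extention` helper (in `audio.go`, `videos.go`, `images.go`) that loops over one of these slices. A single generic check could cover all three; a hypothetical sketch:

```go
package web

import "strings"

// hasAnyOfExtentions reports whether url ends with one of the given
// extensions; hasImageExtention, hasVideoExtention and hasAudioExtention
// could all delegate to it
func hasAnyOfExtentions(url string, extentions []string) bool {
	for _, extention := range extentions {
		if strings.HasSuffix(url, extention) {
			return true
		}
	}
	return false
}
```

For example, `hasAnyOfExtentions(link, AudioExtentions)` would replace `hasAudioExtention(link)`.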

src/web/images.go (103 lines changed)

@@ -1,6 +1,6 @@
 /*
 Wecr - crawl the web for data
-Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by

@@ -19,30 +19,12 @@
 package web
 
 import (
-	"bytes"
 	"net/url"
 	"strings"
-
-	"golang.org/x/net/html"
 )
 
 func hasImageExtention(url string) bool {
-	var extentions []string = []string{
-		".jpeg",
-		".jpg",
-		".jpe",
-		".jfif",
-		".png",
-		".ppm",
-		".svg",
-		".gif",
-		".tiff",
-		".bmp",
-		".webp",
-		".ico",
-	}
-
-	for _, extention := range extentions {
+	for _, extention := range ImageExtentions {
 		if strings.HasSuffix(url, extention) {
 			return true
 		}

@@ -55,43 +37,82 @@ func hasImageExtention(url string) bool {
 func FindPageImages(pageBody []byte, from *url.URL) []string {
 	var urls []string
 
-	tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
-	for {
-		tokenType := tokenizer.Next()
-
-		switch tokenType {
-		case html.ErrorToken:
-			return urls
-
-		case html.StartTagToken:
-			token := tokenizer.Token()
-
-			if token.Data != "img" && token.Data != "a" {
-				continue
-			}
-
-			for _, attribute := range token.Attr {
-				if attribute.Key != "src" && attribute.Key != "href" {
-					continue
-				}
-
-				imageURL, err := url.Parse(attribute.Val)
-				if err != nil {
-					break
-				}
-
-				imageURLString := ResolveLink(imageURL, from.Host)
-
-				if token.Data == "img" {
-					// <img> tag -> don't check
-					urls = append(urls, imageURLString)
-				} else {
-					// <a> tag -> check for image extention
-					if hasImageExtention(imageURLString) {
-						urls = append(urls, imageURLString)
-					}
-				}
-			}
-		}
-	}
+	// for every element that has "src" attribute
+	for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
+		var linkStartIndex int
+		var linkEndIndex int
+
+		linkStartIndex = strings.Index(match, "\"")
+		if linkStartIndex == -1 {
+			linkStartIndex = strings.Index(match, "'")
+			if linkStartIndex == -1 {
+				continue
+			}
+
+			linkEndIndex = strings.LastIndex(match, "'")
+			if linkEndIndex == -1 {
+				continue
+			}
+		} else {
+			linkEndIndex = strings.LastIndex(match, "\"")
+			if linkEndIndex == -1 {
+				continue
+			}
+		}
+
+		if linkEndIndex <= linkStartIndex+1 {
+			continue
+		}
+
+		link, err := url.Parse(match)
+		if err != nil {
+			continue
+		}
+
+		linkResolved := ResolveLink(link, from.Host)
+		if hasImageExtention(linkResolved) {
+			urls = append(urls, linkResolved)
+		}
+	}
+
+	// for every "a" element as well
+	for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
+		var linkStartIndex int
+		var linkEndIndex int
+
+		linkStartIndex = strings.Index(match, "\"")
+		if linkStartIndex == -1 {
+			linkStartIndex = strings.Index(match, "'")
+			if linkStartIndex == -1 {
+				continue
+			}
+
+			linkEndIndex = strings.LastIndex(match, "'")
+			if linkEndIndex == -1 {
+				continue
+			}
+		} else {
+			linkEndIndex = strings.LastIndex(match, "\"")
+			if linkEndIndex == -1 {
+				continue
+			}
+		}
+
+		if linkEndIndex <= linkStartIndex+1 {
+			continue
+		}
+
+		link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+		if err != nil {
+			continue
+		}
+
+		linkResolved := ResolveLink(link, from.Host)
+		if hasImageExtention(linkResolved) {
+			urls = append(urls, linkResolved)
+		}
+	}
+
+	// return discovered mutual image urls from <img> and <a> tags
 	return urls
 }
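The quote-locating block (`strings.Index` for the opening `"` or `'`, `strings.LastIndex` for the closing one) is now repeated in `FindPageImages`, `FindPageVideos`, `FindPageAudio` and `FindPageLinks`. It could be factored into one helper; a hypothetical sketch of that shared step:

```go
package web

import "strings"

// extractAttributeValue returns the text between the quotes of a regexp match
// such as src="http://host/pic.png" or HrEf = 'pic.png'; false means the
// match should be skipped, mirroring the continue branches in the finders
func extractAttributeValue(match string) (string, bool) {
	quote := "\""
	start := strings.Index(match, quote)
	if start == -1 {
		quote = "'"
		start = strings.Index(match, quote)
		if start == -1 {
			return "", false
		}
	}

	end := strings.LastIndex(match, quote)
	if end == -1 || end <= start+1 {
		return "", false
	}

	return match[start+1 : end], true
}
```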

src/web/requests.go (40 lines changed)

@@ -1,6 +1,6 @@
 /*
 Wecr - crawl the web for data
-Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by

@@ -21,18 +21,24 @@ package web
 
 import (
 	"io"
 	"net/http"
+	"os"
 	"time"
 )
 
 // Get page data coming from url with optional user agent and timeout
 func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
+	// client := &http.Client{}
+	// client.CheckRedirect = http.DefaultClient.CheckRedirect
+	// client.Transport = http.DefaultClient.Transport
+	// client.Timeout = time.Duration(timeOutMs)
+
 	req, err := http.NewRequest("GET", url, nil)
 	if err != nil {
 		return nil, err
 	}
 	req.Header.Set("User-Agent", userAgent)
 
-	http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
+	// response, err := client.Do(req)
 	response, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return nil, err

@@ -46,3 +52,33 @@ func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
 
 	return responseBody, nil
 }
+
+// Fetch file from url and save to file at filePath
+func FetchFile(url string, userAgent string, timeOutMs uint64, filePath string) error {
+	client := http.Client{}
+	client.Timeout = time.Duration(timeOutMs)
+	client.CheckRedirect = http.DefaultClient.CheckRedirect
+	client.Transport = http.DefaultClient.Transport
+
+	req, err := http.NewRequest("GET", url, nil)
+	if err != nil {
+		return err
+	}
+	req.Header.Set("User-Agent", userAgent)
+
+	response, err := client.Do(req)
+	if err != nil {
+		return nil
+	}
+	defer response.Body.Close()
+
+	file, err := os.Create(filePath)
+	if err != nil {
+		return err
+	}
+	defer file.Close()
+
+	_, _ = io.Copy(file, response.Body)
+
+	return nil
+}
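Two details of the new `FetchFile` differ from `GetPage`: the timeout is assigned as `time.Duration(timeOutMs)`, which `time.Duration` interprets as nanoseconds rather than milliseconds, and a failed `client.Do` returns `nil` instead of the error. A sketch of a variant that mirrors `GetPage`'s millisecond conversion and propagates errors (hypothetical, not part of the commit):

```go
package web

import (
	"io"
	"net/http"
	"os"
	"time"
)

// FetchFileStrict is a hypothetical variant of FetchFile: millisecond
// timeout and no swallowed errors
func FetchFileStrict(url string, userAgent string, timeOutMs uint64, filePath string) error {
	client := http.Client{
		Timeout: time.Duration(timeOutMs * uint64(time.Millisecond)),
	}

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	req.Header.Set("User-Agent", userAgent)

	response, err := client.Do(req)
	if err != nil {
		return err
	}
	defer response.Body.Close()

	file, err := os.Create(filePath)
	if err != nil {
		return err
	}
	defer file.Close()

	_, err = io.Copy(file, response.Body)
	return err
}
```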

src/web/text.go (50 lines changed)

@@ -1,6 +1,6 @@
 /*
 Wecr - crawl the web for data
-Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by

@@ -24,10 +24,14 @@ import (
 	"net/url"
 	"regexp"
 	"strings"
-
-	"golang.org/x/net/html"
 )
 
+// matches href="link" or something down bad like hReF = 'link'
+var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|')(.*?)("|')`)
+
+// matches src="link" or even something along the lines of SrC = 'link'
+var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)
+
 // Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
 func ResolveLink(url *url.URL, fromHost string) string {

@@ -39,7 +43,6 @@ func ResolveLink(url *url.URL, fromHost string) string {
 		if url.Host == "" {
 			// add host
 			url.Host = fromHost
-
 		}
 	}

@@ -50,36 +53,41 @@
 func FindPageLinks(pageBody []byte, from *url.URL) []string {
 	var urls []string
 
-	tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
-	for {
-		tokenType := tokenizer.Next()
-
-		switch tokenType {
-		case html.ErrorToken:
-			return urls
-
-		case html.StartTagToken:
-			token := tokenizer.Token()
-
-			if token.Data != "a" {
-				continue
-			}
-
-			// recheck
-			for _, attribute := range token.Attr {
-				if attribute.Key != "href" {
-					continue
-				}
-
-				link, err := url.Parse(attribute.Val)
-				if err != nil {
-					break
-				}
-
-				urls = append(urls, ResolveLink(link, from.Host))
-			}
-		}
-	}
+	for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
+		var linkStartIndex int
+		var linkEndIndex int
+
+		linkStartIndex = strings.Index(match, "\"")
+		if linkStartIndex == -1 {
+			linkStartIndex = strings.Index(match, "'")
+			if linkStartIndex == -1 {
+				continue
+			}
+
+			linkEndIndex = strings.LastIndex(match, "'")
+			if linkEndIndex == -1 {
+				continue
+			}
+		} else {
+			linkEndIndex = strings.LastIndex(match, "\"")
+			if linkEndIndex == -1 {
+				continue
+			}
+		}
+
+		if linkEndIndex <= linkStartIndex+1 {
+			continue
+		}
+
+		link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
+		if err != nil {
+			continue
+		}
+
+		urls = append(urls, ResolveLink(link, from.Host))
+	}
+
+	return urls
 }
 
 // Tries to find a certain string in page. Returns true if such string has been found
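To see what these patterns actually return, here is a standalone sketch using the same two regular expressions; the whole attribute (name, `=`, quotes) is part of the match, which is why the finder functions then locate the quote indices before parsing the URL:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// the same case-insensitive patterns compiled in text.go
	tagHrefRegexp := regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|')(.*?)("|')`)
	tagSrcRegexp := regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)

	page := `<a HrEf = 'https://example.org/page'>link</a> <img src="/cat.png">`

	fmt.Println(tagHrefRegexp.FindAllString(page, -1)) // [HrEf = 'https://example.org/page']
	fmt.Println(tagSrcRegexp.FindAllString(page, -1))  // [src="/cat.png"]
}
```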

src/web/videos.go (new file, 118 lines)

@@ -0,0 +1,118 @@
/*
Wecr - crawl the web for data
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package web
import (
"net/url"
"strings"
)
func hasVideoExtention(url string) bool {
for _, extention := range VideoExtentions {
if strings.HasSuffix(url, extention) {
return true
}
}
return false
}
// Tries to find videos' URLs on the page
func FindPageVideos(pageBody []byte, from *url.URL) []string {
var urls []string
// for every element that has "src" attribute
for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
var linkStartIndex int
var linkEndIndex int
linkStartIndex = strings.Index(match, "\"")
if linkStartIndex == -1 {
linkStartIndex = strings.Index(match, "'")
if linkStartIndex == -1 {
continue
}
linkEndIndex = strings.LastIndex(match, "'")
if linkEndIndex == -1 {
continue
}
} else {
linkEndIndex = strings.LastIndex(match, "\"")
if linkEndIndex == -1 {
continue
}
}
if linkEndIndex <= linkStartIndex+1 {
continue
}
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
if err != nil {
continue
}
linkResolved := ResolveLink(link, from.Host)
if hasVideoExtention(linkResolved) {
urls = append(urls, linkResolved)
}
}
// for every "a" element as well
for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
var linkStartIndex int
var linkEndIndex int
linkStartIndex = strings.Index(match, "\"")
if linkStartIndex == -1 {
linkStartIndex = strings.Index(match, "'")
if linkStartIndex == -1 {
continue
}
linkEndIndex = strings.LastIndex(match, "'")
if linkEndIndex == -1 {
continue
}
} else {
linkEndIndex = strings.LastIndex(match, "\"")
if linkEndIndex == -1 {
continue
}
}
if linkEndIndex <= linkStartIndex+1 {
continue
}
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
if err != nil {
continue
}
linkResolved := ResolveLink(link, from.Host)
if hasVideoExtention(linkResolved) {
urls = append(urls, linkResolved)
}
}
// return discovered mutual video urls
return urls
}

src/worker/worker.go (120 lines changed)

@@ -1,6 +1,6 @@
 /*
 Wecr - crawl the web for data
-Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by

@@ -20,8 +20,6 @@ package worker
 
 import (
 	"fmt"
-	"io"
-	"net/http"
 	"net/url"
 	"os"
 	"path"

@@ -63,20 +61,70 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
 	}
 }
 
+func (w *Worker) saveContent(contenType string, links []string, pageURL *url.URL) {
+	var alreadyProcessedUrls []string
+	for count, link := range links {
+		// check if this URL has been processed already
+		var skip bool = false
+
+		for _, processedURL := range alreadyProcessedUrls {
+			if link == processedURL {
+				skip = true
+				break
+			}
+		}
+
+		if skip {
+			skip = false
+			continue
+		}
+		alreadyProcessedUrls = append(alreadyProcessedUrls, link)
+
+		var fileName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(link))
+
+		var filePath string
+		switch contenType {
+		case config.QueryImages:
+			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveImagesDir, fileName)
+		case config.QueryVideos:
+			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName)
+		case config.QueryAudio:
+			filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName)
+		default:
+			filePath = filepath.Join(w.Conf.Save.OutputDir, fileName)
+		}
+
+		err := web.FetchFile(
+			link,
+			w.Conf.Requests.UserAgent,
+			w.Conf.Requests.ContentFetchTimeoutMs,
+			filePath,
+		)
+		if err != nil {
+			logger.Error("Failed to fetch file at %s: %s", link, err)
+			return
+		}
+
+		logger.Info("Outputted \"%s\"", fileName)
+		w.stats.MatchesFound++
+	}
+}
+
 // Save page to the disk with a corresponding name
 func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
 	if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
 		var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String()))
-		pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, pageName))
+		pageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, config.SavePagesDir, pageName))
 		if err != nil {
 			logger.Error("Failed to create page of \"%s\": %s", baseURL.String(), err)
-		} else {
-			pageFile.Write(pageData)
+			return
 		}
+		defer pageFile.Close()
 
-		pageFile.Close()
+		pageFile.Write(pageData)
 
 		logger.Info("Saved \"%s\"", pageName)
+		w.stats.PagesSaved++
 	}
 }

@@ -151,7 +199,7 @@ func (w *Worker) Work() {
 
 			// get page
 			logger.Info("Visiting %s", job.URL)
-			pageData, err := web.GetPage(job.URL, w.Conf.Requests.UserAgent, w.Conf.Requests.WaitTimeoutMs)
+			pageData, err := web.GetPage(job.URL, w.Conf.Requests.UserAgent, w.Conf.Requests.RequestWaitTimeoutMs)
			if err != nil {
 				logger.Error("Failed to get \"%s\": %s", job.URL, err)
 				continue

@@ -196,49 +244,26 @@
 			case config.QueryImages:
 				// find image URLs, output images to the file while not saving already outputted ones
 				imageLinks := web.FindPageImages(pageData, pageURL)
-
-				var alreadyProcessedImgUrls []string
-				for count, imageLink := range imageLinks {
-					// check if this URL has been processed already
-					var skipImage bool = false
-
-					for _, processedURL := range alreadyProcessedImgUrls {
-						if imageLink == processedURL {
-							skipImage = true
-							break
-						}
-					}
-
-					if skipImage {
-						skipImage = false
-						continue
-					}
-					alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
-
-					var imageName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(imageLink))
-
-					response, err := http.Get(imageLink)
-					if err != nil {
-						logger.Error("Failed to get image %s", imageLink)
-						continue
-					}
-
-					imageFile, err := os.Create(filepath.Join(w.Conf.Save.OutputDir, imageName))
-					if err != nil {
-						logger.Error("Failed to create image file \"%s\": %s", imageName, err)
-						continue
-					}
-
-					_, _ = io.Copy(imageFile, response.Body)
-
-					response.Body.Close()
-					imageFile.Close()
-
-					logger.Info("Outputted \"%s\"", imageName)
-					w.stats.MatchesFound++
-				}
-
+				w.saveContent(config.QueryImages, imageLinks, pageURL)
 				if len(imageLinks) > 0 {
 					savePage = true
 				}
+			case config.QueryVideos:
+				// search for videos
+				// find video URLs, output videos to the files while not saving already outputted ones
+				videoLinks := web.FindPageVideos(pageData, pageURL)
+				w.saveContent(config.QueryVideos, videoLinks, pageURL)
+				if len(videoLinks) > 0 {
+					savePage = true
+				}
+			case config.QueryAudio:
+				// search for audio
+				// find audio URLs, output audio to the file while not saving already outputted ones
+				audioLinks := web.FindPageAudio(pageData, pageURL)
+				w.saveContent(config.QueryAudio, audioLinks, pageURL)
+				if len(audioLinks) > 0 {
+					savePage = true
+				}

@@ -284,7 +309,6 @@ func (w *Worker) Work() {
 			// save page
 			if savePage {
 				w.savePage(pageURL, pageData)
-				w.stats.PagesSaved++
 			}
 
 			// sleep before the next request
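`saveContent` deduplicates links with a linear scan over `alreadyProcessedUrls`; for pages with many links a set gives O(1) membership checks. A hypothetical alternative, not part of the commit:

```go
package main

import "fmt"

// dedupLinks keeps the first occurrence of every link, using a map as a set
// instead of rescanning a slice for each element
func dedupLinks(links []string) []string {
	seen := make(map[string]struct{}, len(links))
	unique := make([]string, 0, len(links))
	for _, link := range links {
		if _, ok := seen[link]; ok {
			continue
		}
		seen[link] = struct{}{}
		unique = append(unique, link)
	}
	return unique
}

func main() {
	fmt.Println(dedupLinks([]string{"a.mp3", "b.mp3", "a.mp3"})) // [a.mp3 b.mp3]
}
```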
