Compare commits

...

23 Commits

Author SHA1 Message Date
Kasianov Nikolai Alekseevich 722f3fb536 Fixed emails being saved to a wrong file under query=everything; improved page saving process; fixed pages being saved not considering the actual setting; Added non-link resolving variation of FindPageLinks; Added query=archive functionality; working directory is now an actual working directory instead an executables directory 2 years ago
Kasianov Nikolai Alekseevich c91986d42d Dashboard: Stop|Resume worker pools work at runtime 2 years ago
Kasianov Nikolai Alekseevich c2ec2073dc Removed unnecessary results channel 2 years ago
Kasianov Nikolai Alekseevich 812fd2adf7 Moved up until now separate text saving code to the worker package where it should be 2 years ago
Kasianov Nikolai Alekseevich b256d8a83e No more unified text output file. Text searches of different kinds go into their own files 2 years ago
Kasianov Nikolai Alekseevich e5af2939cc Web dashboard: ability to change runtime query 2 years ago
Kasianov Nikolai Alekseevich 6fab9031b1 Web dashboard foundation 2 years ago
Kasianov Nikolai Alekseevich fd484c665e Documents search 2 years ago
Kasianov Nikolai Alekseevich 00bc33d5de Updated README 2 years ago
Kasianov Nikolai Alekseevich f96bad448a Removed debug logging from visit queue functions 2 years ago
Kasianov Nikolai Alekseevich d877a483a2 In-file memory queue! No more insane RAM consumption 2 years ago
Kasianov Nikolai Alekseevich 023c2e5a19 Removed forgotten pprof stuf 2 years ago
Kasianov Nikolai Alekseevich 1771d19b82 Added new information to README 2 years ago
Kasianov Nikolai Alekseevich 5150edc41c Real pages/sec calculation and output 2 years ago
Kasianov Nikolai Alekseevich d4888dab92 Bumped data channels capacity 2 years ago
Kasianov Nikolai Alekseevich 793c2b2a70 HUGE FIX: partially fixed abnormal memory consumption via actually closing http connections; Side-effect: no more what I thought was "bandwidth-throttling" 2 years ago
Kasianov Nikolai Alekseevich 7dbf55cb32 extractData flag 2 years ago
Kasianov Nikolai Alekseevich 963c3b3ca4 Better email address regexp; MX lookup check 2 years ago
Kasianov Nikolai Alekseevich d81f732b82 Removed links search; Added email search 2 years ago
Kasianov Nikolai Alekseevich 91112b89ba NO DEPENDENCIES !; Audio, and video search; separate timeout for file fetching 2 years ago
Kasianov Nikolai Alekseevich 36b887f372 Fixed allowed domains being broken when an empty string was specified in the configuration file 2 years ago
Kasianov Nikolai Alekseevich b84af90d08 Fixed allowed_domains not being used 2 years ago
Kasianov Nikolai Alekseevich 7fa094d80f Fixed a terrible logical "bug" in worker`s Work function; Added comments; Added more image extentions; Merged and improved ResolveLink function 2 years ago
26 changed files (number of changed lines in parentheses):

  1. .gitignore (2)
  2. Makefile (8)
  3. README.md (86)
  4. src/config/config.go (45)
  5. src/dashboard/dashboard.go (161)
  6. src/dashboard/res/index.html (223)
  7. src/dashboard/res/static/bootstrap.css (11783)
  8. src/dashboard/res/static/bootstrap.css.map (1)
  9. src/dashboard/res/static/stylesheet.css (0)
  10. src/go.mod (2)
  11. src/go.sum (2)
  12. src/logger/logger.go (5)
  13. src/main.go (320)
  14. src/queue/visitqueue.go (54)
  15. src/utilities/dataextractor.go (78)
  16. src/web/audio.go (44)
  17. src/web/documents.go (45)
  18. src/web/extentions.go (174)
  19. src/web/images.go (81)
  20. src/web/job.go (7)
  21. src/web/requests.go (46)
  22. src/web/result.go (1)
  23. src/web/text.go (187)
  24. src/web/videos.go (44)
  25. src/worker/pool.go (30)
  26. src/worker/worker.go (396)

2
.gitignore vendored

@ -7,3 +7,5 @@ conf_mega_ita.json
wecr
release/
scraped/
extracted_data.txt
visit_queue.tmp

8
Makefile

@ -4,6 +4,9 @@ TESTDIR:=testing
RELEASEDIR:=release
LICENSE:=COPYING
README:=README.md
UTILITIESDIR:=utilities
UTILITYEXTRACTOR:=extractor
BUILDDIR:=build
LINUXDIR:=$(EXE)_linux
WINDIR:=$(EXE)_windows
@ -11,7 +14,6 @@ DARWINDIR:=$(EXE)_darwin
LINUXDIR32:=$(LINUXDIR)_x32
WINDIR32:=$(WINDIR)_x32
DARWINDIR32:=$(DARWINDIR)_x32
LINUXDIR64:=$(LINUXDIR)_x64
WINDIR64:=$(WINDIR)_x64
@ -28,9 +30,9 @@ test: all
cp conf.json $(TESTDIR)
clean:
rm -rf $(TESTDIR) $(EXE)
rm -rf $(TESTDIR) $(EXE) $(RELEASEDIR)
release:
release: clean
rm -rf $(RELEASEDIR)
mkdir -p $(RELEASEDIR)/$(LINUXDIR64)

86
README.md

@ -1,33 +1,91 @@
# Wecr - simple web crawler
# Wecr - versatile WEb CRawler
## Overview
Just a simple HTML web spider with minimal dependencies. It is possible to search for pages with a text on them or for the text itself, extract images and save pages that satisfy the criteria along the way.
A simple HTML web spider with no dependencies. It can search for pages that contain a given text (or for the text itself), extract images, video and audio, and save the pages that satisfy the criteria along the way.
## Configuration
## Configuration Overview
The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `wDir` (working directory) flag is set to some other value.
The flow of work fully depends on the configuration file. By default `conf.json` is used as the configuration file, but the name can be changed via the `-conf` flag. The default configuration is embedded in the program, so on the first launch (or if the file is simply deleted) a new `conf.json` will be created in the working directory, unless the `-wdir` (working directory) flag points somewhere else, in which case that location takes precedence. To see all available flags run `wecr -h`.
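As an illustrative sketch only (not code from the repository), this is roughly how the first-launch behavior can be expressed with the `Default`, `CreateConfigFile` and `OpenConfigFile` helpers from `src/config/config.go` shown later in this diff; error handling is kept minimal.

```go
// Illustrative sketch: create conf.json with the embedded defaults on first
// launch, then open it. Mirrors the behavior described above.
package main

import (
	"os"
	"path/filepath"

	"unbewohnte/wecr/config"
	"unbewohnte/wecr/logger"
)

func main() {
	workingDirectory, err := os.Getwd()
	if err != nil {
		logger.Error("Failed to determine working directory: %s", err)
		return
	}
	confPath := filepath.Join(workingDirectory, "conf.json")

	// no configuration file yet: write the embedded defaults
	if _, err := os.Stat(confPath); os.IsNotExist(err) {
		if err := config.CreateConfigFile(*config.Default(), confPath); err != nil {
			logger.Error("Failed to create default configuration: %s", err)
			return
		}
	}

	conf, err := config.OpenConfigFile(confPath)
	if err != nil {
		logger.Error("Failed to open configuration: %s", err)
		return
	}
	logger.Info("Crawling with %d workers to depth %d", conf.Workers, conf.Depth)
}
```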
The configuration is split into different branches such as `requests` (how requests are made, i.e. request timeout, wait time, user agent), `logging` (whether to log and whether to output to a file), `save` (output file and directory, whether to save pages) and `search` (use regexp, query string), each of which contains tweakable parameters. There are global ones as well, such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory, so no attribute-by-attribute explanation is needed for most of them.
The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains` or are NOT in `allowed_domains`. If all initial pages are happen to be on blacklisted domains or are not in the allowed list - the program will get stuck.
The parsing starts from `initial_pages` and goes deeper while ignoring pages on domains that are in `blacklisted_domains` or are NOT in `allowed_domains`. If all initial pages happen to be on blacklisted domains, or are not in the allowed list, the program will get stuck. It is important to note that `*_domains` should be specified with an explicit scheme (e.g. https://en.wikipedia.org). Subdomains and ports **matter**: `https://unbewohnte.su:3000/` and `https://unbewohnte.su/` are **different**.
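To see why the exact host matters, note that domains are compared by the `Host` that `url.Parse` reports, which is what the sanitization added to `src/main.go` further down in this diff relies on; a small illustrative sketch:

```go
// Illustrative sketch: ports and subdomains are part of the parsed Host,
// and an entry without a scheme parses with an empty Host, making it unusable.
package main

import (
	"fmt"
	"net/url"
)

func main() {
	for _, raw := range []string{
		"https://unbewohnte.su/",
		"https://unbewohnte.su:3000/",
		"https://en.wikipedia.org/",
		"en.wikipedia.org", // no scheme: Host stays empty after parsing
	} {
		parsed, err := url.Parse(raw)
		if err != nil {
			fmt.Printf("%-30s -> parse error: %s\n", raw, err)
			continue
		}
		fmt.Printf("%-30s -> host %q, scheme %q\n", raw, parsed.Host, parsed.Scheme)
	}
}
```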
Previous versions stored the entire visit queue in memory, resulting in gigabytes of memory usage; as of `v0.2.4` it is possible to offload the queue to persistent storage via the `in_memory_visit_queue` option (`false` by default).
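For reference, a hedged sketch of how the file-backed queue added in this changeset (`src/queue/visitqueue.go`, shown further down) is used: jobs are appended to a temporary file as JSON objects and popped back from its end. The file name and job values here are illustrative.

```go
// Illustrative sketch: push a job into the on-disk visit queue and pop it back.
package main

import (
	"fmt"
	"os"

	"unbewohnte/wecr/config"
	"unbewohnte/wecr/queue"
	"unbewohnte/wecr/web"
)

func main() {
	queueFile, err := os.Create("visit_queue.tmp")
	if err != nil {
		panic(err)
	}
	defer os.Remove("visit_queue.tmp")
	defer queueFile.Close()

	job := web.Job{
		URL:    "https://en.wikipedia.org/wiki/Main_Page",
		Search: config.Search{Query: "archive"},
		Depth:  5,
	}
	if err := queue.InsertNewJob(queueFile, job); err != nil {
		panic(err)
	}

	next, err := queue.PopLastJob(queueFile)
	if err != nil || next == nil {
		panic("expected a queued job")
	}
	fmt.Printf("next URL to visit: %s (depth %d)\n", next.URL, next.Depth)
}
```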
You can change the search `query` at **runtime** via the web dashboard if `launch_dashboard` is set to `true`.
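The runtime change goes through the dashboard's `/conf` endpoint (see `src/dashboard/dashboard.go` below). A hedged sketch of switching the query from another Go program, assuming a locally running instance on the default port `13370`:

```go
// Illustrative sketch: change the search query of a running wecr instance via
// the dashboard's /conf endpoint. The handler only applies search.is_regexp,
// a non-empty search.query and logging.output_logs; everything else is ignored.
package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	payload := []byte(`{"search": {"is_regexp": false, "query": "images"}}`)

	resp, err := http.Post(
		"http://localhost:13370/conf",
		"application/json",
		bytes.NewReader(payload),
	)
	if err != nil {
		fmt.Println("dashboard not reachable:", err)
		return
	}
	defer resp.Body.Close()
	fmt.Println("dashboard responded with", resp.Status)
}
```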
### Search query
if `is_regexp` is `false`, then `query` is the text to be searched for, but there are some special values:
There are some special `query` values to control the flow of work:
- `email` - tells wecr to scrape email addresses and output to `output_file`
- `images` - find all images on pages and output to the corresponding directory in `output_dir` (**IMPORTANT**: set `content_fetch_timeout_ms` to `0` so the images (and other content below) load fully)
- `videos` - find and fetch files that look like videos
- `audio` - find and fetch files that look like audio
- `documents` - find and fetch files that look like a document
- `everything` - find and fetch images, audio, video, documents and email addresses
- `archive` - no text to be searched, save every visited page
When `is_regexp` is enabled, `query` is treated as a regular expression (in Go "flavor") and pages are scanned for anything that matches it.
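Since the pattern is compiled with Go's standard `regexp` package (RE2 syntax), matching behaves exactly as in the small sketch below, which mirrors `FindPageRegexp` from `src/web/text.go`; the pattern and page body are made up for illustration.

```go
// Illustrative sketch: apply a regexp query to a page body the way wecr does.
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// equivalent to "query": "(wiki\\w+)|(encyclop[ae]dia)" with "is_regexp": true
	query := regexp.MustCompile(`(wiki\w+)|(encyclop[ae]dia)`)

	pageBody := []byte(`<p>Wikipedia is a free online encyclopedia hosted by wikimedia.</p>`)

	matches := query.FindAllString(string(pageBody), -1)
	fmt.Println(matches) // [encyclopedia wikimedia]
}
```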
### Data Output
If the query is not one of the special values, all text matches are written to the `found_text.json` file in `output_dir` as separate, continuous JSON objects; if `save_pages` is set to `true` and/or `query` is set to `images`, `videos`, `audio`, etc., the additional contents are also put in the corresponding directories inside `output_dir`, which is created in the working directory (or, if the `-wdir` flag is set, there). If `output_dir` happens to be empty, contents are written directly to the working directory.
The output almost certainly contains some duplicates and is not easy to work with programmatically, so you can use `-extractData` with the output JSON file as an argument (like `found_text.json`, which is the default output file name for simple text searches) to extract the actual data, filter out the duplicates and put each entry on its own line in a new text file.
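Under the hood the output file is a stream of JSON objects, so it can be consumed with a `json.Decoder`; the sketch below is essentially what `-extractData` (`utilities.ExtractDataFromOutput`, added in this changeset) does, minus the file plumbing. The input file name is just the default one.

```go
// Illustrative sketch: stream found_text.json, keep unique data entries and
// print one per line, similar to what -extractData writes to a text file.
package main

import (
	"encoding/json"
	"fmt"
	"io"
	"os"

	"unbewohnte/wecr/web"
)

func main() {
	inputFile, err := os.Open("found_text.json")
	if err != nil {
		panic(err)
	}
	defer inputFile.Close()

	seen := map[string]bool{}
	decoder := json.NewDecoder(inputFile)
	for {
		var result web.Result
		if err := decoder.Decode(&result); err == io.EOF {
			break
		} else if err != nil {
			panic(err)
		}
		for _, entry := range result.Data {
			if !seen[entry] {
				seen[entry] = true
				fmt.Println(entry)
			}
		}
	}
}
```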
## Build
- `links` - tells `wecr` to search for all links there are on the page
- `images` - find all image links and output to the `output_dir` (**IMPORTANT**: set `wait_timeout_ms` to `0` so the images load fully)
If you're on *nix - it's as easy as `make`.
When `is_regexp` is enabled, the `query` is treated as a regexp string and pages will be scanned for matches that satisfy it.
Otherwise - `go build` in the `src` directory to build `wecr`. No dependencies.
### Output
## Examples
By default, if the query is not `images` all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images` - the additional contents will be put in the `output_dir` directory neatly created by the executable's side.
See [a page on my website](https://unbewohnte.su/wecr) for some basic examples.
## TODO
Dump of a basic configuration:
- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)**
```json
{
"search": {
"is_regexp": true,
"query": "(sequence to search)|(other sequence)"
},
"requests": {
"request_wait_timeout_ms": 2500,
"request_pause_ms": 100,
"content_fetch_timeout_ms": 0,
"user_agent": ""
},
"depth": 90,
"workers": 30,
"initial_pages": [
"https://en.wikipedia.org/wiki/Main_Page"
],
"allowed_domains": [
"https://en.wikipedia.org/"
],
"blacklisted_domains": [
""
],
"in_memory_visit_queue": false,
"web_dashboard": {
"launch_dashboard": true,
"port": 13370
},
"save": {
"output_dir": "scraped",
"save_pages": false
},
"logging": {
"output_logs": true,
"logs_file": "logs.log"
}
}
```
## License
AGPLv3
wecr is distributed under the AGPLv3 license

45
src/config/config.go

@ -1,6 +1,6 @@
/*
Wecr - crawl the web for data
Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
@ -25,8 +25,21 @@ import (
)
const (
QueryLinks string = "links"
QueryImages string = "images"
QueryVideos string = "videos"
QueryAudio string = "audio"
QueryEmail string = "email"
QueryDocuments string = "documents"
QueryEverything string = "everything"
QueryArchive string = "archive"
)
const (
SavePagesDir string = "pages"
SaveImagesDir string = "images"
SaveVideosDir string = "videos"
SaveAudioDir string = "audio"
SaveDocumentsDir string = "documents"
)
type Search struct {
@ -36,13 +49,13 @@ type Search struct {
type Save struct {
OutputDir string `json:"output_dir"`
OutputFile string `json:"save_file"`
SavePages bool `json:"save_pages"`
}
type Requests struct {
WaitTimeoutMs uint64 `json:"wait_timeout_ms"`
RequestWaitTimeoutMs uint64 `json:"request_wait_timeout_ms"`
RequestPauseMs uint64 `json:"request_pause_ms"`
ContentFetchTimeoutMs uint64 `json:"content_fetch_timeout_ms"`
UserAgent string `json:"user_agent"`
}
@ -51,6 +64,12 @@ type Logging struct {
LogsFile string `json:"logs_file"`
}
type WebDashboard struct {
UseDashboard bool `json:"launch_dashboard"`
Port uint16 `json:"port"`
}
// Configuration file structure
type Conf struct {
Search Search `json:"search"`
Requests Requests `json:"requests"`
@ -59,10 +78,13 @@ type Conf struct {
InitialPages []string `json:"initial_pages"`
AllowedDomains []string `json:"allowed_domains"`
BlacklistedDomains []string `json:"blacklisted_domains"`
InMemoryVisitQueue bool `json:"in_memory_visit_queue"`
Dashboard WebDashboard `json:"web_dashboard"`
Save Save `json:"save"`
Logging Logging `json:"logging"`
}
// Default configuration file structure
func Default() *Conf {
return &Conf{
Search: Search{
@ -72,18 +94,23 @@ func Default() *Conf {
Save: Save{
OutputDir: "scraped",
SavePages: false,
OutputFile: "scraped.json",
},
Requests: Requests{
UserAgent: "",
WaitTimeoutMs: 1500,
RequestWaitTimeoutMs: 2500,
RequestPauseMs: 100,
ContentFetchTimeoutMs: 0,
},
InitialPages: []string{""},
Depth: 5,
Workers: 20,
AllowedDomains: []string{""},
BlacklistedDomains: []string{""},
InMemoryVisitQueue: false,
Dashboard: WebDashboard{
UseDashboard: true,
Port: 13370,
},
Logging: Logging{
OutputLogs: true,
LogsFile: "logs.log",
@ -91,8 +118,9 @@ func Default() *Conf {
}
}
// Write current configuration to w
func (c *Conf) WriteTo(w io.Writer) error {
jsonData, err := json.MarshalIndent(c, "", " ")
jsonData, err := json.MarshalIndent(c, " ", "\t")
if err != nil {
return err
}
@ -105,6 +133,7 @@ func (c *Conf) WriteTo(w io.Writer) error {
return nil
}
// Read configuration from r
func (c *Conf) ReadFrom(r io.Reader) error {
jsonData, err := io.ReadAll(r)
if err != nil {
@ -119,6 +148,7 @@ func (c *Conf) ReadFrom(r io.Reader) error {
return nil
}
// Creates configuration file at path
func CreateConfigFile(conf Conf, path string) error {
confFile, err := os.Create(path)
if err != nil {
@ -134,6 +164,7 @@ func CreateConfigFile(conf Conf, path string) error {
return nil
}
// Tries to open configuration file at path. If it fails - returns default configuration
func OpenConfigFile(path string) (*Conf, error) {
confFile, err := os.Open(path)
if err != nil {

161
src/dashboard/dashboard.go

@ -0,0 +1,161 @@
/*
Wecr - crawl the web for data
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package dashboard
import (
"embed"
"encoding/json"
"fmt"
"html/template"
"io"
"io/fs"
"net/http"
"unbewohnte/wecr/config"
"unbewohnte/wecr/logger"
"unbewohnte/wecr/worker"
)
type Dashboard struct {
Server *http.Server
}
//go:embed res
var resFS embed.FS
type PageData struct {
Conf config.Conf
Stats worker.Statistics
}
type PoolStop struct {
Stop bool `json:"stop"`
}
func NewDashboard(port uint16, webConf *config.Conf, pool *worker.Pool) *Dashboard {
mux := http.NewServeMux()
res, err := fs.Sub(resFS, "res")
if err != nil {
logger.Error("Failed to Sub embedded dashboard FS: %s", err)
return nil
}
mux.Handle("/static/", http.FileServer(http.FS(res)))
mux.HandleFunc("/", func(w http.ResponseWriter, req *http.Request) {
template, err := template.ParseFS(res, "*.html")
if err != nil {
logger.Error("Failed to parse embedded dashboard FS: %s", err)
return
}
template.ExecuteTemplate(w, "index.html", nil)
})
mux.HandleFunc("/stop", func(w http.ResponseWriter, req *http.Request) {
var stop PoolStop
requestBody, err := io.ReadAll(req.Body)
if err != nil {
http.Error(w, "Failed to read request body", http.StatusInternalServerError)
logger.Error("Failed to read stop|resume signal from dashboard request: %s", err)
return
}
defer req.Body.Close()
err = json.Unmarshal(requestBody, &stop)
if err != nil {
http.Error(w, "Failed to unmarshal stop|resume signal", http.StatusInternalServerError)
logger.Error("Failed to unmarshal stop|resume signal from dashboard UI: %s", err)
return
}
if stop.Stop {
// stop worker pool
pool.Stop()
logger.Info("Stopped worker pool via request from dashboard")
} else {
// resume work
pool.Work()
logger.Info("Resumed work via request from dashboard")
}
})
mux.HandleFunc("/stats", func(w http.ResponseWriter, req *http.Request) {
jsonStats, err := json.MarshalIndent(pool.Stats, "", " ")
if err != nil {
http.Error(w, "Failed to marshal statistics", http.StatusInternalServerError)
logger.Error("Failed to marshal stats to send to the dashboard: %s", err)
return
}
w.Header().Add("Content-type", "application/json")
w.Write(jsonStats)
})
mux.HandleFunc("/conf", func(w http.ResponseWriter, req *http.Request) {
switch req.Method {
case http.MethodPost:
var newConfig config.Conf
defer req.Body.Close()
newConfigData, err := io.ReadAll(req.Body)
if err != nil {
http.Error(w, "Failed to read request body", http.StatusInternalServerError)
logger.Error("Failed to read new configuration from dashboard request: %s", err)
return
}
err = json.Unmarshal(newConfigData, &newConfig)
if err != nil {
http.Error(w, "Failed to unmarshal new configuration", http.StatusInternalServerError)
logger.Error("Failed to unmarshal new configuration from dashboard UI: %s", err)
return
}
// DO NOT blindly replace global configuration. Manually check and replace values
webConf.Search.IsRegexp = newConfig.Search.IsRegexp
if len(newConfig.Search.Query) != 0 {
webConf.Search.Query = newConfig.Search.Query
}
webConf.Logging.OutputLogs = newConfig.Logging.OutputLogs
default:
jsonConf, err := json.MarshalIndent(webConf, "", " ")
if err != nil {
http.Error(w, "Failed to marshal configuration", http.StatusInternalServerError)
logger.Error("Failed to marshal current configuration to send to the dashboard UI: %s", err)
return
}
w.Header().Add("Content-type", "application/json")
w.Write(jsonConf)
}
})
server := &http.Server{
Addr: fmt.Sprintf(":%d", port),
Handler: mux,
}
return &Dashboard{
Server: server,
}
}
func (board *Dashboard) Launch() error {
return board.Server.ListenAndServe()
}

223
src/dashboard/res/index.html

@ -0,0 +1,223 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Wecr dashboard</title>
<!-- <link rel="icon" href="/static/icon.png"> -->
<link rel="stylesheet" href="/static/bootstrap.css">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
</head>
<body class="d-flex flex-column h-100">
<div class="container">
<header class="d-flex flex-wrap justify-content-center py-3 mb-4 border-bottom">
<a href="/" class="d-flex align-items-center mb-3 mb-md-0 me-md-auto text-dark text-decoration-none">
<svg class="bi me-2" width="40" height="32">
<use xlink:href="#bootstrap"></use>
</svg>
<strong class="fs-4">Wecr</strong>
</a>
<ul class="nav nav-pills">
<li class="nav-item"><a href="/stats" class="nav-link">Stats</a></li>
<li class="nav-item"><a href="/conf" class="nav-link">Config</a></li>
</ul>
</header>
</div>
<div class="container">
<h1>Dashboard</h1>
<div style="height: 3rem;"></div>
<div class="container">
<h2>Statistics</h2>
<div id="statistics">
<ol class="list-group list-group-numbered">
<li class="list-group-item d-flex justify-content-between align-items-start">
<div class="ms-2 me-auto">
<div class="fw-bold">Pages visited</div>
</div>
<span class="badge bg-primary rounded-pill" id="pages_visited">0</span>
</li>
<li class="list-group-item d-flex justify-content-between align-items-start">
<div class="ms-2 me-auto">
<div class="fw-bold">Matches found</div>
</div>
<span class="badge bg-primary rounded-pill" id="matches_found">0</span>
</li>
<li class="list-group-item d-flex justify-content-between align-items-start">
<div class="ms-2 me-auto">
<div class="fw-bold">Pages saved</div>
</div>
<span class="badge bg-primary rounded-pill" id="pages_saved">0</span>
</li>
<li class="list-group-item d-flex justify-content-between align-items-start">
<div class="ms-2 me-auto">
<div class="fw-bold">Start time</div>
</div>
<span class="badge bg-primary rounded-pill" id="start_time_unix">0</span>
</li>
<li class="list-group-item d-flex justify-content-between align-items-start">
<div class="ms-2 me-auto">
<div class="fw-bold">Stopped</div>
</div>
<span class="badge bg-primary rounded-pill" id="stopped">false</span>
</li>
</ol>
</div>
<button class="btn btn-primary" id="btn_stop">Stop</button>
<button class="btn btn-primary" id="btn_resume" disabled>Resume</button>
</div>
<div style="height: 3rem;"></div>
<div class="container">
<h2>Configuration</h2>
<div>
<b>Make runtime changes to configuration</b>
<table class="table table-borderless">
<tr>
<th>Key</th>
<th>Value</th>
</tr>
<tr>
<th>Query</th>
<th>
<input type="text" id="conf_query">
</th>
</tr>
<tr>
<th>Is regexp</th>
<th>
<input type="text" id="conf_is_regexp">
</th>
</tr>
</table>
<button class="btn btn-primary" id="config_apply_button">
Apply
</button>
</div>
<div style="height: 3rem;"></div>
<pre id="conf_output"></pre>
</div>
</div>
</body>
<script>
window.onload = function () {
let confOutput = document.getElementById("conf_output");
let pagesVisitedOut = document.getElementById("pages_visited");
let matchesFoundOut = document.getElementById("matches_found");
let pagesSavedOut = document.getElementById("pages_saved");
let startTimeOut = document.getElementById("start_time_unix");
let stoppedOut = document.getElementById("stopped");
let applyConfButton = document.getElementById("config_apply_button");
let confQuery = document.getElementById("conf_query");
let confIsRegexp = document.getElementById("conf_is_regexp");
let buttonStop = document.getElementById("btn_stop");
let buttonResume = document.getElementById("btn_resume");
buttonStop.addEventListener("click", (event) => {
buttonStop.disabled = true;
buttonResume.disabled = false;
// stop worker pool
let signal = {
"stop": true,
};
fetch("/stop", {
method: "POST",
headers: {
"Content-type": "application/json",
},
body: JSON.stringify(signal),
});
});
buttonResume.addEventListener("click", (event) => {
buttonResume.disabled = true;
buttonStop.disabled = false;
// resume worker pool's work
let signal = {
"stop": false,
};
fetch("/stop", {
method: "POST",
headers: {
"Content-type": "application/json",
},
body: JSON.stringify(signal),
});
});
applyConfButton.addEventListener("click", (event) => {
let query = String(confQuery.value);
if (confIsRegexp.value === "0") {
isRegexp = false;
} else if (confIsRegexp.value === "1") {
isRegexp = true;
};
if (confIsRegexp.value === "false") {
isRegexp = false;
} else if (confIsRegexp.value === "true") {
isRegexp = true;
};
let newConf = {
"search": {
"is_regexp": isRegexp,
"query": query,
},
};
fetch("/conf", {
method: "POST",
headers: {
"Content-type": "application/json",
},
body: JSON.stringify(newConf),
});
});
const interval = setInterval(function () {
// update statistics
fetch("/stats")
.then((response) => response.json())
.then((statistics) => {
pagesVisitedOut.innerText = statistics.pages_visited;
matchesFoundOut.innerText = statistics.matches_found;
pagesSavedOut.innerText = statistics.pages_saved;
startTimeOut.innerText = new Date(1000 * statistics.start_time_unix);
stoppedOut.innerText = statistics.stopped;
});
// update config
fetch("/conf")
.then((response) => response.text())
.then((config) => {
// "print" whole configuration
confOutput.innerText = config;
// update values in the change table if they're empty
let confJSON = JSON.parse(config);
if (confQuery.value == "") {
confQuery.value = confJSON.search.query;
}
if (confIsRegexp.value == "") {
confIsRegexp.value = confJSON.search.is_regexp;
}
});
}, 650);
}();
</script>
</html>

11783
src/dashboard/res/static/bootstrap.css vendored

File diff suppressed because it is too large

1
src/dashboard/res/static/bootstrap.css.map

File diff suppressed because one or more lines are too long

0
src/dashboard/res/static/stylesheet.css

2
src/go.mod

@ -1,5 +1,3 @@
module unbewohnte/wecr
go 1.18
require golang.org/x/net v0.4.0

2
src/go.sum

@ -1,2 +0,0 @@
golang.org/x/net v0.4.0 h1:Q5QPcMlvfxFTAPV0+07Xz/MpK9NTXu2VDUuy0FeMfaU=
golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE=

5
src/logger/logger.go

@ -50,6 +50,11 @@ func SetOutput(writer io.Writer) {
errorLog.SetOutput(writer)
}
// Get current logger's output writer
func GetOutput() io.Writer {
return infoLog.Writer()
}
// Log information
func Info(format string, a ...interface{}) {
infoLog.Printf(format, a...)

320
src/main.go

@ -1,6 +1,6 @@
/*
Wecr - crawl the web for data
Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
@ -19,7 +19,6 @@
package main
import (
"encoding/json"
"flag"
"fmt"
"io"
@ -28,18 +27,26 @@ import (
"os"
"os/signal"
"path/filepath"
"strings"
"sync"
"time"
"unbewohnte/wecr/config"
"unbewohnte/wecr/dashboard"
"unbewohnte/wecr/logger"
"unbewohnte/wecr/queue"
"unbewohnte/wecr/utilities"
"unbewohnte/wecr/web"
"unbewohnte/wecr/worker"
)
const version = "v0.1.2"
const version = "v0.3.5"
const (
defaultConfigFile string = "conf.json"
defaultOutputFile string = "output.json"
configFilename string = "conf.json"
prettifiedTextOutputFilename string = "extracted_data.txt"
visitQueueFilename string = "visit_queue.tmp"
textOutputFilename string = "found_text.json"
emailsOutputFilename string = "found_emails.json"
)
var (
@ -54,58 +61,76 @@ var (
)
configFile = flag.String(
"conf", defaultConfigFile,
"conf", configFilename,
"Configuration file name to create|look for",
)
outputFile = flag.String(
"out", defaultOutputFile,
"Output file name to output information into",
extractDataFilename = flag.String(
"extractData", "",
"Specify previously outputted JSON file and extract data from it, put each entry nicely on a new line in a new file, exit afterwards",
)
workingDirectory string
configFilePath string
outputFilePath string
)
func init() {
// set log output
logger.SetOutput(os.Stdout)
// and work around random log prints by /x/net library
// make default http logger silent
log.SetOutput(io.Discard)
log.SetFlags(0)
// parse and process flags
flag.Parse()
if *printVersion {
fmt.Printf(
"Wecr %s - crawl the web for data\n(c) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n",
"Wecr %s - crawl the web for data\n(c) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n",
version,
)
os.Exit(0)
}
// print logo
logger.GetOutput().Write([]byte(
`
`),
)
logger.GetOutput().Write([]byte(version + " by Unbewohnte\n\n"))
// work out working directory path
if *wDir != "" {
workingDirectory = *wDir
} else {
exePath, err := os.Executable()
wdir, err := os.Getwd()
if err != nil {
logger.Error("Failed to determine executable's path: %s", err)
logger.Error("Failed to determine working directory path: %s", err)
return
}
workingDirectory = filepath.Dir(exePath)
workingDirectory = wdir
}
logger.Info("Working in \"%s\"", workingDirectory)
// extract data if needed
if strings.TrimSpace(*extractDataFilename) != "" {
logger.Info("Extracting data from %s...", *extractDataFilename)
err := utilities.ExtractDataFromOutput(*extractDataFilename, prettifiedTextOutputFilename, "\n", false)
if err != nil {
logger.Error("Failed to extract data from %s: %s", *extractDataFilename, err)
os.Exit(1)
}
logger.Info("Outputted \"%s\"", prettifiedTextOutputFilename)
os.Exit(0)
}
// global path to configuration file
configFilePath = filepath.Join(workingDirectory, *configFile)
// global path to output file
outputFilePath = filepath.Join(workingDirectory, *outputFile)
}
func main() {
@ -131,30 +156,6 @@ func main() {
}
logger.Info("Successfully opened configuration file")
// create logs if needed
if conf.Logging.OutputLogs {
if conf.Logging.LogsFile != "" {
// output logs to a file
logFile, err := os.Create(filepath.Join(workingDirectory, conf.Logging.LogsFile))
if err != nil {
logger.Error("Failed to create logs file: %s", err)
return
}
defer logFile.Close()
logger.Info("Outputting logs to %s", conf.Logging.LogsFile)
logger.SetOutput(logFile)
} else {
// output logs to stdout
logger.Info("Outputting logs to stdout")
logger.SetOutput(os.Stdout)
}
} else {
// no logging needed
logger.Info("No logs will be outputted")
logger.SetOutput(nil)
}
// sanitize and correct inputs
if len(conf.InitialPages) == 0 {
logger.Error("No initial page URLs have been set")
@ -164,25 +165,49 @@ func main() {
return
}
for index, blacklistedDomain := range conf.BlacklistedDomains {
var sanitizedBlacklistedDomains []string
for _, blacklistedDomain := range conf.BlacklistedDomains {
if strings.TrimSpace(blacklistedDomain) == "" {
continue
}
parsedURL, err := url.Parse(blacklistedDomain)
if err != nil {
logger.Warning("Failed to parse blacklisted %s: %s", blacklistedDomain, err)
logger.Warning("Failed to parse blacklisted \"%s\": %s", blacklistedDomain, err)
continue
}
conf.BlacklistedDomains[index] = parsedURL.Hostname()
if parsedURL.Scheme == "" {
// parsing is invalid, as stdlib says
logger.Warning("Failed to parse blacklisted \"%s\": no scheme specified", blacklistedDomain)
continue
}
sanitizedBlacklistedDomains = append(sanitizedBlacklistedDomains, parsedURL.Host)
}
conf.BlacklistedDomains = sanitizedBlacklistedDomains
var sanitizedAllowedDomains []string
for _, allowedDomain := range conf.AllowedDomains {
if strings.TrimSpace(allowedDomain) == "" {
continue
}
for index, allowedDomain := range conf.AllowedDomains {
parsedURL, err := url.Parse(allowedDomain)
if err != nil {
logger.Warning("Failed to parse allowed %s: %s", allowedDomain, err)
logger.Warning("Failed to parse allowed \"%s\": %s", allowedDomain, err)
continue
}
if parsedURL.Scheme == "" {
// parsing is invalid, as stdlib says
logger.Warning("Failed to parse allowed \"%s\": no scheme specified", allowedDomain)
continue
}
conf.AllowedDomains[index] = parsedURL.Hostname()
sanitizedAllowedDomains = append(sanitizedAllowedDomains, parsedURL.Host)
}
conf.AllowedDomains = sanitizedAllowedDomains
if conf.Depth <= 0 {
conf.Depth = 1
@ -204,6 +229,7 @@ func main() {
logger.Warning("User agent is not set. Forced to \"%s\"", conf.Requests.UserAgent)
}
// create output directory and corresponding specialized ones, text output files
if !filepath.IsAbs(conf.Save.OutputDir) {
conf.Save.OutputDir = filepath.Join(workingDirectory, conf.Save.OutputDir)
}
@ -213,11 +239,70 @@ func main() {
return
}
err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SavePagesDir), os.ModePerm)
if err != nil {
logger.Error("Failed to create output directory for pages: %s", err)
return
}
err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveImagesDir), os.ModePerm)
if err != nil {
logger.Error("Failed to create output directory for images: %s", err)
return
}
err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveVideosDir), os.ModePerm)
if err != nil {
logger.Error("Failed to create output directory for video: %s", err)
return
}
err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveAudioDir), os.ModePerm)
if err != nil {
logger.Error("Failed to create output directory for audio: %s", err)
return
}
err = os.MkdirAll(filepath.Join(conf.Save.OutputDir, config.SaveDocumentsDir), os.ModePerm)
if err != nil {
logger.Error("Failed to create output directory for documents: %s", err)
return
}
textOutputFile, err := os.Create(filepath.Join(conf.Save.OutputDir, textOutputFilename))
if err != nil {
logger.Error("Failed to create text output file: %s", err)
return
}
defer textOutputFile.Close()
emailsOutputFile, err := os.Create(filepath.Join(conf.Save.OutputDir, emailsOutputFilename))
if err != nil {
logger.Error("Failed to create email addresses output file: %s", err)
return
}
defer emailsOutputFile.Close()
switch conf.Search.Query {
case config.QueryLinks:
logger.Info("Looking for links")
case config.QueryEmail:
logger.Info("Looking for email addresses")
case config.QueryImages:
logger.Info("Looking for images")
logger.Info("Looking for images (%+s)", web.ImageExtentions)
case config.QueryVideos:
logger.Info("Looking for videos (%+s)", web.VideoExtentions)
case config.QueryAudio:
logger.Info("Looking for audio (%+s)", web.AudioExtentions)
case config.QueryDocuments:
logger.Info("Looking for documents (%+s)", web.DocumentExtentions)
case config.QueryArchive:
logger.Info("Archiving every visited page")
case config.QueryEverything:
logger.Info("Looking for email addresses, images, videos, audio and various documents (%+s - %+s - %+s - %+s)",
web.ImageExtentions,
web.VideoExtentions,
web.AudioExtentions,
web.DocumentExtentions,
)
default:
if conf.Search.IsRegexp {
logger.Info("Looking for RegExp matches (%s)", conf.Search.Query)
@ -226,48 +311,97 @@ func main() {
}
}
// create output file
outputFile, err := os.Create(outputFilePath)
// create visit queue file if not turned off
var visitQueueFile *os.File = nil
if !conf.InMemoryVisitQueue {
var err error
visitQueueFile, err = os.Create(filepath.Join(workingDirectory, visitQueueFilename))
if err != nil {
logger.Error("Failed to create output file: %s", err)
logger.Error("Could not create visit queue temporary file: %s", err)
return
}
defer outputFile.Close()
// prepare channels
jobs := make(chan web.Job, conf.Workers*5)
results := make(chan web.Result, conf.Workers*5)
defer func() {
visitQueueFile.Close()
os.Remove(filepath.Join(workingDirectory, visitQueueFilename))
}()
}
// create initial jobs
initialJobs := make(chan web.Job, conf.Workers*5)
if !conf.InMemoryVisitQueue {
for _, initialPage := range conf.InitialPages {
jobs <- web.Job{
var newJob web.Job = web.Job{
URL: initialPage,
Search: conf.Search,
Depth: conf.Depth,
}
err = queue.InsertNewJob(visitQueueFile, newJob)
if err != nil {
logger.Error("Failed to encode an initial job to the visit queue: %s", err)
continue
}
}
visitQueueFile.Seek(0, io.SeekStart)
} else {
for _, initialPage := range conf.InitialPages {
initialJobs <- web.Job{
URL: initialPage,
Search: conf.Search,
Depth: conf.Depth,
}
}
}
// Prepare global statistics variable
statistics := worker.Statistics{}
// form a worker pool
workerPool := worker.NewWorkerPool(jobs, results, conf.Workers, worker.WorkerConf{
Requests: conf.Requests,
Save: conf.Save,
workerPool := worker.NewWorkerPool(initialJobs, conf.Workers, &worker.WorkerConf{
Search: &conf.Search,
Requests: &conf.Requests,
Save: &conf.Save,
BlacklistedDomains: conf.BlacklistedDomains,
})
AllowedDomains: conf.AllowedDomains,
VisitQueue: worker.VisitQueue{
VisitQueue: visitQueueFile,
Lock: &sync.Mutex{},
},
EmailsOutput: emailsOutputFile,
TextOutput: textOutputFile,
}, &statistics)
logger.Info("Created a worker pool with %d workers", conf.Workers)
// set up graceful shutdown
sig := make(chan os.Signal, 1)
signal.Notify(sig, os.Interrupt)
go func() {
<-sig
logger.Info("Received interrupt signal. Exiting...")
// open dashboard if needed
var board *dashboard.Dashboard = nil
if conf.Dashboard.UseDashboard {
board = dashboard.NewDashboard(conf.Dashboard.Port, conf, workerPool)
go board.Launch()
logger.Info("Launched dashboard at http://localhost:%d", conf.Dashboard.Port)
}
// stop workers
workerPool.Stop()
// create and redirect logs if needed
if conf.Logging.OutputLogs {
if conf.Logging.LogsFile != "" {
// output logs to a file
logFile, err := os.Create(filepath.Join(workingDirectory, conf.Logging.LogsFile))
if err != nil {
logger.Error("Failed to create logs file: %s", err)
return
}
defer logFile.Close()
// close results channel
close(results)
}()
logger.Info("Outputting logs to %s", conf.Logging.LogsFile)
logger.SetOutput(logFile)
} else {
// output logs to stdout
logger.Info("Outputting logs to stdout")
logger.SetOutput(os.Stdout)
}
} else {
// no logging needed
logger.Info("No further logs will be outputted")
logger.SetOutput(nil)
}
// launch concurrent scraping !
workerPool.Work()
@ -276,34 +410,30 @@ func main() {
// if logs are not used or are printed to the file - output a nice statistics message on the screen
if !conf.Logging.OutputLogs || (conf.Logging.OutputLogs && conf.Logging.LogsFile != "") {
go func() {
var lastPagesVisited uint64 = 0
fmt.Printf("\n")
for {
time.Sleep(time.Second)
timeSince := time.Since(workerPool.Stats.StartTime).Round(time.Second)
fmt.Fprintf(os.Stdout, "\r[%s] %d pages; %d matches (%d pages/sec)",
timeSince := time.Since(time.Unix(int64(statistics.StartTimeUnix), 0)).Round(time.Second)
fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d pages saved; %d matches (%d pages/sec)",
timeSince.String(),
workerPool.Stats.PagesVisited,
workerPool.Stats.MatchesFound,
workerPool.Stats.PagesVisited/uint64(timeSince.Seconds()),
statistics.PagesVisited,
statistics.PagesSaved,
statistics.MatchesFound,
statistics.PagesVisited-lastPagesVisited,
)
lastPagesVisited = statistics.PagesVisited
}
}()
}
// get results and write them to the output file
for {
result, ok := <-results
if !ok {
break
}
// set up graceful shutdown
sig := make(chan os.Signal, 1)
signal.Notify(sig, os.Interrupt)
<-sig
logger.Info("Received interrupt signal. Exiting...")
entryBytes, err := json.MarshalIndent(result, "", " ")
if err != nil {
continue
}
outputFile.Write(entryBytes)
outputFile.Write([]byte("\n"))
}
// stop workers
workerPool.Stop()
}

54
src/queue/visitqueue.go

@ -0,0 +1,54 @@
package queue
import (
"encoding/json"
"io"
"os"
"unbewohnte/wecr/web"
)
func PopLastJob(queue *os.File) (*web.Job, error) {
stats, err := queue.Stat()
if err != nil {
return nil, err
}
if stats.Size() == 0 {
return nil, nil
}
// find the last job in the queue
var job web.Job
var offset int64 = -1
for {
currentOffset, err := queue.Seek(offset, io.SeekEnd)
if err != nil {
return nil, err
}
decoder := json.NewDecoder(queue)
err = decoder.Decode(&job)
if err != nil || job.URL == "" || job.Search.Query == "" {
offset -= 1
continue
}
queue.Truncate(currentOffset)
return &job, nil
}
}
func InsertNewJob(queue *os.File, newJob web.Job) error {
_, err := queue.Seek(0, io.SeekEnd)
if err != nil {
return err
}
encoder := json.NewEncoder(queue)
err = encoder.Encode(&newJob)
if err != nil {
return err
}
return nil
}

78
src/utilities/dataextractor.go

@ -0,0 +1,78 @@
/*
Wecr - crawl the web for data
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package utilities
import (
"encoding/json"
"fmt"
"io"
"os"
"unbewohnte/wecr/web"
)
// Extracts data from the output JSON file and puts it in a new file with separators between each entry
func ExtractDataFromOutput(inputFilename string, outputFilename string, separator string, keepDuplicates bool) error {
inputFile, err := os.Open(inputFilename)
if err != nil {
return err
}
defer inputFile.Close()
outputFile, err := os.Create(outputFilename)
if err != nil {
return err
}
defer outputFile.Close()
var processedData []string
decoder := json.NewDecoder(inputFile)
for {
var result web.Result
err := decoder.Decode(&result)
if err == io.EOF {
break
}
if err != nil {
return err
}
for _, dataEntry := range result.Data {
var skip = false
if !keepDuplicates {
for _, processedEntry := range processedData {
if dataEntry == processedEntry {
skip = true
break
}
}
if skip {
continue
}
processedData = append(processedData, dataEntry)
}
outputFile.WriteString(fmt.Sprintf("%s%s", dataEntry, separator))
}
}
return nil
}

44
src/web/audio.go

@ -0,0 +1,44 @@
/*
Wecr - crawl the web for data
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package web
import (
"net/url"
)
// Tries to find audio URLs on the page
func FindPageAudio(pageBody []byte, from url.URL) []url.URL {
var urls []url.URL
// for every element that has "src" attribute
for _, link := range FindPageSrcLinks(pageBody, from) {
if HasAudioExtention(link.EscapedPath()) {
urls = append(urls, link)
}
}
// for every "a" element as well
for _, link := range FindPageLinks(pageBody, from) {
if HasAudioExtention(link.EscapedPath()) {
urls = append(urls, link)
}
}
return urls
}

45
src/web/documents.go

@ -0,0 +1,45 @@
/*
Wecr - crawl the web for data
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package web
import (
"net/url"
)
// Tries to find docs' URLs on the page
func FindPageDocuments(pageBody []byte, from url.URL) []url.URL {
var urls []url.URL
// for every element that has "src" attribute
for _, link := range FindPageSrcLinks(pageBody, from) {
if HasDocumentExtention(link.EscapedPath()) {
urls = append(urls, link)
}
}
// for every "a" element as well
for _, link := range FindPageLinks(pageBody, from) {
if HasDocumentExtention(link.EscapedPath()) {
urls = append(urls, link)
}
}
// return discovered doc urls
return urls
}

174
src/web/extentions.go

@ -0,0 +1,174 @@
/*
Wecr - crawl the web for data
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package web
import "strings"
var AudioExtentions = []string{
".3gp",
".aa",
".aac",
".aax",
".act",
".aiff",
".alac",
".amr",
".ape",
".au",
".flac",
".m4a",
".mp3",
".mpc",
".msv",
".ogg",
".oga",
".mogg",
".opus",
".tta",
".wav",
".cda",
}
var ImageExtentions = []string{
".jpeg",
".jpg",
".jpe",
".jfif",
".png",
".ppm",
".svg",
".gif",
".tiff",
".bmp",
".webp",
".ico",
".kra",
".bpg",
".drw",
".tga",
".kra",
}
var VideoExtentions = []string{
".webm",
".mkv",
".flv",
".wmv",
".avi",
".yuv",
".mp2",
".mp4",
".mpeg",
".mpg",
".mpv",
".m4v",
".3gp",
".3g2",
".nsv",
".vob",
".ogv",
}
var DocumentExtentions = []string{
".pdf",
".doc",
".docx",
".epub",
".fb2",
".pub",
".ppt",
".pptx",
".txt",
".tex",
".odt",
".bib",
".ps",
".dwg",
".lyx",
".key",
".ott",
".odf",
".odc",
".ppg",
".xlc",
".latex",
".c",
".cpp",
".sh",
".go",
".java",
".cs",
".rs",
".lua",
".php",
".py",
".pl",
".lua",
".kt",
".rb",
".asm",
".rar",
".tar",
".db",
".7z",
".zip",
".gbr",
".tex",
".ttf",
".ttc",
".woff",
".otf",
".exif",
}
func HasImageExtention(urlPath string) bool {
for _, extention := range ImageExtentions {
if strings.HasSuffix(urlPath, extention) {
return true
}
}
return false
}
func HasDocumentExtention(urlPath string) bool {
for _, extention := range DocumentExtentions {
if strings.HasSuffix(urlPath, extention) {
return true
}
}
return false
}
func HasVideoExtention(urlPath string) bool {
for _, extention := range VideoExtentions {
if strings.HasSuffix(urlPath, extention) {
return true
}
}
return false
}
func HasAudioExtention(urlPath string) bool {
for _, extention := range AudioExtentions {
if strings.HasSuffix(urlPath, extention) {
return true
}
}
return false
}

81
src/web/images.go

@ -1,6 +1,6 @@
/*
Wecr - crawl the web for data
Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
@ -19,81 +19,26 @@
package web
import (
"bytes"
"fmt"
"strings"
"golang.org/x/net/html"
"net/url"
)
func hasImageExtention(url string) bool {
var extentions []string = []string{
".jpeg",
".jpg",
".jpe",
".png",
".ppm",
".svg",
}
for _, extention := range extentions {
if strings.HasSuffix(url, extention) {
return true
}
}
return false
}
// Tries to find images' URLs on the page
func FindPageImages(pageBody []byte, hostname string) []string {
var urls []string
tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
for {
tokenType := tokenizer.Next()
switch tokenType {
case html.ErrorToken:
return urls
func FindPageImages(pageBody []byte, from url.URL) []url.URL {
var urls []url.URL
case html.StartTagToken:
token := tokenizer.Token()
if token.Data != "img" && token.Data != "a" {
continue
// for every element that has "src" attribute
for _, link := range FindPageSrcLinks(pageBody, from) {
if HasImageExtention(link.EscapedPath()) {
urls = append(urls, link)
}
for _, attribute := range token.Attr {
if attribute.Key != "src" && attribute.Key != "href" {
continue
}
var imageURL string = attribute.Val
if !strings.Contains(imageURL, hostname) {
// add hostname
if strings.HasPrefix(imageURL, "/") && strings.HasSuffix(hostname, "/") {
imageURL = fmt.Sprintf("%s%s", hostname, imageURL[1:])
} else if !strings.HasPrefix(imageURL, "/") && !strings.HasSuffix(hostname, "/") {
imageURL = fmt.Sprintf("%s/%s", hostname, imageURL)
} else {
imageURL = fmt.Sprintf("%s%s", hostname, imageURL)
// for every "a" element as well
for _, link := range FindPageLinks(pageBody, from) {
if HasImageExtention(link.EscapedPath()) {
urls = append(urls, link)
}
}
imageURL = strings.TrimPrefix(imageURL, "//")
if !strings.HasPrefix(imageURL, "http://") && !strings.HasPrefix(imageURL, "https://") {
// add scheme
imageURL = "http://" + imageURL
}
// check for image extention
if hasImageExtention(imageURL) {
urls = append(urls, imageURL)
}
}
}
}
return urls
}

7
src/web/job.go

@ -20,8 +20,9 @@ package web
import "unbewohnte/wecr/config"
// Job to pass around workers
type Job struct {
URL string
Search config.Search
Depth uint
URL string `json:"u"`
Search config.Search `json:"s"`
Depth uint `json:"d"`
}

46
src/web/requests.go

@ -1,6 +1,6 @@
/*
Wecr - crawl the web for data
Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
@ -21,27 +21,65 @@ package web
import (
"io"
"net/http"
"os"
"time"
)
// Get page data coming from url with optional user agent and timeout
func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
http.DefaultClient.CloseIdleConnections()
http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", userAgent)
http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
response, err := http.DefaultClient.Do(req)
if err != nil {
return nil, err
}
defer response.Body.Close()
responseBody, err := io.ReadAll(response.Body)
pageData, err := io.ReadAll(response.Body)
if err != nil {
return nil, err
}
return responseBody, nil
return pageData, nil
}
// Fetch file from url and save to file at filePath
func FetchFile(url string, userAgent string, timeOutMs uint64, filePath string) error {
client := http.Client{}
client.Timeout = time.Duration(timeOutMs)
client.CheckRedirect = http.DefaultClient.CheckRedirect
client.Transport = http.DefaultClient.Transport
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return err
}
req.Header.Set("User-Agent", userAgent)
req.Close = true
response, err := client.Do(req)
if err != nil {
return nil
}
response.Close = true
defer response.Body.Close()
file, err := os.Create(filePath)
if err != nil {
return err
}
defer file.Close()
_, _ = io.Copy(file, response.Body)
client.CloseIdleConnections()
return nil
}
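For reference, a hedged usage sketch of the two helpers above; the URLs, user agent and file name are placeholders, and the `0` timeout means "no timeout", as the README recommends for content fetching.

```go
// Illustrative usage of web.GetPage and web.FetchFile as defined above.
package main

import (
	"fmt"

	"unbewohnte/wecr/logger"
	"unbewohnte/wecr/web"
)

func main() {
	// fetch a page body with a 2500 ms timeout
	pageData, err := web.GetPage("https://en.wikipedia.org/wiki/Main_Page", "wecr-example", 2500)
	if err != nil {
		logger.Error("Failed to get page: %s", err)
		return
	}
	fmt.Printf("got %d bytes of HTML\n", len(pageData))

	// fetch a file to disk; 0 disables the timeout
	err = web.FetchFile("https://example.org/picture.jpg", "wecr-example", 0, "picture.jpg")
	if err != nil {
		logger.Error("Failed to fetch file: %s", err)
	}
}
```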

1
src/web/result.go

@ -20,6 +20,7 @@ package web
import "unbewohnte/wecr/config"
// Result of page parsing
type Result struct {
PageURL string
Search config.Search

187
src/web/text.go

@ -1,6 +1,6 @@
/*
Wecr - crawl the web for data
Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
@ -21,61 +21,139 @@ package web
import (
"bufio"
"bytes"
"fmt"
"net"
"net/mail"
"net/url"
"regexp"
"strings"
"golang.org/x/net/html"
)
func FindPageLinks(pageBody []byte, hostname string) []string {
var urls []string
// matches href="link" or something down bad like hReF = 'link'
var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|')(.*?)("|')`)
tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
for {
tokenType := tokenizer.Next()
// matches src="link" or even something along the lines of SrC = 'link'
var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)
switch tokenType {
case html.ErrorToken:
return urls
var emailRegexp *regexp.Regexp = regexp.MustCompile(`[A-Za-z0-9._%+\-!%&?~^#$]+@[A-Za-z0-9.\-]+\.[a-zA-Z]{2,4}`)
// Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
func ResolveLink(link url.URL, fromHost string) url.URL {
var resolvedURL url.URL = link
if !resolvedURL.IsAbs() {
if resolvedURL.Scheme == "" {
// add scheme
resolvedURL.Scheme = "https"
}
if resolvedURL.Host == "" {
// add host
resolvedURL.Host = fromHost
}
}
case html.StartTagToken:
token := tokenizer.Token()
return resolvedURL
}
// Find all links on page that are specified in href attribute. Do not resolve links. Return URLs as they are on the page
func FindPageLinksDontResolve(pageBody []byte) []url.URL {
var urls []url.URL
for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) {
var linkStartIndex int
var linkEndIndex int
if token.Data != "a" {
linkStartIndex = strings.Index(match, "\"")
if linkStartIndex == -1 {
linkStartIndex = strings.Index(match, "'")
if linkStartIndex == -1 {
continue
}
// recheck
for _, attribute := range token.Attr {
if attribute.Key != "href" {
linkEndIndex = strings.LastIndex(match, "'")
if linkEndIndex == -1 {
continue
}
} else {
linkEndIndex = strings.LastIndex(match, "\"")
if linkEndIndex == -1 {
continue
}
}
var link string = attribute.Val
if linkEndIndex <= linkStartIndex+1 {
continue
}
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
if err != nil {
continue
}
urls = append(urls, *link)
}
return urls
}
// Find all links on page that are specified in href attribute
func FindPageLinks(pageBody []byte, from url.URL) []url.URL {
urls := FindPageLinksDontResolve(pageBody)
for index := 0; index < len(urls); index++ {
urls[index] = ResolveLink(urls[index], from.Host)
}
return urls
}
// Find all links on page that are specified in "src" attribute. Do not resolve ULRs, return them as they are on page
func FindPageSrcLinksDontResolve(pageBody []byte) []url.URL {
var urls []url.URL
for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) {
var linkStartIndex int
var linkEndIndex int
linkStartIndex = strings.Index(match, "\"")
if linkStartIndex == -1 {
linkStartIndex = strings.Index(match, "'")
if linkStartIndex == -1 {
continue
}
if !strings.Contains(link, hostname) {
// add hostname
if strings.HasPrefix(link, "/") && strings.HasSuffix(hostname, "/") {
link = fmt.Sprintf("%s%s", hostname, link[1:])
} else if !strings.HasPrefix(link, "/") && !strings.HasSuffix(hostname, "/") {
link = fmt.Sprintf("%s/%s", hostname, link)
linkEndIndex = strings.LastIndex(match, "'")
if linkEndIndex == -1 {
continue
}
} else {
link = fmt.Sprintf("%s%s", hostname, link)
linkEndIndex = strings.LastIndex(match, "\"")
if linkEndIndex == -1 {
continue
}
}
link = strings.TrimPrefix(link, "//")
if linkEndIndex <= linkStartIndex+1 {
continue
}
if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
// add scheme
link = "http://" + link
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex])
if err != nil {
continue
}
urls = append(urls, link)
urls = append(urls, *link)
}
return urls
}
// Find all links on page that are specified in "src" attribute
func FindPageSrcLinks(pageBody []byte, from url.URL) []url.URL {
urls := FindPageSrcLinksDontResolve(pageBody)
for index := 0; index < len(urls); index++ {
urls[index] = ResolveLink(urls[index], from.Host)
}
return urls
}
// Tries to find a certain string in page. Returns true if such string has been found
@ -103,3 +181,50 @@ func IsTextOnPage(text string, ignoreCase bool, pageBody []byte) bool {
func FindPageRegexp(re *regexp.Regexp, pageBody []byte) []string {
return re.FindAllString(string(pageBody), -1)
}
// Extract clear email addresses on the page
func FindPageEmails(pageBody []byte) []string {
var emailAddresses []string
var skip bool
for _, email := range emailRegexp.FindAllString(string(pageBody), -1) {
skip = false
_, err := mail.ParseAddress(email)
if err != nil {
continue
}
for _, visitedEmail := range emailAddresses {
if email == visitedEmail {
skip = true
break
}
}
if skip {
continue
}
emailAddresses = append(emailAddresses, email)
}
return emailAddresses
}
// Extract clear email addresses on the page and make an MX lookup
func FindPageEmailsWithCheck(pageBody []byte) []string {
var filteredEmailAddresses []string
emailAddresses := FindPageEmails(pageBody)
for _, email := range emailAddresses {
mx, err := net.LookupMX(strings.Split(email, "@")[1])
if err != nil || len(mx) == 0 {
continue
}
filteredEmailAddresses = append(filteredEmailAddresses, email)
}
return filteredEmailAddresses
}
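A short, illustrative usage sketch of the `ResolveLink` helper added in this file: per the implementation above, a relative link is completed with an `https` scheme and the host of the page it was found on, while an absolute link passes through unchanged.

```go
// Illustrative sketch of web.ResolveLink behavior.
package main

import (
	"fmt"
	"net/url"

	"unbewohnte/wecr/web"
)

func main() {
	from, _ := url.Parse("https://en.wikipedia.org/wiki/Main_Page")

	relative, _ := url.Parse("/wiki/Web_crawler")
	fmt.Println(web.ResolveLink(*relative, from.Host).String())
	// https://en.wikipedia.org/wiki/Web_crawler

	absolute, _ := url.Parse("https://unbewohnte.su/wecr")
	fmt.Println(web.ResolveLink(*absolute, from.Host).String())
	// https://unbewohnte.su/wecr (already absolute, unchanged)
}
```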

44
src/web/videos.go

@ -0,0 +1,44 @@
/*
Wecr - crawl the web for data
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package web
import (
"net/url"
)
// Tries to find videos' URLs on the page
func FindPageVideos(pageBody []byte, from url.URL) []url.URL {
var urls []url.URL
// for every element that has "src" attribute
for _, link := range FindPageSrcLinks(pageBody, from) {
if HasVideoExtention(link.EscapedPath()) {
urls = append(urls, link)
}
}
// for every "a" element as well
for _, link := range FindPageLinks(pageBody, from) {
if HasVideoExtention(link.EscapedPath()) {
urls = append(urls, link)
}
}
return urls
}
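HasVideoExtention comes from src/web/extentions.go and is not shown in this compare view. The following is a rough, self-contained illustration of extension-based filtering on url.URL values, with an assumed extension list.

package main

import (
	"fmt"
	"net/url"
	"path"
	"strings"
)

// Assumed to roughly mirror what a check like HasVideoExtention does; the real list differs
var videoExtentions = []string{".mp4", ".webm", ".mkv", ".avi", ".mov"}

func looksLikeVideo(u url.URL) bool {
	ext := strings.ToLower(path.Ext(u.EscapedPath()))
	for _, videoExt := range videoExtentions {
		if ext == videoExt {
			return true
		}
	}
	return false
}

func main() {
	for _, raw := range []string{
		"https://example.com/media/clip.mp4",
		"https://example.com/index.html",
	} {
		u, err := url.Parse(raw)
		if err != nil {
			continue
		}
		fmt.Println(u.Path, looksLikeVideo(*u))
	}
}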

30
src/worker/pool.go

@ -24,25 +24,31 @@ import (
"unbewohnte/wecr/web"
)
// Already visited URLs
type visited struct {
URLs []string
Lock sync.Mutex
}
// Whole worker pool's statistics
type Statistics struct {
PagesVisited uint64 `json:"pages_visited"`
MatchesFound uint64 `json:"matches_found"`
PagesSaved uint64 `json:"pages_saved"`
StartTimeUnix uint64 `json:"start_time_unix"`
Stopped bool `json:"stopped"`
}
// Web-Worker pool
type Pool struct {
workersCount uint
workers []*Worker
visited visited
Stats *Statistics
}
// Create a new worker pool
func NewWorkerPool(initialJobs chan web.Job, workerCount uint, workerConf *WorkerConf, stats *Statistics) *Pool {
var newPool Pool = Pool{
workersCount: workerCount,
workers: nil,
@ -50,24 +56,22 @@ func NewWorkerPool(jobs chan web.Job, results chan web.Result, workerCount uint,
URLs: nil,
Lock: sync.Mutex{},
},
Stats: stats,
}
var i uint
for i = 0; i < workerCount; i++ {
newWorker := NewWorker(initialJobs, workerConf, &newPool.visited, newPool.Stats)
newPool.workers = append(newPool.workers, &newWorker)
}
return &newPool
}
// Notify all workers in pool to start scraping
func (p *Pool) Work() {
p.Stats.StartTimeUnix = uint64(time.Now().Unix())
p.Stats.Stopped = false
for _, worker := range p.workers {
worker.Stopped = false
@ -75,7 +79,9 @@ func (p *Pool) Work() {
}
}
// Notify all workers in pool to stop scraping
func (p *Pool) Stop() {
p.Stats.Stopped = true
for _, worker := range p.workers {
worker.Stopped = true
}
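The pool now shares a single *Statistics value with the dashboard and pauses work by flipping Stopped at runtime. Below is a self-contained sketch of one way to implement such a shared stop/resume flag, using sync/atomic instead of the plain bool field above so the toy example stays race-free; all names here are illustrative, not the worker package's.

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// A stripped-down illustration of the stop/resume idea: workers poll a shared
// flag and skip work while it is set.
type stats struct {
	PagesVisited atomic.Uint64
	Stopped      atomic.Bool
}

func worker(id int, jobs <-chan string, s *stats) {
	for job := range jobs {
		if s.Stopped.Load() {
			continue // skipped while the pool is paused
		}
		s.PagesVisited.Add(1)
		fmt.Printf("worker %d visited %s\n", id, job)
	}
}

func main() {
	s := &stats{}
	jobs := make(chan string)

	for i := 0; i < 2; i++ {
		go worker(i, jobs, s)
	}

	jobs <- "https://example.com/a"
	s.Stopped.Store(true) // "Stop": workers skip incoming jobs
	jobs <- "https://example.com/b"
	s.Stopped.Store(false) // "Resume"
	jobs <- "https://example.com/c"

	time.Sleep(100 * time.Millisecond)
	fmt.Println("pages visited:", s.PagesVisited.Load())
}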

396
src/worker/worker.go

@ -1,6 +1,6 @@
/*
Wecr - crawl the web for data
Copyright (C) 2022, 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
@ -19,40 +19,54 @@
package worker
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path"
"path/filepath"
"regexp"
"strings"
"sync"
"time"
"unbewohnte/wecr/config"
"unbewohnte/wecr/logger"
"unbewohnte/wecr/queue"
"unbewohnte/wecr/web"
)
type VisitQueue struct {
VisitQueue *os.File
Lock *sync.Mutex
}
// Worker configuration
type WorkerConf struct {
Search *config.Search
Requests *config.Requests
Save *config.Save
BlacklistedDomains []string
AllowedDomains []string
VisitQueue VisitQueue
TextOutput io.Writer
EmailsOutput io.Writer
}
// Web worker
type Worker struct {
Jobs chan web.Job
Conf *WorkerConf
visited *visited
stats *Statistics
Stopped bool
}
// Create a new worker
func NewWorker(jobs chan web.Job, conf *WorkerConf, visited *visited, stats *Statistics) Worker {
return Worker{
Jobs: jobs,
Conf: conf,
visited: visited,
stats: stats,
@ -60,63 +74,234 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
}
}
func (w *Worker) saveContent(links []url.URL, pageURL *url.URL) {
var alreadyProcessedUrls []url.URL
for count, link := range links {
// check if this URL has been processed already
var skip bool = false
for _, processedURL := range alreadyProcessedUrls {
if link == processedURL {
skip = true
break
}
}
if skip {
skip = false
continue
}
alreadyProcessedUrls = append(alreadyProcessedUrls, link)
var fileName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(link.Path))
var filePath string
if web.HasImageExtention(link.Path) {
filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveImagesDir, fileName)
} else if web.HasVideoExtention(link.Path) {
filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveVideosDir, fileName)
} else if web.HasAudioExtention(link.Path) {
filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveAudioDir, fileName)
} else if web.HasDocumentExtention(link.Path) {
filePath = filepath.Join(w.Conf.Save.OutputDir, config.SaveDocumentsDir, fileName)
} else {
filePath = filepath.Join(w.Conf.Save.OutputDir, fileName)
}
err := web.FetchFile(
link.String(),
w.Conf.Requests.UserAgent,
w.Conf.Requests.ContentFetchTimeoutMs,
filePath,
)
if err != nil {
logger.Error("Failed to fetch file located at %s: %s", link.String(), err)
return
}
logger.Info("Outputted \"%s\"", fileName)
w.stats.MatchesFound++
}
}
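web.FetchFile is referenced here but lives in src/web/requests.go, outside this section. A stdlib-only sketch of an equivalent helper follows, assuming a millisecond timeout parameter and a User-Agent header, with the response body closed explicitly.

package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"time"
)

// fetchFile is an illustrative equivalent of the web.FetchFile call used above
// (assumed behaviour: GET the URL with a custom User-Agent and timeout, stream
// the body to filePath, and always close the response body).
func fetchFile(url, userAgent string, timeoutMs uint64, filePath string) error {
	client := &http.Client{Timeout: time.Duration(timeoutMs) * time.Millisecond}

	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return err
	}
	req.Header.Set("User-Agent", userAgent)

	resp, err := client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	outfile, err := os.Create(filePath)
	if err != nil {
		return err
	}
	defer outfile.Close()

	_, err = io.Copy(outfile, resp.Body)
	return err
}

func main() {
	if err := fetchFile("https://example.com/", "wecr-sketch", 5000, "example.html"); err != nil {
		fmt.Println("fetch failed:", err)
	}
}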
// Save the page to disk under a corresponding name; download any src files, stylesheets and JS along the way
func (w *Worker) savePage(baseURL url.URL, pageData []byte) {
var findPageFileContentURLs func([]byte) []url.URL = func(pageBody []byte) []url.URL {
var urls []url.URL
for _, link := range web.FindPageLinksDontResolve(pageData) {
if strings.Contains(link.Path, ".css") ||
strings.Contains(link.Path, ".scss") ||
strings.Contains(link.Path, ".js") ||
strings.Contains(link.Path, ".mjs") {
urls = append(urls, link)
}
}
urls = append(urls, web.FindPageSrcLinksDontResolve(pageBody)...)
return urls
}
var cleanLink func(url.URL, url.URL) url.URL = func(link url.URL, from url.URL) url.URL {
resolvedLink := web.ResolveLink(link, from.Host)
cleanLink, err := url.Parse(resolvedLink.Scheme + "://" + resolvedLink.Host + resolvedLink.Path)
if err != nil {
return resolvedLink
}
return *cleanLink
}
// Create directory with all file content on the page
var pageFilesDirectoryName string = fmt.Sprintf(
"%s_%s_files",
baseURL.Host,
strings.ReplaceAll(baseURL.Path, "/", "_"),
)
err := os.MkdirAll(filepath.Join(w.Conf.Save.OutputDir, config.SavePagesDir, pageFilesDirectoryName), os.ModePerm)
if err != nil {
logger.Error("Failed to create directory to store file contents of %s: %s", baseURL.String(), err)
return
}
// Save files on page
srcLinks := findPageFileContentURLs(pageData)
for _, srcLink := range srcLinks {
web.FetchFile(srcLink.String(),
w.Conf.Requests.UserAgent,
w.Conf.Requests.ContentFetchTimeoutMs,
filepath.Join(
w.Conf.Save.OutputDir,
config.SavePagesDir,
pageFilesDirectoryName,
path.Base(srcLink.String()),
),
)
}
// Redirect old content URLs to local files
for _, srcLink := range srcLinks {
cleanLink := cleanLink(srcLink, baseURL)
pageData = bytes.ReplaceAll(
pageData,
[]byte(srcLink.String()),
[]byte("./"+filepath.Join(pageFilesDirectoryName, path.Base(cleanLink.String()))),
)
}
// Create page output file
pageName := fmt.Sprintf(
"%s_%s.html",
baseURL.Host,
strings.ReplaceAll(baseURL.Path, "/", "_"),
)
outfile, err := os.Create(filepath.Join(
filepath.Join(w.Conf.Save.OutputDir, config.SavePagesDir),
pageName,
))
if err != nil {
logger.Error("Failed to create output file: %s", err)
return
}
defer outfile.Close()
outfile.Write(pageData)
logger.Info("Saved \"%s\"", pageName)
w.stats.PagesSaved++
}
const (
textTypeMatch = iota
textTypeEmail = iota
)
// Save text result to an appropriate file
func (w *Worker) saveResult(result web.Result, textType int) {
// write result to the output file
var output io.Writer
switch textType {
case textTypeEmail:
output = w.Conf.EmailsOutput
default:
output = w.Conf.TextOutput
}
// each entry in output file is a self-standing JSON object
entryBytes, err := json.MarshalIndent(result, " ", "\t")
if err != nil {
return
}
output.Write(entryBytes)
output.Write([]byte("\n"))
}
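Each entry written by saveResult is a self-standing, indented JSON object followed by a newline. The small sketch below shows what that serialization produces, with an assumed Result-like shape; the real web.Result field names and tags may differ.

package main

import (
	"encoding/json"
	"os"
)

// Assumed to be shaped roughly like web.Result; field names are illustrative.
type result struct {
	PageURL string   `json:"page_url"`
	Query   string   `json:"query"`
	Data    []string `json:"data"`
}

func main() {
	entry := result{
		PageURL: "https://example.com/contact",
		Query:   "email",
		Data:    []string{"admin@example.com"},
	}

	// Mirrors saveResult: one indented JSON object per entry, terminated by a
	// newline, appended to the output writer.
	entryBytes, err := json.MarshalIndent(entry, " ", "\t")
	if err != nil {
		return
	}
	os.Stdout.Write(entryBytes)
	os.Stdout.Write([]byte("\n"))
}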
// Launch scraping process on this worker
func (w *Worker) Work() {
if w.Stopped {
return
}
for {
var job web.Job
if w.Conf.VisitQueue.VisitQueue != nil {
w.Conf.VisitQueue.Lock.Lock()
newJob, err := queue.PopLastJob(w.Conf.VisitQueue.VisitQueue)
if err != nil {
logger.Error("Failed to get a new job from visit queue: %s", err)
w.Conf.VisitQueue.Lock.Unlock()
continue
}
if newJob == nil {
w.Conf.VisitQueue.Lock.Unlock()
continue
}
job = *newJob
w.Conf.VisitQueue.Lock.Unlock()
} else {
job = <-w.Jobs
}
// check if the worker has been stopped
if w.Stopped {
// stop working
return
}
pageURL, err := url.Parse(job.URL)
if err != nil {
logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
continue
}
var skip bool = false
// see if the domain is allowed and is not blacklisted
if len(w.Conf.AllowedDomains) > 0 {
skip = true
for _, allowedDomain := range w.Conf.AllowedDomains {
if pageURL.Host == allowedDomain {
skip = false
break
}
}
if skip {
logger.Info("Skipped non-allowed %s", job.URL)
continue
}
}
if len(w.Conf.BlacklistedDomains) > 0 {
for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
if pageURL.Host == blacklistedDomain {
skip = true
logger.Info("Skipped blacklisted %s", job.URL)
break
}
}
if skip {
continue
}
}
// check if it is the first occurrence
w.visited.Lock.Lock()
@ -129,6 +314,7 @@ func (w *Worker) Work() {
break
}
}
if skip {
continue
}
@ -140,92 +326,132 @@ func (w *Worker) Work() {
// get page
logger.Info("Visiting %s", job.URL)
pageData, err := web.GetPage(job.URL, w.Conf.Requests.UserAgent, w.Conf.Requests.RequestWaitTimeoutMs)
if err != nil {
logger.Error("Failed to get \"%s\": %s", job.URL, err)
continue
}
// find links
pageLinks := web.FindPageLinks(pageData, *pageURL)
go func() {
if job.Depth > 1 {
// decrement depth and add new jobs
job.Depth--
if w.Conf.VisitQueue.VisitQueue != nil {
// add to the visit queue
w.Conf.VisitQueue.Lock.Lock()
for _, link := range pageLinks {
if link.String() != job.URL {
err = queue.InsertNewJob(w.Conf.VisitQueue.VisitQueue, web.Job{
URL: link.String(),
Search: *w.Conf.Search,
Depth: job.Depth,
})
if err != nil {
logger.Error("Failed to encode a new job to a visit queue: %s", err)
continue
}
}
}
w.Conf.VisitQueue.Lock.Unlock()
} else {
// add to the in-memory channel
for _, link := range pageLinks {
if link.String() != job.URL {
w.Jobs <- web.Job{
URL: link.String(),
Search: *w.Conf.Search,
Depth: job.Depth,
}
}
}
}
}
pageLinks = nil
}()
// process and output result
var savePage bool = false
switch job.Search.Query {
case config.QueryArchive:
savePage = true
case config.QueryImages:
// find image URLs, output images to the file while not saving already outputted ones
imageLinks := web.FindPageImages(pageData, *pageURL)
if len(imageLinks) > 0 {
w.saveContent(imageLinks, pageURL)
savePage = true
}
case config.QueryVideos:
// search for videos
// find video URLs, output videos to the files while not saving already outputted ones
videoLinks := web.FindPageVideos(pageData, *pageURL)
if len(videoLinks) > 0 {
w.saveContent(videoLinks, pageURL)
savePage = true
}
case config.QueryAudio:
// search for audio
// find audio URLs, output audio to the file while not saving already outputted ones
audioLinks := web.FindPageAudio(pageData, *pageURL)
if len(audioLinks) > 0 {
w.saveContent(audioLinks, pageURL)
savePage = true
}
case config.QueryDocuments:
// search for various documents
// find documents URLs, output docs to the file while not saving already outputted ones
docsLinks := web.FindPageDocuments(pageData, *pageURL)
if len(docsLinks) > 0 {
w.saveContent(docsLinks, pageURL)
savePage = true
}
case config.QueryEmail:
// search for email
emailAddresses := web.FindPageEmailsWithCheck(pageData)
if len(emailAddresses) > 0 {
w.saveResult(web.Result{
PageURL: job.URL,
Search: job.Search,
Data: emailAddresses,
}, textTypeEmail)
w.stats.MatchesFound += uint64(len(emailAddresses))
savePage = true
}
case config.QueryEverything:
// search for everything
// files
var contentLinks []url.URL
contentLinks = append(contentLinks, web.FindPageImages(pageData, *pageURL)...)
contentLinks = append(contentLinks, web.FindPageAudio(pageData, *pageURL)...)
contentLinks = append(contentLinks, web.FindPageVideos(pageData, *pageURL)...)
contentLinks = append(contentLinks, web.FindPageDocuments(pageData, *pageURL)...)
w.saveContent(contentLinks, pageURL)
if len(contentLinks) > 0 {
savePage = true
}
// email
emailAddresses := web.FindPageEmailsWithCheck(pageData)
if len(emailAddresses) > 0 {
w.saveResult(web.Result{
PageURL: job.URL,
Search: job.Search,
Data: emailAddresses,
}, textTypeEmail)
w.stats.MatchesFound += uint64(len(emailAddresses))
savePage = true
}
@ -242,38 +468,38 @@ func (w *Worker) Work() {
matches := web.FindPageRegexp(re, pageData)
if len(matches) > 0 {
w.saveResult(web.Result{
PageURL: job.URL,
Search: job.Search,
Data: matches,
}, textTypeMatch)
logger.Info("Found matches: %+v", matches)
w.stats.MatchesFound += uint64(len(matches))
savePage = true
}
case false:
// just text
if web.IsTextOnPage(job.Search.Query, true, pageData) {
w.saveResult(web.Result{
PageURL: job.URL,
Search: job.Search,
Data: []string{job.Search.Query},
}, textTypeMatch)
logger.Info("Found \"%s\" on page", job.Search.Query)
w.stats.MatchesFound++
savePage = true
}
}
}
// save page
if savePage && w.Conf.Save.SavePages {
w.savePage(*pageURL, pageData)
}
pageData = nil
pageURL = nil
// sleep before the next request
time.Sleep(time.Duration(w.Conf.Requests.RequestPauseMs * uint64(time.Millisecond)))
}
}
}
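queue.InsertNewJob and queue.PopLastJob come from src/queue/visitqueue.go and are not part of this section. The sketch below illustrates the general idea of a file-backed visit queue (append JSON-encoded jobs, pop them back under a mutex) using a simplified line-delimited format; the real package's on-disk layout is likely different.

package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"os"
	"sync"
)

// A simplified, line-delimited take on an on-disk visit queue, only meant to
// illustrate keeping pending jobs out of RAM.
type job struct {
	URL   string `json:"url"`
	Depth uint   `json:"depth"`
}

type fileQueue struct {
	file *os.File
	lock sync.Mutex
}

func (q *fileQueue) insert(j job) error {
	q.lock.Lock()
	defer q.lock.Unlock()

	data, err := json.Marshal(j)
	if err != nil {
		return err
	}
	_, err = q.file.Write(append(data, '\n'))
	return err
}

// pop takes the last pending job and rewrites the file without it
func (q *fileQueue) pop() (*job, error) {
	q.lock.Lock()
	defer q.lock.Unlock()

	if _, err := q.file.Seek(0, 0); err != nil {
		return nil, err
	}
	scanner := bufio.NewScanner(q.file)
	var lines [][]byte
	for scanner.Scan() {
		lines = append(lines, append([]byte{}, scanner.Bytes()...))
	}
	if len(lines) == 0 {
		return nil, nil
	}

	var j job
	if err := json.Unmarshal(lines[len(lines)-1], &j); err != nil {
		return nil, err
	}

	if err := q.file.Truncate(0); err != nil {
		return nil, err
	}
	if _, err := q.file.Seek(0, 0); err != nil {
		return nil, err
	}
	for _, line := range lines[:len(lines)-1] {
		q.file.Write(append(line, '\n'))
	}
	return &j, nil
}

func main() {
	f, err := os.CreateTemp("", "visit_queue")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())

	q := &fileQueue{file: f}
	q.insert(job{URL: "https://example.com", Depth: 2})
	next, _ := q.pop()
	fmt.Printf("%+v\n", next)
}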
