From b256d8a83e552d18460b7f78b4e7c26123c50a85 Mon Sep 17 00:00:00 2001
From: Unbewohnte
Date: Sun, 12 Feb 2023 16:29:19 +0300
Subject: [PATCH] No more unified text output file. Text searches of different kinds go into their own files

---
 README.md            | 55 ++++++++++++++++++++++++++++++++-----
 src/config/config.go | 10 +++----
 src/main.go          | 64 ++++++++++++++++++++++++--------------------
 src/worker/worker.go |  4 +--
 4 files changed, 89 insertions(+), 44 deletions(-)

diff --git a/README.md b/README.md
index 10d1aaa..3a75664 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,20 @@
-# Wecr - simple web crawler
+# Wecr - versatile WEb CRawler
 
 ## Overview
 
-Just a simple HTML web spider with no dependencies. It is possible to search for pages with a text on them or for the text itself, extract images, video, audio and save pages that satisfy the criteria along the way.
+A simple HTML web spider with no dependencies. It can search for pages that contain certain text or for the text itself, extract images, video and audio, and save pages that satisfy the criteria along the way.
 
 ## Configuration
 
-The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `-wDir` (working directory) flag is set to some other value. To see al available flags run `wecr -h`.
+The flow of work fully depends on the configuration file. By default `conf.json` is used as the configuration file, but the name can be changed via the `-conf` flag. The default configuration is embedded in the program, so on the first launch, or after simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `-wdir` (working directory) flag is set to some other value. To see all available flags run `wecr -h`.
 
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them.
 
 The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains` or are NOT in `allowed_domains`. If all initial pages are happen to be on blacklisted domains or are not in the allowed list - the program will get stuck. It is important to note that `*_domains` should be specified with an existing scheme (ie: https://en.wikipedia.org). Subdomains and ports **matter**: `https://unbewohnte.su:3000/` and `https://unbewohnte.su/` are **different**.
 
-Previous versions stored the entire visit queue in memory, resulting in gigabytes of memory usage but as of `v0.2.4` it is possible to offload the queue to the persistent storage via `in_memory_visit_queue` option (`false` by default).
+Previous versions stored the entire visit queue in memory, resulting in gigabytes of memory usage, but as of `v0.2.4` it is possible to offload the queue to persistent storage via the `in_memory_visit_queue` option (`false` by default).
+
+You can change the search `query` at **runtime** via the web dashboard if `launch_dashboard` is set to `true`.
 
 ### Search query
 
@@ -31,17 +33,58 @@ When `is_regexp` is enabled, the `query` is treated as a regexp string and pages
 
 By default, if the query is not something of special values all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images`, `videos`, `audio`, etc. - the additional contents will be put in the corresponding directories inside `output_dir`, which is neatly created by the executable's side.
 
-The output almost certainly contains some duplicates and is not easy to work with programmatically, so you can use `-extractData` with the output JSON file argument (like `output.json`, which is the default output file name) to extract the actual data, filter out the duplicates and put each entry on its new line in a new text file.
+The output almost certainly contains some duplicates and is not easy to work with programmatically, so you can use `-extractData` with the output JSON file argument (like `found_text.json`, which is the default output file name for simple text searches) to extract the actual data, filter out the duplicates and put each entry on its own line in a new text file.
 
 ## Build
 
 If you're on *nix - it's as easy as `make`.
 
-Otherwise - `go build` in the `src` directory to build `wecr`.
+Otherwise - `go build` in the `src` directory to build `wecr`. No dependencies.
 
 ## Examples
 
 See [page on my website](https://unbewohnte.su/wecr) for some basic examples.
 
+Dump of a basic configuration:
+
+```json
+{
+    "search": {
+        "is_regexp": true,
+        "query": "(sequence to search)|(other sequence)"
+    },
+    "requests": {
+        "request_wait_timeout_ms": 2500,
+        "request_pause_ms": 100,
+        "content_fetch_timeout_ms": 0,
+        "user_agent": ""
+    },
+    "depth": 90,
+    "workers": 30,
+    "initial_pages": [
+        "https://en.wikipedia.org/wiki/Main_Page"
+    ],
+    "allowed_domains": [
+        "https://en.wikipedia.org/"
+    ],
+    "blacklisted_domains": [
+        ""
+    ],
+    "in_memory_visit_queue": false,
+    "web_dashboard": {
+        "launch_dashboard": true,
+        "port": 13370
+    },
+    "save": {
+        "output_dir": "scraped",
+        "save_pages": false
+    },
+    "logging": {
+        "output_logs": true,
+        "logs_file": "logs.log"
+    }
+}
+```
+
 ## License
 AGPLv3
\ No newline at end of file
diff --git a/src/config/config.go b/src/config/config.go
index 4eae654..d84ad5b 100644
--- a/src/config/config.go
+++ b/src/config/config.go
@@ -47,9 +47,8 @@ type Search struct {
 }
 
 type Save struct {
-	OutputDir  string `json:"output_dir"`
-	OutputFile string `json:"output_file"`
-	SavePages  bool   `json:"save_pages"`
+	OutputDir string `json:"output_dir"`
+	SavePages bool   `json:"save_pages"`
 }
 
 type Requests struct {
@@ -92,9 +91,8 @@ func Default() *Conf {
 			Query:    "",
 		},
 		Save: Save{
-			OutputDir:  "scraped",
-			SavePages:  false,
-			OutputFile: "scraped.json",
+			OutputDir: "scraped",
+			SavePages: false,
 		},
 		Requests: Requests{
 			UserAgent: "",
diff --git a/src/main.go b/src/main.go
index 9276b2f..1888573 100644
--- a/src/main.go
+++ b/src/main.go
@@ -40,13 +40,14 @@ import (
 	"unbewohnte/wecr/worker"
 )
 
-const version = "v0.3.1"
+const version = "v0.3.2"
 
 const (
-	defaultConfigFile           string = "conf.json"
-	defaultOutputFile           string = "output.json"
-	defaultPrettifiedOutputFile string = "extracted_data.txt"
-	defaultVisitQueueFile       string = "visit_queue.tmp"
+	configFilename               string = "conf.json"
+	prettifiedTextOutputFilename string = "extracted_data.txt"
+	visitQueueFilename           string = "visit_queue.tmp"
+	textOutputFilename           string = "found_text.json"
+	emailsOutputFilename         string = "found_emails.json"
 )
 
 var (
@@ -61,15 +62,10 @@ var (
 	)
 
 	configFile = flag.String(
-		"conf", defaultConfigFile,
+		"conf", configFilename,
 		"Configuration file name to create|look for",
 	)
 
-	outputFile = flag.String(
-		"out", defaultOutputFile,
-		"Output file name to output information into",
-	)
-
 	extractDataFilename = flag.String(
 		"extractData", "",
 		"Set filename for output JSON file and extract data from it, put each entry nicely on a new line in a new file, then exit",
 	)
 
 	workingDirectory string
 	configFilePath   string
-	outputFilePath   string
 )
 
 func init() {
@@ -126,20 +121,17 @@ func init() {
 	// extract data if needed
 	if strings.TrimSpace(*extractDataFilename) != "" {
 		logger.Info("Extracting data from %s...", *extractDataFilename)
-		err := utilities.ExtractDataFromOutput(*extractDataFilename, defaultPrettifiedOutputFile, "\n", false)
+		err := utilities.ExtractDataFromOutput(*extractDataFilename, prettifiedTextOutputFilename, "\n", false)
 		if err != nil {
 			logger.Error("Failed to extract data from %s: %s", *extractDataFilename, err)
 			os.Exit(1)
 		}
-		logger.Info("Outputted \"%s\"", defaultPrettifiedOutputFile)
+		logger.Info("Outputted \"%s\"", prettifiedTextOutputFilename)
 		os.Exit(0)
 	}
 
 	// global path to configuration file
 	configFilePath = filepath.Join(workingDirectory, *configFile)
-
-	// global path to output file
-	outputFilePath = filepath.Join(workingDirectory, *outputFile)
 }
 
 func main() {
@@ -249,7 +241,7 @@ func main() {
 		logger.Warning("User agent is not set. Forced to \"%s\"", conf.Requests.UserAgent)
 	}
 
-	// create output directories and corresponding specialized ones
+	// create output directory and corresponding specialized ones, text output files
 	if !filepath.IsAbs(conf.Save.OutputDir) {
 		conf.Save.OutputDir = filepath.Join(workingDirectory, conf.Save.OutputDir)
 	}
@@ -289,6 +281,20 @@ func main() {
 		return
 	}
 
+	textOutputFile, err := os.Create(filepath.Join(conf.Save.OutputDir, textOutputFilename))
+	if err != nil {
+		logger.Error("Failed to create text output file: %s", err)
+		return
+	}
+	defer textOutputFile.Close()
+
+	emailsOutputFile, err := os.Create(filepath.Join(conf.Save.OutputDir, emailsOutputFilename))
+	if err != nil {
+		logger.Error("Failed to create email addresses output file: %s", err)
+		return
+	}
+	defer emailsOutputFile.Close()
+
 	switch conf.Search.Query {
 	case config.QueryEmail:
 		logger.Info("Looking for email addresses")
@@ -315,14 +321,6 @@ func main() {
 		}
 	}
 
-	// create output file
-	outputFile, err := os.Create(outputFilePath)
-	if err != nil {
-		logger.Error("Failed to create output file: %s", err)
-		return
-	}
-	defer outputFile.Close()
-
 	// create logs if needed
 	if conf.Logging.OutputLogs {
 		if conf.Logging.LogsFile != "" {
@@ -354,14 +352,14 @@ func main() {
 	var visitQueueFile *os.File = nil
 	if !conf.InMemoryVisitQueue {
 		var err error
-		visitQueueFile, err = os.Create(filepath.Join(workingDirectory, defaultVisitQueueFile))
+		visitQueueFile, err = os.Create(filepath.Join(workingDirectory, visitQueueFilename))
 		if err != nil {
 			logger.Error("Could not create visit queue temporary file: %s", err)
 			return
 		}
 		defer func() {
 			visitQueueFile.Close()
-			os.Remove(filepath.Join(workingDirectory, defaultVisitQueueFile))
+			os.Remove(filepath.Join(workingDirectory, visitQueueFilename))
 		}()
 	}
 
@@ -443,13 +441,21 @@ func main() {
 		}()
 	}
 
-	// get text results and write them to the output file (files are handled by each worker separately)
+	// get text results and write them to the output file (found files are handled by each worker separately)
+	var outputFile *os.File
 	for {
 		result, ok := <-results
 		if !ok {
 			break
 		}
 
+		// as it is possible to change configuration "on the fly" - it's better to not mess up different outputs
+		if result.Search.Query == config.QueryEmail {
+			outputFile = emailsOutputFile
+		} else {
+			outputFile = textOutputFile
+		}
+
 		// each entry in output file is a self-standing JSON object
 		entryBytes, err := json.MarshalIndent(result, " ", "\t")
 		if err != nil {
diff --git a/src/worker/worker.go b/src/worker/worker.go
index abc7f89..078900d 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -375,7 +375,6 @@ func (w *Worker) Work() {
 				}
 				logger.Info("Found matches: %+v", matches)
 				w.stats.MatchesFound += uint64(len(matches))
-				savePage = true
 			}
 
 		case false:
@@ -384,11 +383,10 @@ func (w *Worker) Work() {
 				w.Results <- web.Result{
 					PageURL: job.URL,
 					Search:  job.Search,
-					Data:    nil,
+					Data:    []string{job.Search.Query},
 				}
 				logger.Info("Found \"%s\" on page", job.Search.Query)
 				w.stats.MatchesFound++
-				savePage = true
 			}
 		}
 