diff --git a/.gitignore b/.gitignore index 1cec9a4..10565a6 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ websurf conf_mega_ita.json wecr release/ -scraped/ \ No newline at end of file +scraped/ +extracted_data.txt \ No newline at end of file diff --git a/Makefile b/Makefile index f232d8f..528ea23 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,9 @@ TESTDIR:=testing RELEASEDIR:=release LICENSE:=COPYING README:=README.md +UTILITIESDIR:=utilities +UTILITYEXTRACTOR:=extractor +BUILDDIR:=build LINUXDIR:=$(EXE)_linux WINDIR:=$(EXE)_windows @@ -17,7 +20,7 @@ WINDIR64:=$(WINDIR)_x64 DARWINDIR64:=$(DARWINDIR)_x64 -all: +all: clean cd $(SRCDIR) && go build && mv $(EXE) .. test: all @@ -27,9 +30,9 @@ test: all cp conf.json $(TESTDIR) clean: - rm -rf $(TESTDIR) $(EXE) + rm -rf $(TESTDIR) $(EXE) $(RELEASEDIR) -release: +release: clean rm -rf $(RELEASEDIR) mkdir -p $(RELEASEDIR)/$(LINUXDIR64) diff --git a/README.md b/README.md index 29e49e9..0fa4faf 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Just a simple HTML web spider with minimal dependencies. It is possible to searc ## Configuration -The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `-wDir` (working directory) flag is set to some other value. +The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `-wDir` (working directory) flag is set to some other value. To see all available flags run `wecr -h`. 
The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them. @@ -28,12 +28,11 @@ When `is_regexp` is enabled, the `query` is treated as a regexp string and pages By default, if the query is not something of special values all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images`, `videos`, `audio`, etc. - the additional contents will be put in the corresponding directories inside `output_dir`, which is neatly created by the executable's side. -## TODO +## Build -- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - Done -- Search for videos - Done -- Search for audio - Done -- Search for documents +If you're on *nix - it's as easy as `make`. + +Otherwise - `go build` in the `src` directory to build `wecr`. 
## License AGPLv3 \ No newline at end of file diff --git a/src/config/config.go b/src/config/config.go index 5fbd4d8..c6c7af2 100644 --- a/src/config/config.go +++ b/src/config/config.go @@ -89,7 +89,7 @@ func Default() *Conf { }, Requests: Requests{ UserAgent: "", - RequestWaitTimeoutMs: 1500, + RequestWaitTimeoutMs: 2500, RequestPauseMs: 100, ContentFetchTimeoutMs: 0, }, diff --git a/src/main.go b/src/main.go index 7b718ca..204ac5c 100644 --- a/src/main.go +++ b/src/main.go @@ -32,6 +32,7 @@ import ( "time" "unbewohnte/wecr/config" "unbewohnte/wecr/logger" + "unbewohnte/wecr/utilities" "unbewohnte/wecr/web" "unbewohnte/wecr/worker" ) @@ -39,8 +40,9 @@ import ( const version = "v0.2.1" const ( - defaultConfigFile string = "conf.json" - defaultOutputFile string = "output.json" + defaultConfigFile string = "conf.json" + defaultOutputFile string = "output.json" + defaultPrettifiedOutputFile string = "extracted_data.txt" ) var ( @@ -64,6 +66,11 @@ var ( "Output file name to output information into", ) + extractDataFilename = flag.String( + "extractData", "", + "Set filename for output JSON file and extract data from it, put each entry nicely on a new line in a new file, then exit", + ) + workingDirectory string configFilePath string outputFilePath string @@ -113,6 +120,18 @@ func init() { logger.Info("Working in \"%s\"", workingDirectory) + // extract data if needed + if strings.TrimSpace(*extractDataFilename) != "" { + logger.Info("Extracting data from %s...", *extractDataFilename) + err := utilities.ExtractDataFromOutput(*extractDataFilename, defaultPrettifiedOutputFile, "\n", false) + if err != nil { + logger.Error("Failed to extract data from %s: %s", *extractDataFilename, err) + os.Exit(1) + } + logger.Info("Outputted \"%s\"", defaultPrettifiedOutputFile) + os.Exit(0) + } + // global path to configuration file configFilePath = filepath.Join(workingDirectory, *configFile) diff --git a/src/utilities/dataextractor.go b/src/utilities/dataextractor.go new file mode 
100644 index 0000000..338a8cd --- /dev/null +++ b/src/utilities/dataextractor.go @@ -0,0 +1,78 @@ +/* + Wecr - crawl the web for data + Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. +*/ + +package utilities + +import ( + "encoding/json" + "fmt" + "io" + "os" + "unbewohnte/wecr/web" +) + +// Extracts data from the output JSON file and puts it in a new file with separators between each entry +func ExtractDataFromOutput(inputFilename string, outputFilename string, separator string, keepDuplicates bool) error { + inputFile, err := os.Open(inputFilename) + if err != nil { + return err + } + defer inputFile.Close() + + outputFile, err := os.Create(outputFilename) + if err != nil { + return err + } + defer outputFile.Close() + + var processedData []string + + decoder := json.NewDecoder(inputFile) + for { + var result web.Result + + err := decoder.Decode(&result) + if err == io.EOF { + break + } + if err != nil { + return err + } + + for _, dataEntry := range result.Data { + var skip = false + if !keepDuplicates { + for _, processedEntry := range processedData { + if dataEntry == processedEntry { + skip = true + break + } + } + + if skip { + continue + } + processedData = append(processedData, dataEntry) + } + + outputFile.WriteString(fmt.Sprintf("%s%s", dataEntry, separator)) + } + } + + return nil +} diff --git a/src/web/text.go 
b/src/web/text.go index e52f58d..c032015 100644 --- a/src/web/text.go +++ b/src/web/text.go @@ -34,9 +34,9 @@ var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|' // matches src="link" or even something along the lines of SrC = 'link' var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`) -// var emailRegexp *regexp.Regexp = regexp.MustCompile(`[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}`) +var emailRegexp *regexp.Regexp = regexp.MustCompile(`[A-Za-z0-9._%+\-!%&?~^#$]+@[A-Za-z0-9.\-]+\.[a-zA-Z]{2,4}`) -var emailRegexp *regexp.Regexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*") +// var emailRegexp *regexp.Regexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*") // Fix relative link and construct an absolute one. Does nothing if the URL already looks alright func ResolveLink(url *url.URL, fromHost string) string { @@ -158,8 +158,8 @@ func FindPageEmailsWithCheck(pageBody []byte) []string { emailAddresses := FindPageEmails(pageBody) for _, email := range emailAddresses { - _, err := net.LookupMX(strings.Split(email, "@")[1]) - if err != nil { + mx, err := net.LookupMX(strings.Split(email, "@")[1]) + if err != nil || len(mx) == 0 { continue }