extractData flag

master · v0.2.1
commit 7dbf55cb32
7 files changed:

 .gitignore                     |  1
 Makefile                       |  9
 README.md                      | 11
 src/config/config.go           |  2
 src/main.go                    | 19
 src/utilities/dataextractor.go | 78
 src/web/text.go                |  8
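In short: this commit adds a one-shot `-extractData` mode. Pointing it at a previously produced `output.json` makes wecr write every collected data entry, one per line and de-duplicated, into `extracted_data.txt` and exit without crawling. Along the way it raises the default request wait timeout from 1500 ms to 2500 ms, swaps the e-mail regexp for a stricter RFC-flavored one backed by a live MX-record check, teaches the Makefile's `clean` and `release` targets to manage the release directory, and ignores the new output file.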

.gitignore (vendored): 1 change

@@ -7,3 +7,4 @@ conf_mega_ita.json
 wecr
 release/
 scraped/
+extracted_data.txt

Makefile: 9 changes

@@ -4,6 +4,9 @@ TESTDIR:=testing
 RELEASEDIR:=release
 LICENSE:=COPYING
 README:=README.md
+UTILITIESDIR:=utilities
+UTILITYEXTRACTOR:=extractor
+BUILDDIR:=build
 LINUXDIR:=$(EXE)_linux
 WINDIR:=$(EXE)_windows

@@ -17,7 +20,7 @@ WINDIR64:=$(WINDIR)_x64
 DARWINDIR64:=$(DARWINDIR)_x64
-all:
+all: clean
 	cd $(SRCDIR) && go build && mv $(EXE) ..
 test: all

@@ -27,9 +30,9 @@ test: all
 	cp conf.json $(TESTDIR)
 clean:
-	rm -rf $(TESTDIR) $(EXE)
+	rm -rf $(TESTDIR) $(EXE) $(RELEASEDIR)
-release:
+release: clean
 	rm -rf $(RELEASEDIR)
 	mkdir -p $(RELEASEDIR)/$(LINUXDIR64)
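A note on the build changes: `all` and `release` now depend on `clean`, and `clean` additionally removes `$(RELEASEDIR)`, so both regular builds and release bundles always start from a scrubbed tree. The new `UTILITIESDIR` and `UTILITYEXTRACTOR` variables hint that the data extractor could also ship as a standalone utility, though no target using them is visible in the captured hunks.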

README.md: 11 changes

@@ -6,7 +6,7 @@ Just a simple HTML web spider with minimal dependencies. It is possible to searc
 ## Configuration
-The flow of work fully depends on the configuration file. By default `conf.json` is used as the configuration file, but the name can be changed via the `-conf` flag. The default configuration is embedded in the program, so on the first launch, or if the file is simply deleted, a new `conf.json` will be created in the same directory as the executable itself, unless the `-wDir` (working directory) flag is set to some other value.
+The flow of work fully depends on the configuration file. By default `conf.json` is used as the configuration file, but the name can be changed via the `-conf` flag. The default configuration is embedded in the program, so on the first launch, or if the file is simply deleted, a new `conf.json` will be created in the same directory as the executable itself, unless the `-wDir` (working directory) flag is set to some other value. To see all available flags, run `wecr -h`.
 The configuration is split into different branches like `requests` (how requests are made, i.e. request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string), each of which contains tweakable parameters. There are global ones as well, such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory, so no attribute-by-attribute explanation is needed for most of them.

@@ -28,12 +28,11 @@ When `is_regexp` is enabled, the `query` is treated as a regexp string and pages
 By default, if the query is not one of the special values, all matches and other data are written to the `output.json` file as separate, concatenated JSON objects, but if `save_pages` is set to `true` and/or `query` is set to `images`, `videos`, `audio`, etc., the additional contents are put in the corresponding directories inside `output_dir`, which is created by the executable itself.
-## TODO
+## Build
-- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - Done
-- Search for videos - Done
-- Search for audio - Done
-- Search for documents
+If you're on *nix - it's as easy as `make`.
+Otherwise - `go build` in the `src` directory to build `wecr`.
 ## License
 AGPLv3

src/config/config.go: 2 changes

@@ -89,7 +89,7 @@ func Default() *Conf {
 		},
 		Requests: Requests{
 			UserAgent:             "",
-			RequestWaitTimeoutMs:  1500,
+			RequestWaitTimeoutMs:  2500,
 			RequestPauseMs:        100,
 			ContentFetchTimeoutMs: 0,
 		},
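The only functional change here is a more patient default: wecr now waits up to 2.5 s (up from 1.5 s) for a response before giving up. A minimal, self-contained sketch of how millisecond fields like these are commonly applied, assuming the struct layout implied by the diff; the uint64 field types and the http.Client wiring are my illustration, since wecr's actual client code is not part of this commit:

package main

import (
	"fmt"
	"net/http"
	"time"
)

// Requests mirrors the config branch visible in the diff above;
// the uint64 millisecond types are an assumption for illustration.
type Requests struct {
	UserAgent             string
	RequestWaitTimeoutMs  uint64
	RequestPauseMs        uint64
	ContentFetchTimeoutMs uint64
}

// newClient shows one common way such millisecond fields are used:
// as an overall per-request deadline on an http.Client (0 disables it).
func newClient(conf Requests) *http.Client {
	return &http.Client{
		Timeout: time.Duration(conf.RequestWaitTimeoutMs) * time.Millisecond,
	}
}

func main() {
	client := newClient(Requests{RequestWaitTimeoutMs: 2500, RequestPauseMs: 100})
	fmt.Println(client.Timeout) // 2.5s
}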

src/main.go: 19 additions

@@ -32,6 +32,7 @@ import (
 	"time"
 	"unbewohnte/wecr/config"
 	"unbewohnte/wecr/logger"
+	"unbewohnte/wecr/utilities"
 	"unbewohnte/wecr/web"
 	"unbewohnte/wecr/worker"
 )

@@ -41,6 +42,7 @@ const version = "v0.2.1"
 const (
 	defaultConfigFile           string = "conf.json"
 	defaultOutputFile           string = "output.json"
+	defaultPrettifiedOutputFile string = "extracted_data.txt"
 )
 var (

@@ -64,6 +66,11 @@ var (
 		"Output file name to output information into",
 	)
+	extractDataFilename = flag.String(
+		"extractData", "",
+		"Set filename for output JSON file and extract data from it, put each entry nicely on a new line in a new file, then exit",
+	)
 	workingDirectory string
 	configFilePath   string
 	outputFilePath   string

@@ -113,6 +120,18 @@ func init() {
 	logger.Info("Working in \"%s\"", workingDirectory)
+	// extract data if needed
+	if strings.TrimSpace(*extractDataFilename) != "" {
+		logger.Info("Extracting data from %s...", *extractDataFilename)
+		err := utilities.ExtractDataFromOutput(*extractDataFilename, defaultPrettifiedOutputFile, "\n", false)
+		if err != nil {
+			logger.Error("Failed to extract data from %s: %s", *extractDataFilename, err)
+			os.Exit(1)
+		}
+		logger.Info("Outputted \"%s\"", defaultPrettifiedOutputFile)
+		os.Exit(0)
+	}
 	// global path to configuration file
 	configFilePath = filepath.Join(workingDirectory, *configFile)
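Taken together, the new flag turns wecr into a one-shot formatter: running `wecr -extractData output.json` decodes the given results file, writes each unique data entry on its own line to `extracted_data.txt` (the new `defaultPrettifiedOutputFile`), and exits before any crawling machinery starts. Duplicates are dropped because `ExtractDataFromOutput` is called with `keepDuplicates` set to `false`.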

src/utilities/dataextractor.go: 78 additions (new file)

@@ -0,0 +1,78 @@
/*
	Wecr - crawl the web for data
	Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU Affero General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU Affero General Public License for more details.

	You should have received a copy of the GNU Affero General Public License
	along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package utilities

import (
	"encoding/json"
	"fmt"
	"io"
	"os"

	"unbewohnte/wecr/web"
)

// Extracts data from the output JSON file and puts it in a new file with separators between each entry
func ExtractDataFromOutput(inputFilename string, outputFilename string, separator string, keepDuplicates bool) error {
	inputFile, err := os.Open(inputFilename)
	if err != nil {
		return err
	}
	defer inputFile.Close()

	outputFile, err := os.Create(outputFilename)
	if err != nil {
		return err
	}
	defer outputFile.Close()

	var processedData []string

	decoder := json.NewDecoder(inputFile)
	for {
		var result web.Result

		err := decoder.Decode(&result)
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}

		for _, dataEntry := range result.Data {
			var skip = false
			if !keepDuplicates {
				for _, processedEntry := range processedData {
					if dataEntry == processedEntry {
						skip = true
						break
					}
				}

				if skip {
					continue
				}

				processedData = append(processedData, dataEntry)
			}

			outputFile.WriteString(fmt.Sprintf("%s%s", dataEntry, separator))
		}
	}

	return nil
}
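The function leans on a convenient property of `encoding/json`: a `json.Decoder` consumes a stream of concatenated JSON objects, one `Decode` call per object, which matches the "separate, concatenated JSON objects" format the README describes for `output.json`. Below is a self-contained sketch of the same pattern; `Result` is a stand-in for `web.Result`, whose full definition is not part of this commit (only its `Data` field is used above), and the `data` JSON key is an assumption:

package main

import (
	"encoding/json"
	"fmt"
	"io"
	"strings"
)

// Result stands in for web.Result; only the Data field is implied by the diff.
type Result struct {
	Data []string `json:"data"`
}

func main() {
	// Two concatenated JSON objects, the shape wecr appends to output.json.
	stream := `{"data":["a@example.com","b@example.com"]}
{"data":["a@example.com"]}`

	decoder := json.NewDecoder(strings.NewReader(stream))
	seen := map[string]bool{}
	for {
		var r Result
		if err := decoder.Decode(&r); err == io.EOF {
			break
		} else if err != nil {
			panic(err)
		}
		for _, entry := range r.Data {
			if seen[entry] { // same effect as keepDuplicates=false above
				continue
			}
			seen[entry] = true
			fmt.Println(entry)
		}
	}
}

One design note: the committed code checks for duplicates by scanning a slice, which is quadratic in the number of entries; the sketch uses a map for constant-time lookups, a worthwhile swap if outputs grow large.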

src/web/text.go: 8 changes

@@ -34,9 +34,9 @@ var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|'
 // matches src="link" or even something along the lines of SrC = 'link'
 var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)
 // var emailRegexp *regexp.Regexp = regexp.MustCompile(`[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}`)
-var emailRegexp *regexp.Regexp = regexp.MustCompile(`[A-Za-z0-9._%+\-!%&?~^#$]+@[A-Za-z0-9.\-]+\.[a-zA-Z]{2,4}`)
+var emailRegexp *regexp.Regexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
 // var emailRegexp *regexp.Regexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")

@@ -158,8 +158,8 @@ func FindPageEmailsWithCheck(pageBody []byte) []string {
 	emailAddresses := FindPageEmails(pageBody)
 	for _, email := range emailAddresses {
-		_, err := net.LookupMX(strings.Split(email, "@")[1])
-		if err != nil {
+		mx, err := net.LookupMX(strings.Split(email, "@")[1])
+		if err != nil || len(mx) == 0 {
 			continue
 		}
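Two separate tightenings happen here: the e-mail regexp moves from an ad-hoc character class to an RFC-flavored pattern, and `FindPageEmailsWithCheck` now requires the domain to actually advertise at least one mail exchanger rather than merely not erroring on lookup. A standalone sketch of that second check; the helper name `hasMailServer` is mine, not wecr's API:

package main

import (
	"fmt"
	"net"
	"strings"
)

// hasMailServer reports whether the domain part of an email address
// advertises at least one MX record. Illustrative helper, not wecr code.
func hasMailServer(email string) bool {
	parts := strings.Split(email, "@")
	if len(parts) != 2 {
		return false
	}
	mx, err := net.LookupMX(parts[1])
	// both conditions from the diff: lookup succeeds AND returns records
	return err == nil && len(mx) > 0
}

func main() {
	fmt.Println(hasMailServer("user@gmail.com"))       // likely true
	fmt.Println(hasMailServer("user@invalid.invalid")) // false
}

Note that `net.LookupMX` performs a live DNS query, so this check needs network access and adds latency per unique domain encountered during a crawl.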
