From 7dbf55cb329c7783e7901b99d3e38d0840fea85d Mon Sep 17 00:00:00 2001
From: Unbewohnte <me@unbewohnte.su>
Date: Sun, 15 Jan 2023 15:36:43 +0300
Subject: [PATCH] extractData flag

---
 .gitignore                     |  3 +-
 Makefile                       |  9 ++--
 README.md                      | 11 +++--
 src/config/config.go           |  2 +-
 src/main.go                    | 23 +++++++++-
 src/utilities/dataextractor.go | 78 ++++++++++++++++++++++++++++++++++
 src/web/text.go                |  8 ++--
 7 files changed, 117 insertions(+), 17 deletions(-)
 create mode 100644 src/utilities/dataextractor.go

diff --git a/.gitignore b/.gitignore
index 1cec9a4..10565a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@ websurf
 conf_mega_ita.json
 wecr
 release/
-scraped/
\ No newline at end of file
+scraped/
+extracted_data.txt
\ No newline at end of file
diff --git a/Makefile b/Makefile
index f232d8f..528ea23 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,9 @@ TESTDIR:=testing
 RELEASEDIR:=release
 LICENSE:=COPYING
 README:=README.md
+UTILITIESDIR:=utilities
+UTILITYEXTRACTOR:=extractor
+BUILDDIR:=build
 
 LINUXDIR:=$(EXE)_linux
 WINDIR:=$(EXE)_windows
@@ -17,7 +20,7 @@ WINDIR64:=$(WINDIR)_x64
 DARWINDIR64:=$(DARWINDIR)_x64
 
 
-all:
+all: clean
 	cd $(SRCDIR) && go build && mv $(EXE) ..
 
 test: all
@@ -27,9 +30,9 @@ test: all
 	cp conf.json $(TESTDIR)
 
 clean:
-	rm -rf $(TESTDIR) $(EXE)
+	rm -rf $(TESTDIR) $(EXE) $(RELEASEDIR)
 
-release:
+release: clean
 	rm -rf $(RELEASEDIR)
 
 	mkdir -p $(RELEASEDIR)/$(LINUXDIR64)
diff --git a/README.md b/README.md
index 29e49e9..0fa4faf 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ Just a simple HTML web spider with minimal dependencies. It is possible to searc
 
 ## Configuration
 
-The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `-wDir` (working directory) flag is set to some other value.
+The flow of work fully depends on the configuration file. By default `conf.json` is used as a configuration file, but the name can be changed via `-conf` flag. The default configuration is embedded in the program so on the first launch or by simply deleting the file, a new `conf.json` will be created in the same directory as the executable itself unless the `-wDir` (working directory) flag is set to some other value. To see all available flags run `wecr -h`.
 
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them.
 
@@ -28,12 +28,11 @@ When `is_regexp` is enabled, the `query` is treated as a regexp string and pages
 
 By default, if the query is not something of special values all the matches and other data will be outputted to `output.json` file as separate continuous JSON objects, but if `save_pages` is set to `true` and|or `query` is set to `images`, `videos`, `audio`, etc. - the additional contents will be put in the corresponding directories inside `output_dir`, which is neatly created by the executable's side.
 
-## TODO
+## Build
 
-- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - Done
-- Search for videos - Done
-- Search for audio - Done
-- Search for documents
+If you're on *nix - it's as easy as `make`.
+
+Otherwise - `go build` in the `src` directory to build `wecr`. 
 
 ## License
 AGPLv3
\ No newline at end of file
diff --git a/src/config/config.go b/src/config/config.go
index 5fbd4d8..c6c7af2 100644
--- a/src/config/config.go
+++ b/src/config/config.go
@@ -89,7 +89,7 @@ func Default() *Conf {
 		},
 		Requests: Requests{
 			UserAgent:             "",
-			RequestWaitTimeoutMs:  1500,
+			RequestWaitTimeoutMs:  2500,
 			RequestPauseMs:        100,
 			ContentFetchTimeoutMs: 0,
 		},
diff --git a/src/main.go b/src/main.go
index 7b718ca..204ac5c 100644
--- a/src/main.go
+++ b/src/main.go
@@ -32,6 +32,7 @@ import (
 	"time"
 	"unbewohnte/wecr/config"
 	"unbewohnte/wecr/logger"
+	"unbewohnte/wecr/utilities"
 	"unbewohnte/wecr/web"
 	"unbewohnte/wecr/worker"
 )
@@ -39,8 +40,9 @@ import (
 const version = "v0.2.1"
 
 const (
-	defaultConfigFile string = "conf.json"
-	defaultOutputFile string = "output.json"
+	defaultConfigFile           string = "conf.json"
+	defaultOutputFile           string = "output.json"
+	defaultPrettifiedOutputFile string = "extracted_data.txt"
 )
 
 var (
@@ -64,6 +66,11 @@ var (
 		"Output file name to output information into",
 	)
 
+	extractDataFilename = flag.String(
+		"extractData", "",
+		"Set filename for output JSON file and extract data from it, put each entry nicely on a new line in a new file, then exit",
+	)
+
 	workingDirectory string
 	configFilePath   string
 	outputFilePath   string
@@ -113,6 +120,18 @@ func init() {
 
 	logger.Info("Working in \"%s\"", workingDirectory)
 
+	// extract data if needed
+	if strings.TrimSpace(*extractDataFilename) != "" {
+		logger.Info("Extracting data from %s...", *extractDataFilename)
+		err := utilities.ExtractDataFromOutput(*extractDataFilename, defaultPrettifiedOutputFile, "\n", false)
+		if err != nil {
+			logger.Error("Failed to extract data from %s: %s", *extractDataFilename, err)
+			os.Exit(1)
+		}
+		logger.Info("Outputted \"%s\"", defaultPrettifiedOutputFile)
+		os.Exit(0)
+	}
+
 	// global path to configuration file
 	configFilePath = filepath.Join(workingDirectory, *configFile)
 
diff --git a/src/utilities/dataextractor.go b/src/utilities/dataextractor.go
new file mode 100644
index 0000000..338a8cd
--- /dev/null
+++ b/src/utilities/dataextractor.go
@@ -0,0 +1,78 @@
+/*
+	Wecr - crawl the web for data
+	Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+
+	This program is free software: you can redistribute it and/or modify
+	it under the terms of the GNU Affero General Public License as published by
+	the Free Software Foundation, either version 3 of the License, or
+	(at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU Affero General Public License for more details.
+
+	You should have received a copy of the GNU Affero General Public License
+	along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+package utilities
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"os"
+	"unbewohnte/wecr/web"
+)
+
+// ExtractDataFromOutput reads the concatenated JSON results stored in inputFilename and writes
+// every entry of their Data field to outputFilename, each one followed by separator. Unless
+// keepDuplicates is true, an entry is written only the first time it is encountered. The output
+// file is created (or truncated) before decoding starts; decode and write errors are returned.
+func ExtractDataFromOutput(inputFilename string, outputFilename string, separator string, keepDuplicates bool) error {
+	inputFile, err := os.Open(inputFilename)
+	if err != nil {
+		return err
+	}
+	defer inputFile.Close()
+
+	outputFile, err := os.Create(outputFilename)
+	if err != nil {
+		return err
+	}
+	defer outputFile.Close()
+
+	// set of entries written so far; only consulted when duplicates must be dropped
+	seen := make(map[string]struct{})
+
+	decoder := json.NewDecoder(inputFile)
+	for {
+		var result web.Result
+
+		err := decoder.Decode(&result)
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return err
+		}
+
+		for _, dataEntry := range result.Data {
+			if !keepDuplicates {
+				if _, duplicate := seen[dataEntry]; duplicate {
+					continue
+				}
+				seen[dataEntry] = struct{}{}
+			}
+
+			// a failed write must be reported, otherwise the output is silently truncated
+			if _, err := fmt.Fprintf(outputFile, "%s%s", dataEntry, separator); err != nil {
+				return err
+			}
+		}
+	}
+
+	// report a failed close too; the deferred Close above then has nothing left to do
+	return outputFile.Close()
+}
diff --git a/src/web/text.go b/src/web/text.go
index e52f58d..c032015 100644
--- a/src/web/text.go
+++ b/src/web/text.go
@@ -34,9 +34,9 @@ var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|'
 // matches src="link" or even something along the lines of SrC    =  'link'
 var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)
 
-// var emailRegexp *regexp.Regexp = regexp.MustCompile(`[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}`)
+var emailRegexp *regexp.Regexp = regexp.MustCompile(`[A-Za-z0-9._%+\-!%&?~^#$]+@[A-Za-z0-9.\-]+\.[a-zA-Z]{2,4}`)
 
-var emailRegexp *regexp.Regexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
+// var emailRegexp *regexp.Regexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
 
 // Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
 func ResolveLink(url *url.URL, fromHost string) string {
@@ -158,8 +158,8 @@ func FindPageEmailsWithCheck(pageBody []byte) []string {
 
 	emailAddresses := FindPageEmails(pageBody)
 	for _, email := range emailAddresses {
-		_, err := net.LookupMX(strings.Split(email, "@")[1])
-		if err != nil {
+		mx, err := net.LookupMX(strings.Split(email, "@")[1])
+		if err != nil || len(mx) == 0 {
 			continue
 		}