From 812fd2adf7c079f8c61b1bb6b0c3077e70dea5e3 Mon Sep 17 00:00:00 2001
From: Unbewohnte
Date: Tue, 14 Feb 2023 19:03:57 +0300
Subject: [PATCH] Moved up until now separate text saving code to the worker package where it should be

---
 src/dashboard/dashboard.go | 18 +++++++++++++
 src/main.go                | 53 ++++++++++----------------------------
 src/web/text.go            |  2 --
 src/worker/worker.go       | 49 +++++++++++++++++++++++++----------
 4 files changed, 67 insertions(+), 55 deletions(-)

diff --git a/src/dashboard/dashboard.go b/src/dashboard/dashboard.go
index 5f941dd..303d751 100644
--- a/src/dashboard/dashboard.go
+++ b/src/dashboard/dashboard.go
@@ -1,3 +1,21 @@
+/*
+	Wecr - crawl the web for data
+	Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+
+	This program is free software: you can redistribute it and/or modify
+	it under the terms of the GNU Affero General Public License as published by
+	the Free Software Foundation, either version 3 of the License, or
+	(at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+	GNU Affero General Public License for more details.
+
+	You should have received a copy of the GNU Affero General Public License
+	along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
 package dashboard
 
 import (
diff --git a/src/main.go b/src/main.go
index 1888573..201cddd 100644
--- a/src/main.go
+++ b/src/main.go
@@ -19,7 +19,6 @@
 package main
 
 import (
-	"encoding/json"
 	"flag"
 	"fmt"
 	"io"
@@ -40,7 +39,7 @@ import (
 	"unbewohnte/wecr/worker"
 )
 
-const version = "v0.3.2"
+const version = "v0.3.3"
 
 const (
 	configFilename string = "conf.json"
@@ -68,7 +67,7 @@ var (
 	extractDataFilename = flag.String(
 		"extractData",
 		"",
-		"Set filename for output JSON file and extract data from it, put each entry nicely on a new line in a new file, then exit",
+		"Specify previously outputted JSON file and extract data from it, put each entry nicely on a new line in a new file, exit afterwards",
 	)
 
 	workingDirectory string
@@ -321,7 +320,7 @@ func main() {
 		}
 	}
 
-	// create logs if needed
+	// create and redirect logs if needed
 	if conf.Logging.OutputLogs {
 		if conf.Logging.LogsFile != "" {
 			// output logs to a file
@@ -399,23 +398,11 @@ func main() {
 			VisitQueue: visitQueueFile,
 			Lock:       &sync.Mutex{},
 		},
+		EmailsOutput: emailsOutputFile,
+		TextOutput:   textOutputFile,
 	}, &statistics)
 	logger.Info("Created a worker pool with %d workers", conf.Workers)
 
-	// set up graceful shutdown
-	sig := make(chan os.Signal, 1)
-	signal.Notify(sig, os.Interrupt)
-	go func() {
-		<-sig
-		logger.Info("Received interrupt signal. Exiting...")
-
-		// stop workers
-		workerPool.Stop()
-
-		// close results channel
-		close(results)
-	}()
-
 	// launch concurrent scraping !
 	workerPool.Work()
 	logger.Info("Started scraping...")
@@ -441,27 +428,15 @@ func main() {
 		}()
 	}
 
-	// get text text results and write it to the output file (found files are handled by each worker separately)
-	var outputFile *os.File
-	for {
-		result, ok := <-results
-		if !ok {
-			break
-		}
+	// set up graceful shutdown
+	sig := make(chan os.Signal, 1)
+	signal.Notify(sig, os.Interrupt)
+	<-sig
+	logger.Info("Received interrupt signal. Exiting...")
 
-		// as it is possible to change configuration "on the fly" - it's better to not mess up different outputs
-		if result.Search.Query == config.QueryEmail {
-			outputFile = emailsOutputFile
-		} else {
-			outputFile = textOutputFile
-		}
+	// stop workers
+	workerPool.Stop()
 
-		// each entry in output file is a self-standing JSON object
-		entryBytes, err := json.MarshalIndent(result, " ", "\t")
-		if err != nil {
-			continue
-		}
-		outputFile.Write(entryBytes)
-		outputFile.Write([]byte("\n"))
-	}
+	// close results channel
+	close(results)
 }
diff --git a/src/web/text.go b/src/web/text.go
index c032015..594d28e 100644
--- a/src/web/text.go
+++ b/src/web/text.go
@@ -36,8 +36,6 @@ var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(
 
 var emailRegexp *regexp.Regexp = regexp.MustCompile(`[A-Za-z0-9._%+\-!%&?~^#$]+@[A-Za-z0-9.\-]+\.[a-zA-Z]{2,4}`)
 
-// var emailRegexp *regexp.Regexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
-
 // Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
 func ResolveLink(url *url.URL, fromHost string) string {
 	if !url.IsAbs() {
diff --git a/src/worker/worker.go b/src/worker/worker.go
index 078900d..f25d20e 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -19,7 +19,9 @@
 package worker
 
 import (
+	"encoding/json"
 	"fmt"
+	"io"
 	"net/url"
 	"os"
 	"path"
@@ -46,12 +48,13 @@ type WorkerConf struct {
 	BlacklistedDomains []string
 	AllowedDomains     []string
 	VisitQueue         VisitQueue
+	TextOutput         io.Writer
+	EmailsOutput       io.Writer
 }
 
 // Web worker
 type Worker struct {
 	Jobs    chan web.Job
-	Results chan web.Result
 	Conf    *WorkerConf
 	visited *visited
 	stats   *Statistics
@@ -62,7 +65,6 @@ type Worker struct {
 func NewWorker(jobs chan web.Job, results chan web.Result, conf *WorkerConf, visited *visited, stats *Statistics) Worker {
 	return Worker{
 		Jobs:    jobs,
-		Results: results,
 		Conf:    conf,
 		visited: visited,
 		stats:   stats,
@@ -138,6 +140,25 @@ func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
 	}
 }
 
+func (w *Worker) saveResult(result web.Result) {
+	// write result to the output file
+
+	var output io.Writer
+	if result.Search.Query == config.QueryEmail {
+		output = w.Conf.EmailsOutput
+	} else {
+		output = w.Conf.TextOutput
+	}
+
+	// each entry in output file is a self-standing JSON object
+	entryBytes, err := json.MarshalIndent(result, " ", "\t")
+	if err != nil {
+		return
+	}
+	output.Write(entryBytes)
+	output.Write([]byte("\n"))
+}
+
 // Launch scraping process on this worker
 func (w *Worker) Work() {
 	if w.Stopped {
@@ -319,11 +340,11 @@ func (w *Worker) Work() {
 			// search for email
 			emailAddresses := web.FindPageEmailsWithCheck(pageData)
 			if len(emailAddresses) > 0 {
-				w.Results <- web.Result{
+				w.saveResult(web.Result{
 					PageURL: job.URL,
 					Search:  job.Search,
 					Data:    emailAddresses,
-				}
+				})
 				w.stats.MatchesFound += uint64(len(emailAddresses))
 				savePage = true
 			}
@@ -339,22 +360,22 @@ func (w *Worker) Work() {
 			contentLinks = append(contentLinks, web.FindPageDocuments(pageData, pageURL)...)
 			w.saveContent(contentLinks, pageURL)
 
+			if len(contentLinks) > 0 {
+				savePage = true
+			}
+
 			// email
 			emailAddresses := web.FindPageEmailsWithCheck(pageData)
 			if len(emailAddresses) > 0 {
-				w.Results <- web.Result{
+				w.saveResult(web.Result{
 					PageURL: job.URL,
 					Search:  job.Search,
 					Data:    emailAddresses,
-				}
+				})
 				w.stats.MatchesFound += uint64(len(emailAddresses))
 				savePage = true
 			}
 
-			if len(contentLinks) > 0 || len(emailAddresses) > 0 {
-				savePage = true
-			}
-
 		default:
 			// text search
 			switch job.Search.IsRegexp {
@@ -368,11 +389,11 @@ func (w *Worker) Work() {
 
 				matches := web.FindPageRegexp(re, pageData)
 				if len(matches) > 0 {
-					w.Results <- web.Result{
+					w.saveResult(web.Result{
 						PageURL: job.URL,
 						Search:  job.Search,
 						Data:    matches,
-					}
+					})
 					logger.Info("Found matches: %+v", matches)
 					w.stats.MatchesFound += uint64(len(matches))
 					savePage = true
@@ -380,11 +401,11 @@ func (w *Worker) Work() {
 			case false:
 				// just text
 				if web.IsTextOnPage(job.Search.Query, true, pageData) {
-					w.Results <- web.Result{
+					w.saveResult(web.Result{
 						PageURL: job.URL,
 						Search:  job.Search,
 						Data:    []string{job.Search.Query},
-					}
+					})
 					logger.Info("Found \"%s\" on page", job.Search.Query)
 					w.stats.MatchesFound++
 					savePage = true
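
For reference (not part of the patch): a minimal standalone sketch of the per-entry serialization that the new worker.saveResult method performs. The Result struct and writeResult helper below are hypothetical stand-ins for web.Result and the worker method, but the json.MarshalIndent arguments and the trailing newline mirror the patch, so each match lands in the output as a self-standing, indented JSON object on its own "record".

package main

import (
	"encoding/json"
	"io"
	"log"
	"os"
)

// Result is a hypothetical stand-in for web.Result, with just enough
// fields to demonstrate the output format.
type Result struct {
	PageURL string   `json:"page_url"`
	Query   string   `json:"query"`
	Data    []string `json:"data"`
}

// writeResult mirrors what saveResult does in the patch: marshal the entry
// as an indented, self-standing JSON object and terminate it with a newline.
func writeResult(output io.Writer, result Result) error {
	entryBytes, err := json.MarshalIndent(result, " ", "\t")
	if err != nil {
		return err
	}
	if _, err := output.Write(entryBytes); err != nil {
		return err
	}
	_, err = output.Write([]byte("\n"))
	return err
}

func main() {
	// Writing to os.Stdout here; the worker instead writes to the files
	// handed to it via WorkerConf.TextOutput and WorkerConf.EmailsOutput.
	err := writeResult(os.Stdout, Result{
		PageURL: "https://example.org",
		Query:   "email",
		Data:    []string{"someone@example.org"},
	})
	if err != nil {
		log.Fatal(err)
	}
}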