From 812fd2adf7c079f8c61b1bb6b0c3077e70dea5e3 Mon Sep 17 00:00:00 2001
From: Unbewohnte
Date: Tue, 14 Feb 2023 19:03:57 +0300
Subject: [PATCH] Moved up until now separate text saving code to the worker package where it should be

---
 src/dashboard/dashboard.go | 18 +++++++++++++
 src/main.go                | 53 ++++++++++----------------------------
 src/web/text.go            |  2 --
 src/worker/worker.go       | 49 +++++++++++++++++++++++++----------
 4 files changed, 67 insertions(+), 55 deletions(-)

diff --git a/src/dashboard/dashboard.go b/src/dashboard/dashboard.go
index 5f941dd..303d751 100644
--- a/src/dashboard/dashboard.go
+++ b/src/dashboard/dashboard.go
@@ -1,3 +1,21 @@
+/*
+	Wecr - crawl the web for data
+	Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+
+	This program is free software: you can redistribute it and/or modify
+	it under the terms of the GNU Affero General Public License as published by
+	the Free Software Foundation, either version 3 of the License, or
+	(at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+	GNU Affero General Public License for more details.
+
+	You should have received a copy of the GNU Affero General Public License
+	along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
 package dashboard
 
 import (
diff --git a/src/main.go b/src/main.go
index 1888573..201cddd 100644
--- a/src/main.go
+++ b/src/main.go
@@ -19,7 +19,6 @@
 package main
 
 import (
-	"encoding/json"
 	"flag"
 	"fmt"
 	"io"
@@ -40,7 +39,7 @@ import (
 	"unbewohnte/wecr/worker"
 )
 
-const version = "v0.3.2"
+const version = "v0.3.3"
 
 const (
 	configFilename string = "conf.json"
@@ -68,7 +67,7 @@ var (
 	extractDataFilename = flag.String(
 		"extractData",
 		"",
-		"Set filename for output JSON file and extract data from it, put each entry nicely on a new line in a new file, then exit",
+		"Specify previously outputted JSON file and extract data from it, put each entry nicely on a new line in a new file, exit afterwards",
 	)
 
 	workingDirectory string
@@ -321,7 +320,7 @@ func main() {
 		}
 	}
 
-	// create logs if needed
+	// create and redirect logs if needed
 	if conf.Logging.OutputLogs {
 		if conf.Logging.LogsFile != "" {
 			// output logs to a file
@@ -399,23 +398,11 @@ func main() {
 			VisitQueue: visitQueueFile,
 			Lock:       &sync.Mutex{},
 		},
+		EmailsOutput: emailsOutputFile,
+		TextOutput:   textOutputFile,
 	}, &statistics)
 	logger.Info("Created a worker pool with %d workers", conf.Workers)
 
-	// set up graceful shutdown
-	sig := make(chan os.Signal, 1)
-	signal.Notify(sig, os.Interrupt)
-	go func() {
-		<-sig
-		logger.Info("Received interrupt signal. Exiting...")
-
-		// stop workers
-		workerPool.Stop()
-
-		// close results channel
-		close(results)
-	}()
-
 	// launch concurrent scraping !
 	workerPool.Work()
 	logger.Info("Started scraping...")
@@ -441,27 +428,15 @@ func main() {
 		}()
 	}
 
-	// get text text results and write it to the output file (found files are handled by each worker separately)
-	var outputFile *os.File
-	for {
-		result, ok := <-results
-		if !ok {
-			break
-		}
+	// set up graceful shutdown
+	sig := make(chan os.Signal, 1)
+	signal.Notify(sig, os.Interrupt)
+	<-sig
+	logger.Info("Received interrupt signal. Exiting...")
 
-		// as it is possible to change configuration "on the fly" - it's better to not mess up different outputs
-		if result.Search.Query == config.QueryEmail {
-			outputFile = emailsOutputFile
-		} else {
-			outputFile = textOutputFile
-		}
+	// stop workers
+	workerPool.Stop()
 
-		// each entry in output file is a self-standing JSON object
-		entryBytes, err := json.MarshalIndent(result, " ", "\t")
-		if err != nil {
-			continue
-		}
-		outputFile.Write(entryBytes)
-		outputFile.Write([]byte("\n"))
-	}
+	// close results channel
+	close(results)
 }
diff --git a/src/web/text.go b/src/web/text.go
index c032015..594d28e 100644
--- a/src/web/text.go
+++ b/src/web/text.go
@@ -36,8 +36,6 @@ var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(
 
 var emailRegexp *regexp.Regexp = regexp.MustCompile(`[A-Za-z0-9._%+\-!%&?~^#$]+@[A-Za-z0-9.\-]+\.[a-zA-Z]{2,4}`)
 
-// var emailRegexp *regexp.Regexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
-
 // Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
 func ResolveLink(url *url.URL, fromHost string) string {
 	if !url.IsAbs() {
diff --git a/src/worker/worker.go b/src/worker/worker.go
index 078900d..f25d20e 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -19,7 +19,9 @@
 package worker
 
 import (
+	"encoding/json"
 	"fmt"
+	"io"
 	"net/url"
 	"os"
 	"path"
@@ -46,12 +48,13 @@ type WorkerConf struct {
 	BlacklistedDomains []string
 	AllowedDomains     []string
 	VisitQueue         VisitQueue
+	TextOutput         io.Writer
+	EmailsOutput       io.Writer
 }
 
 // Web worker
 type Worker struct {
 	Jobs    chan web.Job
-	Results chan web.Result
 	Conf    *WorkerConf
 	visited *visited
 	stats   *Statistics
@@ -62,7 +65,6 @@ type Worker struct {
 func NewWorker(jobs chan web.Job, results chan web.Result, conf *WorkerConf, visited *visited, stats *Statistics) Worker {
 	return Worker{
 		Jobs:    jobs,
-		Results: results,
 		Conf:    conf,
 		visited: visited,
 		stats:   stats,
@@ -138,6 +140,25 @@ func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
 	}
 }
 
+func (w *Worker) saveResult(result web.Result) {
+	// write result to the output file
+
+	var output io.Writer
+	if result.Search.Query == config.QueryEmail {
+		output = w.Conf.EmailsOutput
+	} else {
+		output = w.Conf.TextOutput
+	}
+
+	// each entry in output file is a self-standing JSON object
+	entryBytes, err := json.MarshalIndent(result, " ", "\t")
+	if err != nil {
+		return
+	}
+	output.Write(entryBytes)
+	output.Write([]byte("\n"))
+}
+
 // Launch scraping process on this worker
 func (w *Worker) Work() {
 	if w.Stopped {
@@ -319,11 +340,11 @@ func (w *Worker) Work() {
 			// search for email
 			emailAddresses := web.FindPageEmailsWithCheck(pageData)
 			if len(emailAddresses) > 0 {
-				w.Results <- web.Result{
+				w.saveResult(web.Result{
 					PageURL: job.URL,
 					Search:  job.Search,
 					Data:    emailAddresses,
-				}
+				})
 				w.stats.MatchesFound += uint64(len(emailAddresses))
 				savePage = true
 			}
@@ -339,22 +360,22 @@ func (w *Worker) Work() {
 			contentLinks = append(contentLinks, web.FindPageDocuments(pageData, pageURL)...)
 			w.saveContent(contentLinks, pageURL)
 
+			if len(contentLinks) > 0 {
+				savePage = true
+			}
+
 			// email
 			emailAddresses := web.FindPageEmailsWithCheck(pageData)
 			if len(emailAddresses) > 0 {
-				w.Results <- web.Result{
+				w.saveResult(web.Result{
 					PageURL: job.URL,
 					Search:  job.Search,
 					Data:    emailAddresses,
-				}
+				})
 				w.stats.MatchesFound += uint64(len(emailAddresses))
 				savePage = true
 			}
 
-			if len(contentLinks) > 0 || len(emailAddresses) > 0 {
-				savePage = true
-			}
-
 		default:
 			// text search
 			switch job.Search.IsRegexp {
@@ -368,11 +389,11 @@ func (w *Worker) Work() {
 
 				matches := web.FindPageRegexp(re, pageData)
 				if len(matches) > 0 {
-					w.Results <- web.Result{
+					w.saveResult(web.Result{
 						PageURL: job.URL,
 						Search:  job.Search,
 						Data:    matches,
-					}
+					})
 					logger.Info("Found matches: %+v", matches)
 					w.stats.MatchesFound += uint64(len(matches))
 					savePage = true
@@ -380,11 +401,11 @@ func (w *Worker) Work() {
 			case false:
 				// just text
 				if web.IsTextOnPage(job.Search.Query, true, pageData) {
-					w.Results <- web.Result{
+					w.saveResult(web.Result{
 						PageURL: job.URL,
 						Search:  job.Search,
 						Data:    []string{job.Search.Query},
-					}
+					})
 					logger.Info("Found \"%s\" on page", job.Search.Query)
 					w.stats.MatchesFound++
 					savePage = true
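
For reference (not part of the patch): a minimal standalone sketch of the per-entry serialization that the new worker.saveResult method performs. The Result struct and writeResult helper below are hypothetical stand-ins for web.Result and the worker method, but the json.MarshalIndent arguments and the trailing newline mirror the patch, so each match lands in the output as a self-standing, indented JSON object on its own "record".

package main

import (
	"encoding/json"
	"io"
	"log"
	"os"
)

// Result is a hypothetical stand-in for web.Result, with just enough
// fields to demonstrate the output format.
type Result struct {
	PageURL string   `json:"page_url"`
	Query   string   `json:"query"`
	Data    []string `json:"data"`
}

// writeResult mirrors what saveResult does in the patch: marshal the entry
// as an indented, self-standing JSON object and terminate it with a newline.
func writeResult(output io.Writer, result Result) error {
	entryBytes, err := json.MarshalIndent(result, " ", "\t")
	if err != nil {
		return err
	}
	if _, err := output.Write(entryBytes); err != nil {
		return err
	}
	_, err = output.Write([]byte("\n"))
	return err
}

func main() {
	// Writing to os.Stdout here; the worker instead writes to the files
	// handed to it via WorkerConf.TextOutput and WorkerConf.EmailsOutput.
	err := writeResult(os.Stdout, Result{
		PageURL: "https://example.org",
		Query:   "email",
		Data:    []string{"someone@example.org"},
	})
	if err != nil {
		log.Fatal(err)
	}
}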