
Moved the previously separate text-saving code into the worker package, where it belongs

Branch: master
Commit: 812fd2adf7

Changed files:
  1. src/dashboard/dashboard.go (18 lines changed)
  2. src/main.go (53 lines changed)
  3. src/web/text.go (2 lines changed)
  4. src/worker/worker.go (49 lines changed)
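The thrust of the change: results no longer flow through a channel that main drains and writes out; each worker now carries io.Writer outputs in its WorkerConf and serializes matches itself. A minimal, self-contained sketch of that pattern follows; Result and the "email" query literal are simplified stand-ins for the real web.Result and config.QueryEmail.

package main

import (
    "encoding/json"
    "io"
    "os"
)

// Result stands in for web.Result from the diffs below.
type Result struct {
    PageURL string   `json:"page_url"`
    Query   string   `json:"query"`
    Data    []string `json:"data"`
}

// saveResult mirrors the worker-side logic this commit introduces:
// pick an output writer by query type and append one JSON object per entry.
func saveResult(result Result, textOut, emailsOut io.Writer) {
    output := textOut
    if result.Query == "email" { // config.QueryEmail in the real code
        output = emailsOut
    }

    entryBytes, err := json.MarshalIndent(result, " ", "\t")
    if err != nil {
        return
    }
    output.Write(entryBytes)
    output.Write([]byte("\n"))
}

func main() {
    // The caller only supplies the writers; nothing drains a results channel.
    saveResult(Result{
        PageURL: "https://example.org",
        Query:   "email",
        Data:    []string{"someone@example.org"},
    }, os.Stdout, os.Stdout)
}

The diffs below show the same logic landing in src/worker/worker.go as saveResult, with src/main.go only passing the two output files into the WorkerConf.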

src/dashboard/dashboard.go (18 lines changed)

@@ -1,3 +1,21 @@
+/*
+    Wecr - crawl the web for data
+
+    Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
 package dashboard
 
 import (

src/main.go (53 lines changed)

@@ -19,7 +19,6 @@
 package main
 
 import (
-    "encoding/json"
     "flag"
    "fmt"
    "io"
@@ -40,7 +39,7 @@ import (
    "unbewohnte/wecr/worker"
 )
 
-const version = "v0.3.2"
+const version = "v0.3.3"
 
 const (
    configFilename string = "conf.json"
@@ -68,7 +67,7 @@ var (
    extractDataFilename = flag.String(
        "extractData", "",
-        "Set filename for output JSON file and extract data from it, put each entry nicely on a new line in a new file, then exit",
+        "Specify previously outputted JSON file and extract data from it, put each entry nicely on a new line in a new file, exit afterwards",
    )
 
    workingDirectory string
@@ -321,7 +320,7 @@ func main() {
        }
    }
 
-    // create logs if needed
+    // create and redirect logs if needed
    if conf.Logging.OutputLogs {
        if conf.Logging.LogsFile != "" {
            // output logs to a file
@@ -399,23 +398,11 @@ func main() {
            VisitQueue: visitQueueFile,
            Lock:       &sync.Mutex{},
        },
+        EmailsOutput: emailsOutputFile,
+        TextOutput:   textOutputFile,
    }, &statistics)
    logger.Info("Created a worker pool with %d workers", conf.Workers)
 
-    // set up graceful shutdown
-    sig := make(chan os.Signal, 1)
-    signal.Notify(sig, os.Interrupt)
-    go func() {
-        <-sig
-        logger.Info("Received interrupt signal. Exiting...")
-
-        // stop workers
-        workerPool.Stop()
-
-        // close results channel
-        close(results)
-    }()
-
    // launch concurrent scraping !
    workerPool.Work()
    logger.Info("Started scraping...")
@@ -441,27 +428,15 @@ func main() {
        }()
    }
 
-    // get text text results and write it to the output file (found files are handled by each worker separately)
-    var outputFile *os.File
-    for {
-        result, ok := <-results
-        if !ok {
-            break
-        }
-
-        // as it is possible to change configuration "on the fly" - it's better to not mess up different outputs
-        if result.Search.Query == config.QueryEmail {
-            outputFile = emailsOutputFile
-        } else {
-            outputFile = textOutputFile
-        }
-
-        // each entry in output file is a self-standing JSON object
-        entryBytes, err := json.MarshalIndent(result, " ", "\t")
-        if err != nil {
-            continue
-        }
-        outputFile.Write(entryBytes)
-        outputFile.Write([]byte("\n"))
-    }
+    // set up graceful shutdown
+    sig := make(chan os.Signal, 1)
+    signal.Notify(sig, os.Interrupt)
+    <-sig
+    logger.Info("Received interrupt signal. Exiting...")
+
+    // stop workers
+    workerPool.Stop()
+
+    // close results channel
+    close(results)
 }
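A side effect visible in the last hunk: since main no longer blocks on a results-reading loop, the graceful-shutdown handling drops its goroutine and simply becomes the tail of main. A minimal runnable illustration of that wait-for-interrupt pattern; the pool stop and channel close from the diff are only hinted at in the comment.

package main

import (
    "fmt"
    "os"
    "os/signal"
)

func main() {
    // After this commit, main's last act is to wait for an interrupt;
    // stopping the worker pool and closing the results channel follow in the real code.
    sig := make(chan os.Signal, 1)
    signal.Notify(sig, os.Interrupt)
    <-sig
    fmt.Println("Received interrupt signal. Exiting...")
}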

src/web/text.go (2 lines changed)

@@ -36,8 +36,6 @@ var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(
 var emailRegexp *regexp.Regexp = regexp.MustCompile(`[A-Za-z0-9._%+\-!%&?~^#$]+@[A-Za-z0-9.\-]+\.[a-zA-Z]{2,4}`)
 
-// var emailRegexp *regexp.Regexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
-
 // Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
 func ResolveLink(url *url.URL, fromHost string) string {
    if !url.IsAbs() {

src/worker/worker.go (49 lines changed)

@@ -19,7 +19,9 @@
 package worker
 
 import (
+    "encoding/json"
     "fmt"
+    "io"
    "net/url"
    "os"
    "path"
@@ -46,12 +48,13 @@ type WorkerConf struct {
    BlacklistedDomains []string
    AllowedDomains     []string
    VisitQueue         VisitQueue
+    TextOutput         io.Writer
+    EmailsOutput       io.Writer
 }
 
 // Web worker
 type Worker struct {
    Jobs    chan web.Job
-    Results chan web.Result
    Conf    *WorkerConf
    visited *visited
    stats   *Statistics
@@ -62,7 +65,6 @@ type Worker struct {
 func NewWorker(jobs chan web.Job, results chan web.Result, conf *WorkerConf, visited *visited, stats *Statistics) Worker {
    return Worker{
        Jobs:    jobs,
-        Results: results,
        Conf:    conf,
        visited: visited,
        stats:   stats,
@@ -138,6 +140,25 @@ func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
    }
 }
 
+func (w *Worker) saveResult(result web.Result) {
+    // write result to the output file
+    var output io.Writer
+    if result.Search.Query == config.QueryEmail {
+        output = w.Conf.EmailsOutput
+    } else {
+        output = w.Conf.TextOutput
+    }
+
+    // each entry in output file is a self-standing JSON object
+    entryBytes, err := json.MarshalIndent(result, " ", "\t")
+    if err != nil {
+        return
+    }
+
+    output.Write(entryBytes)
+    output.Write([]byte("\n"))
+}
+
 // Launch scraping process on this worker
 func (w *Worker) Work() {
    if w.Stopped {
@@ -319,11 +340,11 @@
            // search for email
            emailAddresses := web.FindPageEmailsWithCheck(pageData)
            if len(emailAddresses) > 0 {
-                w.Results <- web.Result{
+                w.saveResult(web.Result{
                    PageURL: job.URL,
                    Search:  job.Search,
                    Data:    emailAddresses,
-                }
+                })
                w.stats.MatchesFound += uint64(len(emailAddresses))
                savePage = true
            }
@@ -339,22 +360,22 @@
            contentLinks = append(contentLinks, web.FindPageDocuments(pageData, pageURL)...)
            w.saveContent(contentLinks, pageURL)
 
-            if len(contentLinks) > 0 {
-                savePage = true
-            }
-
            // email
            emailAddresses := web.FindPageEmailsWithCheck(pageData)
            if len(emailAddresses) > 0 {
-                w.Results <- web.Result{
+                w.saveResult(web.Result{
                    PageURL: job.URL,
                    Search:  job.Search,
                    Data:    emailAddresses,
-                }
+                })
                w.stats.MatchesFound += uint64(len(emailAddresses))
                savePage = true
            }
 
+            if len(contentLinks) > 0 || len(emailAddresses) > 0 {
+                savePage = true
+            }
+
        default:
            // text search
            switch job.Search.IsRegexp {
@@ -368,11 +389,11 @@
                matches := web.FindPageRegexp(re, pageData)
                if len(matches) > 0 {
-                    w.Results <- web.Result{
+                    w.saveResult(web.Result{
                        PageURL: job.URL,
                        Search:  job.Search,
                        Data:    matches,
-                    }
+                    })
                    logger.Info("Found matches: %+v", matches)
                    w.stats.MatchesFound += uint64(len(matches))
                    savePage = true
@@ -380,11 +401,11 @@
            case false:
                // just text
                if web.IsTextOnPage(job.Search.Query, true, pageData) {
-                    w.Results <- web.Result{
+                    w.saveResult(web.Result{
                        PageURL: job.URL,
                        Search:  job.Search,
                        Data:    []string{job.Search.Query},
-                    }
+                    })
                    logger.Info("Found \"%s\" on page", job.Search.Query)
                    w.stats.MatchesFound++
                    savePage = true
