
Moved the until-now separate text-saving code into the worker package, where it belongs

branch: master
commit 812fd2adf7
Changed files:
  1. src/dashboard/dashboard.go (18 lines changed)
  2. src/main.go (53 lines changed)
  3. src/web/text.go (2 lines changed)
  4. src/worker/worker.go (49 lines changed)
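In short, the changes below move result serialization out of main.go and into the worker package: each worker now writes its matches straight to an io.Writer chosen by query type instead of sending them back over the Results channel. A condensed sketch of the new saving path, assembled from the worker.go hunks further down (other WorkerConf fields and surrounding code omitted):

type WorkerConf struct {
	// ...existing configuration fields...
	TextOutput   io.Writer // destination for text and regexp matches
	EmailsOutput io.Writer // destination for e-mail matches
}

// saveResult appends one self-standing, indented JSON object per result.
func (w *Worker) saveResult(result web.Result) {
	// pick the output by query type so different outputs do not get mixed up
	var output io.Writer
	if result.Search.Query == config.QueryEmail {
		output = w.Conf.EmailsOutput
	} else {
		output = w.Conf.TextOutput
	}

	entryBytes, err := json.MarshalIndent(result, " ", "\t")
	if err != nil {
		return
	}
	output.Write(entryBytes)
	output.Write([]byte("\n"))
}

main.go now only opens the two output files, hands them to the pool via these fields, and waits for an interrupt signal.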

src/dashboard/dashboard.go (18 lines changed)

@@ -1,3 +1,21 @@
/*
Wecr - crawl the web for data

Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package dashboard
import (

src/main.go (53 lines changed)

@@ -19,7 +19,6 @@
package main
import (
"encoding/json"
"flag"
"fmt"
"io"
@@ -40,7 +39,7 @@ import (
"unbewohnte/wecr/worker"
)
const version = "v0.3.2"
const version = "v0.3.3"
const (
configFilename string = "conf.json"
@@ -68,7 +67,7 @@ var (
extractDataFilename = flag.String(
"extractData", "",
"Set filename for output JSON file and extract data from it, put each entry nicely on a new line in a new file, then exit",
"Specify previously outputted JSON file and extract data from it, put each entry nicely on a new line in a new file, exit afterwards",
)
workingDirectory string
@@ -321,7 +320,7 @@ func main() {
}
}
// create logs if needed
// create and redirect logs if needed
if conf.Logging.OutputLogs {
if conf.Logging.LogsFile != "" {
// output logs to a file
@@ -399,23 +398,11 @@ func main() {
VisitQueue: visitQueueFile,
Lock: &sync.Mutex{},
},
EmailsOutput: emailsOutputFile,
TextOutput: textOutputFile,
}, &statistics)
logger.Info("Created a worker pool with %d workers", conf.Workers)
// set up graceful shutdown
sig := make(chan os.Signal, 1)
signal.Notify(sig, os.Interrupt)
go func() {
<-sig
logger.Info("Received interrupt signal. Exiting...")
// stop workers
workerPool.Stop()
// close results channel
close(results)
}()
// launch concurrent scraping !
workerPool.Work()
logger.Info("Started scraping...")
@@ -441,27 +428,15 @@ func main() {
}()
}
// get text text results and write it to the output file (found files are handled by each worker separately)
var outputFile *os.File
for {
result, ok := <-results
if !ok {
break
}
// set up graceful shutdown
sig := make(chan os.Signal, 1)
signal.Notify(sig, os.Interrupt)
<-sig
logger.Info("Received interrupt signal. Exiting...")
// as it is possible to change configuration "on the fly" - it's better to not mess up different outputs
if result.Search.Query == config.QueryEmail {
outputFile = emailsOutputFile
} else {
outputFile = textOutputFile
}
// stop workers
workerPool.Stop()
// each entry in output file is a self-standing JSON object
entryBytes, err := json.MarshalIndent(result, " ", "\t")
if err != nil {
continue
}
outputFile.Write(entryBytes)
outputFile.Write([]byte("\n"))
}
// close results channel
close(results)
}

src/web/text.go (2 lines changed)

@@ -36,8 +36,6 @@ var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(
var emailRegexp *regexp.Regexp = regexp.MustCompile(`[A-Za-z0-9._%+\-!%&?~^#$]+@[A-Za-z0-9.\-]+\.[a-zA-Z]{2,4}`)
// var emailRegexp *regexp.Regexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
// Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
func ResolveLink(url *url.URL, fromHost string) string {
if !url.IsAbs() {

src/worker/worker.go (49 lines changed)

@@ -19,7 +19,9 @@
package worker
import (
"encoding/json"
"fmt"
"io"
"net/url"
"os"
"path"
@@ -46,12 +48,13 @@ type WorkerConf struct {
BlacklistedDomains []string
AllowedDomains []string
VisitQueue VisitQueue
TextOutput io.Writer
EmailsOutput io.Writer
}
// Web worker
type Worker struct {
Jobs chan web.Job
Results chan web.Result
Conf *WorkerConf
visited *visited
stats *Statistics
@@ -62,7 +65,6 @@ type Worker struct {
func NewWorker(jobs chan web.Job, results chan web.Result, conf *WorkerConf, visited *visited, stats *Statistics) Worker {
return Worker{
Jobs: jobs,
Results: results,
Conf: conf,
visited: visited,
stats: stats,
@@ -138,6 +140,25 @@ func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
}
}
func (w *Worker) saveResult(result web.Result) {
// write result to the output file
var output io.Writer
if result.Search.Query == config.QueryEmail {
output = w.Conf.EmailsOutput
} else {
output = w.Conf.TextOutput
}
// each entry in output file is a self-standing JSON object
entryBytes, err := json.MarshalIndent(result, " ", "\t")
if err != nil {
return
}
output.Write(entryBytes)
output.Write([]byte("\n"))
}
// Launch scraping process on this worker
func (w *Worker) Work() {
if w.Stopped {
@@ -319,11 +340,11 @@ func (w *Worker) Work() {
// search for email
emailAddresses := web.FindPageEmailsWithCheck(pageData)
if len(emailAddresses) > 0 {
w.Results <- web.Result{
w.saveResult(web.Result{
PageURL: job.URL,
Search: job.Search,
Data: emailAddresses,
}
})
w.stats.MatchesFound += uint64(len(emailAddresses))
savePage = true
}
@@ -339,22 +360,22 @@ func (w *Worker) Work() {
contentLinks = append(contentLinks, web.FindPageDocuments(pageData, pageURL)...)
w.saveContent(contentLinks, pageURL)
if len(contentLinks) > 0 {
savePage = true
}
// email
emailAddresses := web.FindPageEmailsWithCheck(pageData)
if len(emailAddresses) > 0 {
w.Results <- web.Result{
w.saveResult(web.Result{
PageURL: job.URL,
Search: job.Search,
Data: emailAddresses,
}
})
w.stats.MatchesFound += uint64(len(emailAddresses))
savePage = true
}
if len(contentLinks) > 0 || len(emailAddresses) > 0 {
savePage = true
}
default:
// text search
switch job.Search.IsRegexp {
@@ -368,11 +389,11 @@ func (w *Worker) Work() {
matches := web.FindPageRegexp(re, pageData)
if len(matches) > 0 {
w.Results <- web.Result{
w.saveResult(web.Result{
PageURL: job.URL,
Search: job.Search,
Data: matches,
}
})
logger.Info("Found matches: %+v", matches)
w.stats.MatchesFound += uint64(len(matches))
savePage = true
@@ -380,11 +401,11 @@ func (w *Worker) Work() {
case false:
// just text
if web.IsTextOnPage(job.Search.Query, true, pageData) {
w.Results <- web.Result{
w.saveResult(web.Result{
PageURL: job.URL,
Search: job.Search,
Data: []string{job.Search.Query},
}
})
logger.Info("Found \"%s\" on page", job.Search.Query)
w.stats.MatchesFound++
savePage = true
