|
|
@ -24,13 +24,12 @@ import ( |
|
|
|
"fmt" |
|
|
|
"fmt" |
|
|
|
"io" |
|
|
|
"io" |
|
|
|
"log" |
|
|
|
"log" |
|
|
|
"net/http" |
|
|
|
|
|
|
|
_ "net/http/pprof" |
|
|
|
|
|
|
|
"net/url" |
|
|
|
"net/url" |
|
|
|
"os" |
|
|
|
"os" |
|
|
|
"os/signal" |
|
|
|
"os/signal" |
|
|
|
"path/filepath" |
|
|
|
"path/filepath" |
|
|
|
"strings" |
|
|
|
"strings" |
|
|
|
|
|
|
|
"sync" |
|
|
|
"time" |
|
|
|
"time" |
|
|
|
"unbewohnte/wecr/config" |
|
|
|
"unbewohnte/wecr/config" |
|
|
|
"unbewohnte/wecr/logger" |
|
|
|
"unbewohnte/wecr/logger" |
|
|
@ -39,12 +38,13 @@ import ( |
|
|
|
"unbewohnte/wecr/worker" |
|
|
|
"unbewohnte/wecr/worker" |
|
|
|
) |
|
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
const version = "v0.2.3" |
|
|
|
const version = "v0.2.4" |
|
|
|
|
|
|
|
|
|
|
|
const ( |
|
|
|
const ( |
|
|
|
defaultConfigFile string = "conf.json" |
|
|
|
defaultConfigFile string = "conf.json" |
|
|
|
defaultOutputFile string = "output.json" |
|
|
|
defaultOutputFile string = "output.json" |
|
|
|
defaultPrettifiedOutputFile string = "extracted_data.txt" |
|
|
|
defaultPrettifiedOutputFile string = "extracted_data.txt" |
|
|
|
|
|
|
|
defaultVisitQueueFile string = "visit_queue.tmp" |
|
|
|
) |
|
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
var ( |
|
|
|
var ( |
|
|
@ -138,10 +138,6 @@ func init() { |
|
|
|
|
|
|
|
|
|
|
|
// global path to output file
|
|
|
|
// global path to output file
|
|
|
|
outputFilePath = filepath.Join(workingDirectory, *outputFile) |
|
|
|
outputFilePath = filepath.Join(workingDirectory, *outputFile) |
|
|
|
|
|
|
|
|
|
|
|
go func() { |
|
|
|
|
|
|
|
http.ListenAndServe(":8000", nil) |
|
|
|
|
|
|
|
}() |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
func main() { |
|
|
|
func main() { |
|
|
@ -325,11 +321,41 @@ func main() { |
|
|
|
} |
|
|
|
} |
|
|
|
defer outputFile.Close() |
|
|
|
defer outputFile.Close() |
|
|
|
|
|
|
|
|
|
|
|
// prepare channels
|
|
|
|
|
|
|
|
jobs := make(chan web.Job, conf.Workers*5) |
|
|
|
jobs := make(chan web.Job, conf.Workers*5) |
|
|
|
results := make(chan web.Result, conf.Workers*5) |
|
|
|
results := make(chan web.Result, conf.Workers*5) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// create visit queue file if not turned off
|
|
|
|
|
|
|
|
var visitQueueFile *os.File = nil |
|
|
|
|
|
|
|
if !conf.InMemoryVisitQueue { |
|
|
|
|
|
|
|
var err error |
|
|
|
|
|
|
|
visitQueueFile, err = os.Create(filepath.Join(workingDirectory, defaultVisitQueueFile)) |
|
|
|
|
|
|
|
if err != nil { |
|
|
|
|
|
|
|
logger.Error("Could not create visit queue temporary file: %s", err) |
|
|
|
|
|
|
|
return |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
defer func() { |
|
|
|
|
|
|
|
visitQueueFile.Close() |
|
|
|
|
|
|
|
// os.Remove(filepath.Join(workingDirectory, defaultVisitQueueFile))
|
|
|
|
|
|
|
|
}() |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// create initial jobs
|
|
|
|
// create initial jobs
|
|
|
|
|
|
|
|
if !conf.InMemoryVisitQueue { |
|
|
|
|
|
|
|
encoder := json.NewEncoder(visitQueueFile) |
|
|
|
|
|
|
|
for _, initialPage := range conf.InitialPages { |
|
|
|
|
|
|
|
var newJob web.Job = web.Job{ |
|
|
|
|
|
|
|
URL: initialPage, |
|
|
|
|
|
|
|
Search: conf.Search, |
|
|
|
|
|
|
|
Depth: conf.Depth, |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
err = encoder.Encode(&newJob) |
|
|
|
|
|
|
|
if err != nil { |
|
|
|
|
|
|
|
logger.Error("Failed to encode an initial job to the visit queue: %s", err) |
|
|
|
|
|
|
|
continue |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
visitQueueFile.Seek(0, io.SeekStart) |
|
|
|
|
|
|
|
} else { |
|
|
|
for _, initialPage := range conf.InitialPages { |
|
|
|
for _, initialPage := range conf.InitialPages { |
|
|
|
jobs <- web.Job{ |
|
|
|
jobs <- web.Job{ |
|
|
|
URL: initialPage, |
|
|
|
URL: initialPage, |
|
|
@ -337,6 +363,7 @@ func main() { |
|
|
|
Depth: conf.Depth, |
|
|
|
Depth: conf.Depth, |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// form a worker pool
|
|
|
|
// form a worker pool
|
|
|
|
workerPool := worker.NewWorkerPool(jobs, results, conf.Workers, worker.WorkerConf{ |
|
|
|
workerPool := worker.NewWorkerPool(jobs, results, conf.Workers, worker.WorkerConf{ |
|
|
@ -344,6 +371,10 @@ func main() { |
|
|
|
Save: conf.Save, |
|
|
|
Save: conf.Save, |
|
|
|
BlacklistedDomains: conf.BlacklistedDomains, |
|
|
|
BlacklistedDomains: conf.BlacklistedDomains, |
|
|
|
AllowedDomains: conf.AllowedDomains, |
|
|
|
AllowedDomains: conf.AllowedDomains, |
|
|
|
|
|
|
|
VisitQueue: worker.VisitQueue{ |
|
|
|
|
|
|
|
VisitQueue: visitQueueFile, |
|
|
|
|
|
|
|
Lock: &sync.Mutex{}, |
|
|
|
|
|
|
|
}, |
|
|
|
}) |
|
|
|
}) |
|
|
|
logger.Info("Created a worker pool with %d workers", conf.Workers) |
|
|
|
logger.Info("Created a worker pool with %d workers", conf.Workers) |
|
|
|
|
|
|
|
|
|
|
|