diff --git a/README.md b/README.md
index 247c75d..cad5fb5 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ The flow of work fully depends on the configuration file. By default `conf.json`
 
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them.
 
-The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains` or are NOT in `allowed_domains`. If all initial pages are happen to be on blacklisted domains or are not in the allowed list - the program will get stuck.
+The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains` or are NOT in `allowed_domains`. If all initial pages happen to be on blacklisted domains or are not in the allowed list, the program will get stuck. It is important to note that `*_domains` should be specified with an existing scheme (ie: https://en.wikipedia.org). Subdomains and ports **matter**: `https://unbewohnte.su:3000/` and `https://unbewohnte.su/` are **different**.
 
 ### Search query
 
diff --git a/src/logger/logger.go b/src/logger/logger.go
index 3ba1e83..08668a3 100644
--- a/src/logger/logger.go
+++ b/src/logger/logger.go
@@ -50,6 +50,11 @@ func SetOutput(writer io.Writer) {
 	errorLog.SetOutput(writer)
 }
 
+// Get current logger's output writer
+func GetOutput() io.Writer {
+	return infoLog.Writer()
+}
+
 // Log information
 func Info(format string, a ...interface{}) {
 	infoLog.Printf(format, a...)
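Note: the README addition above is load-bearing because the crawler compares `url.URL.Host` values as exact strings (see the main.go and worker.go hunks below). A minimal standalone sketch, using only the Go standard library, of why ports and subdomains yield distinct hosts; `unbewohnte.su` is just the example domain from the README:

package main

import (
	"fmt"
	"net/url"
)

func main() {
	// url.URL.Host includes the port, so these two URLs name
	// different hosts under an exact string comparison.
	a, _ := url.Parse("https://unbewohnte.su:3000/")
	b, _ := url.Parse("https://unbewohnte.su/")
	fmt.Println(a.Host)           // unbewohnte.su:3000
	fmt.Println(b.Host)           // unbewohnte.su
	fmt.Println(a.Host == b.Host) // false

	// A subdomain likewise produces a different Host string.
	c, _ := url.Parse("https://en.wikipedia.org/")
	d, _ := url.Parse("https://wikipedia.org/")
	fmt.Println(c.Host == d.Host) // false
}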
diff --git a/src/main.go b/src/main.go
index 9f294a5..d6e56b9 100644
--- a/src/main.go
+++ b/src/main.go
@@ -35,7 +35,7 @@ import (
 	"unbewohnte/wecr/worker"
 )
 
-const version = "v0.1.3"
+const version = "v0.1.4"
 
 const (
 	defaultConfigFile string = "conf.json"
@@ -87,6 +87,17 @@ func init() {
 		os.Exit(0)
 	}
 
+	// print logo
+	logger.GetOutput().Write([]byte(
+		`██╗    ██╗███████╗ ██████╗██████╗ 
+██║    ██║██╔════╝██╔════╝██╔══██╗
+██║ █╗ ██║█████╗  ██║     ██████╔╝
+██║███╗██║██╔══╝  ██║     ██╔══██╗
+╚███╔███╔╝███████╗╚██████╗██║  ██║
+ ╚══╝╚══╝ ╚══════╝ ╚═════╝╚═╝  ╚═╝`),
+	)
+	logger.GetOutput().Write([]byte(version + "\n\n"))
+
 	// work out working directory path
 	if *wDir != "" {
 		workingDirectory = *wDir
@@ -151,7 +162,7 @@ func main() {
 		}
 	} else {
 		// no logging needed
-		logger.Info("No logs will be outputted")
+		logger.Info("No further logs will be outputted")
 		logger.SetOutput(nil)
 	}
 
@@ -167,21 +178,33 @@ func main() {
 	for index, blacklistedDomain := range conf.BlacklistedDomains {
 		parsedURL, err := url.Parse(blacklistedDomain)
 		if err != nil {
-			logger.Warning("Failed to parse blacklisted %s: %s", blacklistedDomain, err)
+			logger.Warning("Failed to parse blacklisted \"%s\": %s", blacklistedDomain, err)
+			continue
+		}
+
+		if parsedURL.Scheme == "" {
+			// parsing is invalid, as stdlib says
+			logger.Warning("Failed to parse blacklisted \"%s\": no scheme specified", blacklistedDomain)
 			continue
 		}
 
-		conf.BlacklistedDomains[index] = parsedURL.Hostname()
+		conf.BlacklistedDomains[index] = parsedURL.Host
 	}
 
 	for index, allowedDomain := range conf.AllowedDomains {
 		parsedURL, err := url.Parse(allowedDomain)
 		if err != nil {
-			logger.Warning("Failed to parse allowed %s: %s", allowedDomain, err)
+			logger.Warning("Failed to parse allowed \"%s\": %s", allowedDomain, err)
+			continue
+		}
+
+		if parsedURL.Scheme == "" {
+			// parsing is invalid, as stdlib says
+			logger.Warning("Failed to parse allowed \"%s\": no scheme specified", allowedDomain)
 			continue
 		}
 
-		conf.AllowedDomains[index] = parsedURL.Hostname()
+		conf.AllowedDomains[index] = parsedURL.Host
 	}
 
 	if conf.Depth <= 0 {
@@ -252,6 +275,7 @@ func main() {
 		Requests:           conf.Requests,
 		Save:               conf.Save,
 		BlacklistedDomains: conf.BlacklistedDomains,
+		AllowedDomains:     conf.AllowedDomains,
 	})
 	logger.Info("Created a worker pool with %d workers", conf.Workers)
 
@@ -282,9 +306,10 @@ func main() {
 
 		timeSince := time.Since(workerPool.Stats.StartTime).Round(time.Second)
 
-		fmt.Fprintf(os.Stdout, "\r[%s] %d pages; %d matches (%d pages/sec)",
+		fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d saved; %d matches (%d pages/sec)",
 			timeSince.String(),
 			workerPool.Stats.PagesVisited,
+			workerPool.Stats.PagesSaved,
 			workerPool.Stats.MatchesFound,
 			workerPool.Stats.PagesVisited/uint64(timeSince.Seconds()),
 		)
diff --git a/src/web/images.go b/src/web/images.go
index cd120dc..a6aad61 100644
--- a/src/web/images.go
+++ b/src/web/images.go
@@ -82,7 +82,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
 
 			imageURLString := ResolveLink(imageURL, from.Host)
 
-			if attribute.Key == "src" {
+			if token.Data == "img" { // <img> tag -> don't check
 				urls = append(urls, imageURLString)
 			} else {
diff --git a/src/worker/pool.go b/src/worker/pool.go
index 9e83347..95e5662 100644
--- a/src/worker/pool.go
+++ b/src/worker/pool.go
@@ -34,6 +34,7 @@ type visited struct {
 
 type Statistics struct {
 	PagesVisited uint64
 	MatchesFound uint64
+	PagesSaved   uint64
 	StartTime    time.Time
 }
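The `parsedURL.Scheme == ""` guard and the `Hostname()` to `Host` switch in the main.go hunk above follow from how `net/url` treats scheme-less input: parsing succeeds, but the whole string lands in `Path` while `Host` stays empty. A small sketch of that standard-library behavior (illustrative only, not part of the patch):

package main

import (
	"fmt"
	"net/url"
)

func main() {
	// No scheme: url.Parse returns no error, but the input is treated
	// as a relative path, so Scheme and Host are both empty. This is
	// the case the new warning in main.go reports.
	u, err := url.Parse("en.wikipedia.org")
	fmt.Println(err)                                   // <nil>
	fmt.Printf("%q %q %q\n", u.Scheme, u.Host, u.Path) // "" "" "en.wikipedia.org"

	// With a scheme, Host is populated and, unlike Hostname(), keeps
	// the port, which is what the workers now compare against.
	v, _ := url.Parse("https://en.wikipedia.org:8080/wiki")
	fmt.Println(v.Host)       // en.wikipedia.org:8080
	fmt.Println(v.Hostname()) // en.wikipedia.org
}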
diff --git a/src/worker/worker.go b/src/worker/worker.go
index 8014e7b..2e0e96d 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -93,7 +93,6 @@ func (w *Worker) Work() {
 			return
 		}
 
-		// see if the domain is allowed and is not blacklisted
 		var skip bool = false
 		pageURL, err := url.Parse(job.URL)
 		if err != nil {
@@ -101,30 +100,34 @@ func (w *Worker) Work() {
 			continue
 		}
 
-		for _, allowedDomain := range w.Conf.AllowedDomains {
-			if pageURL.Hostname() != allowedDomain {
-				skip = true
+		// see if the domain is allowed and is not blacklisted
+		if len(w.Conf.AllowedDomains) > 0 {
+			skip = true
+			for _, allowedDomain := range w.Conf.AllowedDomains {
+				if pageURL.Host == allowedDomain {
+					skip = false
+					break
+				}
+			}
+			if skip {
 				logger.Info("Skipped non-allowed %s", job.URL)
-				break
+				continue
 			}
 		}
 
-		for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
-			if skip {
-				break
+		if len(w.Conf.BlacklistedDomains) > 0 {
+			for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
+				if pageURL.Host == blacklistedDomain {
+					skip = true
+					logger.Info("Skipped blacklisted %s", job.URL)
+					break
+				}
 			}
-
-			if pageURL.Hostname() == blacklistedDomain {
-				skip = true
-				logger.Info("Skipped blacklisted %s", job.URL)
-				break
+			if skip {
+				continue
 			}
 		}
 
-		if skip {
-			continue
-		}
-
 		// check if it is the first occurence
 		w.visited.Lock.Lock()
 		for _, visitedURL := range w.visited.URLs {
@@ -281,6 +284,7 @@ func (w *Worker) Work() {
 		// save page
 		if savePage {
 			w.savePage(pageURL, pageData)
+			w.stats.PagesSaved++
 		}
 
 		// sleep before the next request
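Taken together, the worker.go changes give the allowlist correct semantics: the old loop set `skip` as soon as the host differed from any single allowed domain, so with more than one entry no page could ever pass. Condensed into a hypothetical standalone helper (`isPermitted` is not a function in the codebase, just an illustration of the rule the rewritten loop implements):

// isPermitted mirrors the filtering logic of the rewritten worker loop:
// an empty allowlist permits every host, a non-empty allowlist permits
// only exact matches, and a blacklist match always rejects.
func isPermitted(host string, allowed, blacklisted []string) bool {
	if len(allowed) > 0 {
		matched := false
		for _, a := range allowed {
			if host == a {
				matched = true
				break
			}
		}
		if !matched {
			return false
		}
	}
	for _, b := range blacklisted {
		if host == b {
			return false
		}
	}
	return true
}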