
Fixed allowed_domains not being used

Branch: master · commit b84af90d08
Changed files:
1. README.md (2 changes)
2. src/logger/logger.go (5 changes)
3. src/main.go (39 changes)
4. src/web/images.go (2 changes)
5. src/worker/pool.go (1 change)
6. src/worker/worker.go (38 changes)

README.md (2 changes)

@@ -10,7 +10,7 @@ The flow of work fully depends on the configuration file. By default `conf.json`
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string), each of which contains tweakable parameters. There are global ones as well, such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory, so no attribute-by-attribute explanation is needed for most of them.
-The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains` or are NOT in `allowed_domains`. If all initial pages happen to be on blacklisted domains or are not in the allowed list, the program will get stuck.
+The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains` or are NOT in `allowed_domains`. If all initial pages happen to be on blacklisted domains or are not in the allowed list, the program will get stuck. It is important to note that `*_domains` should be specified with an existing scheme (ie: https://en.wikipedia.org). Subdomains and ports **matter**: `https://unbewohnte.su:3000/` and `https://unbewohnte.su/` are **different**.
 
 ### Search query
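
The new README note about schemes, subdomains and ports maps directly onto how Go's `net/url` reports hosts, and it is also why the commit switches from `Hostname()` to `Host` when normalising the domain lists. A small standalone sketch (not part of the commit) showing why `https://unbewohnte.su:3000/` and `https://unbewohnte.su/` count as different domains once compared by `Host`:

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	a, _ := url.Parse("https://unbewohnte.su:3000/")
	b, _ := url.Parse("https://unbewohnte.su/")

	// Host keeps the port, so the two URLs are treated as different domains;
	// Hostname() would strip the port and make them compare equal.
	fmt.Println(a.Host)                       // unbewohnte.su:3000
	fmt.Println(b.Host)                       // unbewohnte.su
	fmt.Println(a.Host == b.Host)             // false
	fmt.Println(a.Hostname() == b.Hostname()) // true
}
```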

src/logger/logger.go (5 changes)

@@ -50,6 +50,11 @@ func SetOutput(writer io.Writer) {
 	errorLog.SetOutput(writer)
 }
 
+// Get current logger's output writer
+func GetOutput() io.Writer {
+	return infoLog.Writer()
+}
+
 // Log information
 func Info(format string, a ...interface{}) {
 	infoLog.Printf(format, a...)
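
The added `GetOutput` accessor simply exposes the info logger's current `io.Writer`, so callers can write raw bytes (such as the logo and version banner printed in `main.go` below) to the same destination the logs go to. A hypothetical fragment, assuming the repository's `logger` package and a `version` constant like the one in `main.go`:

```go
// Hypothetical usage of the new accessor: write a raw banner to whatever
// writer the logger currently targets (stdout, a log file, etc.).
if out := logger.GetOutput(); out != nil {
	out.Write([]byte("wecr " + version + "\n\n"))
}
```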

src/main.go (39 changes)

@@ -35,7 +35,7 @@ import (
 	"unbewohnte/wecr/worker"
 )
 
-const version = "v0.1.3"
+const version = "v0.1.4"
 
 const (
 	defaultConfigFile string = "conf.json"
@@ -87,6 +87,17 @@ func init() {
 		os.Exit(0)
 	}
 
+	// print logo
+	logger.GetOutput().Write([]byte(
+		`
+`),
+	)
+	logger.GetOutput().Write([]byte(version + "\n\n"))
+
 	// work out working directory path
 	if *wDir != "" {
 		workingDirectory = *wDir
@@ -151,7 +162,7 @@ func main() {
 		}
 	} else {
 		// no logging needed
-		logger.Info("No logs will be outputted")
+		logger.Info("No further logs will be outputted")
 		logger.SetOutput(nil)
 	}
@@ -167,21 +178,33 @@ func main() {
 	for index, blacklistedDomain := range conf.BlacklistedDomains {
 		parsedURL, err := url.Parse(blacklistedDomain)
 		if err != nil {
-			logger.Warning("Failed to parse blacklisted %s: %s", blacklistedDomain, err)
+			logger.Warning("Failed to parse blacklisted \"%s\": %s", blacklistedDomain, err)
 			continue
 		}
+		if parsedURL.Scheme == "" {
+			// parsing is invalid, as stdlib says
+			logger.Warning("Failed to parse blacklisted \"%s\": no scheme specified", blacklistedDomain)
+			continue
+		}
-		conf.BlacklistedDomains[index] = parsedURL.Hostname()
+		conf.BlacklistedDomains[index] = parsedURL.Host
 	}
 
 	for index, allowedDomain := range conf.AllowedDomains {
 		parsedURL, err := url.Parse(allowedDomain)
 		if err != nil {
-			logger.Warning("Failed to parse allowed %s: %s", allowedDomain, err)
+			logger.Warning("Failed to parse allowed \"%s\": %s", allowedDomain, err)
 			continue
 		}
+		if parsedURL.Scheme == "" {
+			// parsing is invalid, as stdlib says
+			logger.Warning("Failed to parse allowed \"%s\": no scheme specified", allowedDomain)
+			continue
+		}
-		conf.AllowedDomains[index] = parsedURL.Hostname()
+		conf.AllowedDomains[index] = parsedURL.Host
 	}
 
 	if conf.Depth <= 0 {
@@ -252,6 +275,7 @@ func main() {
 		Requests: conf.Requests,
 		Save: conf.Save,
 		BlacklistedDomains: conf.BlacklistedDomains,
+		AllowedDomains: conf.AllowedDomains,
 	})
 
 	logger.Info("Created a worker pool with %d workers", conf.Workers)
@@ -282,9 +306,10 @@ func main() {
 			timeSince := time.Since(workerPool.Stats.StartTime).Round(time.Second)
 
-			fmt.Fprintf(os.Stdout, "\r[%s] %d pages; %d matches (%d pages/sec)",
+			fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d saved; %d matches (%d pages/sec)",
 				timeSince.String(),
 				workerPool.Stats.PagesVisited,
+				workerPool.Stats.PagesSaved,
 				workerPool.Stats.MatchesFound,
 				workerPool.Stats.PagesVisited/uint64(timeSince.Seconds()),
 			)
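
The new `Scheme == ""` guard exists because `url.Parse` accepts a bare host such as `en.wikipedia.org` without returning an error: it files the whole string under `Path` and leaves `Host` empty, so without the check such a config entry would be silently rewritten to an empty string and never match anything. A standalone illustration (not part of the commit) of that behaviour:

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	// A bare domain parses without error, but everything lands in Path
	// and Host stays empty, which is what the new scheme check guards against.
	u, err := url.Parse("en.wikipedia.org")
	fmt.Println(err)                                    // <nil>
	fmt.Printf("%q %q %q\n", u.Scheme, u.Host, u.Path)  // "" "" "en.wikipedia.org"

	// With a scheme, Host (including any port) is populated as expected.
	v, _ := url.Parse("https://en.wikipedia.org:8080/wiki")
	fmt.Println(v.Host) // en.wikipedia.org:8080
}
```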

src/web/images.go (2 changes)

@@ -82,7 +82,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
imageURLString := ResolveLink(imageURL, from.Host)
if attribute.Key == "src" {
if token.Data == "img" {
// <img> tag -> don't check
urls = append(urls, imageURLString)
} else {

src/worker/pool.go (1 change)

@@ -34,6 +34,7 @@ type visited struct {
 type Statistics struct {
 	PagesVisited uint64
 	MatchesFound uint64
+	PagesSaved uint64
 	StartTime time.Time
 }

src/worker/worker.go (38 changes)

@@ -93,7 +93,6 @@ func (w *Worker) Work() {
 			return
 		}
-		// see if the domain is allowed and is not blacklisted
 		var skip bool = false
 
 		pageURL, err := url.Parse(job.URL)
 		if err != nil {
@@ -101,30 +100,34 @@ func (w *Worker) Work() {
 			continue
 		}
-		for _, allowedDomain := range w.Conf.AllowedDomains {
-			if pageURL.Hostname() != allowedDomain {
-				skip = true
-				logger.Info("Skipped non-allowed %s", job.URL)
-				break
-			}
-		}
-		for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
-			if skip {
-				break
-			}
-			if pageURL.Hostname() == blacklistedDomain {
-				skip = true
-				logger.Info("Skipped blacklisted %s", job.URL)
-				break
-			}
-		}
-		if skip {
-			continue
-		}
+		// see if the domain is allowed and is not blacklisted
+		if len(w.Conf.AllowedDomains) > 0 {
+			skip = true
+			for _, allowedDomain := range w.Conf.AllowedDomains {
+				if pageURL.Host == allowedDomain {
+					skip = false
+					break
+				}
+			}
+			if skip {
+				logger.Info("Skipped non-allowed %s", job.URL)
+				continue
+			}
+		}
+		if len(w.Conf.BlacklistedDomains) > 0 {
+			for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
+				if pageURL.Host == blacklistedDomain {
+					skip = true
+					logger.Info("Skipped blacklisted %s", job.URL)
+					break
+				}
+			}
+			if skip {
+				continue
+			}
+		}
 
 		// check if it is the first occurence
 		w.visited.Lock.Lock()
 		for _, visitedURL := range w.visited.URLs {
@@ -281,6 +284,7 @@ func (w *Worker) Work() {
 			// save page
 			if savePage {
 				w.savePage(pageURL, pageData)
+				w.stats.PagesSaved++
 			}
 
 			// sleep before the next request
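
Pulled out of the worker, the new allow/blacklist decision reads as a small predicate. The sketch below is a hypothetical standalone helper (not the worker's actual API) that mirrors the logic shown in the diff: an empty `allowed_domains` list permits every host, a non-empty one requires an exact `Host` match, and a blacklist match always skips the page.

```go
// hostAllowed reports whether a page's Host may be crawled given the two
// normalised domain lists (hypothetical helper, names are illustrative).
func hostAllowed(host string, allowed, blacklisted []string) bool {
	// A non-empty allowed list acts as a whitelist of exact Host matches.
	if len(allowed) > 0 {
		ok := false
		for _, a := range allowed {
			if host == a {
				ok = true
				break
			}
		}
		if !ok {
			return false
		}
	}
	// The blacklist always wins on an exact Host match.
	for _, b := range blacklisted {
		if host == b {
			return false
		}
	}
	return true
}
```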
