From 7fa094d80fcfb0ad3daf4148f99014d4c2f4c6a4 Mon Sep 17 00:00:00 2001
From: Unbewohnte
Date: Wed, 28 Dec 2022 13:03:23 +0300
Subject: [PATCH] Fixed a terrible logical "bug" in worker's Work function;
 Added comments; Added more image extensions; Merged and improved ResolveLink
 function

---
 src/config/config.go |  6 ++++++
 src/main.go          |  3 ++-
 src/web/images.go    | 42 +++++++++++++++++++---------------------
 src/web/job.go       |  1 +
 src/web/requests.go  |  1 +
 src/web/result.go    |  1 +
 src/web/text.go      | 46 ++++++++++++++++++++++++--------------------
 src/worker/pool.go   |  6 ++++++
 src/worker/worker.go | 40 +++++++++++++++++++++++---------------
 9 files changed, 87 insertions(+), 59 deletions(-)

diff --git a/src/config/config.go b/src/config/config.go
index 77f8a22..1f0eed3 100644
--- a/src/config/config.go
+++ b/src/config/config.go
@@ -51,6 +51,7 @@ type Logging struct {
     LogsFile string `json:"logs_file"`
 }
 
+// Configuration file structure
 type Conf struct {
     Search Search `json:"search"`
     Requests Requests `json:"requests"`
@@ -63,6 +64,7 @@ type Conf struct {
     Logging Logging `json:"logging"`
 }
 
+// Default configuration file structure
 func Default() *Conf {
     return &Conf{
         Search: Search{
@@ -91,6 +93,7 @@ func Default() *Conf {
     }
 }
 
+// Write current configuration to w
 func (c *Conf) WriteTo(w io.Writer) error {
     jsonData, err := json.MarshalIndent(c, "", " ")
     if err != nil {
@@ -105,6 +108,7 @@ func (c *Conf) WriteTo(w io.Writer) error {
     return nil
 }
 
+// Read configuration from r
 func (c *Conf) ReadFrom(r io.Reader) error {
     jsonData, err := io.ReadAll(r)
     if err != nil {
@@ -119,6 +123,7 @@ func (c *Conf) ReadFrom(r io.Reader) error {
     return nil
 }
 
+// Creates configuration file at path
 func CreateConfigFile(conf Conf, path string) error {
     confFile, err := os.Create(path)
     if err != nil {
@@ -134,6 +139,7 @@ func CreateConfigFile(conf Conf, path string) error {
     return nil
 }
 
+// Tries to open configuration file at path. If it fails - returns default configuration
 func OpenConfigFile(path string) (*Conf, error) {
     confFile, err := os.Open(path)
     if err != nil {
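The helpers commented above form the whole configuration API of the config package. A minimal usage sketch, assuming only the signatures visible in these hunks (Default, CreateConfigFile, OpenConfigFile) and the "conf.json" name that main.go uses as defaultConfigFile:

package main

import (
    "log"

    "unbewohnte/wecr/config"
)

func main() {
    // write a default configuration file once
    if err := config.CreateConfigFile(*config.Default(), "conf.json"); err != nil {
        log.Fatalf("failed to create conf.json: %s", err)
    }

    // open it again; per the comment above, a failed open falls back to the default configuration
    conf, err := config.OpenConfigFile("conf.json")
    if err != nil {
        log.Fatalf("failed to open conf.json: %s", err)
    }

    log.Printf("request settings: %+v", conf.Requests)
}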
diff --git a/src/main.go b/src/main.go
index 10073df..9f294a5 100644
--- a/src/main.go
+++ b/src/main.go
@@ -35,7 +35,7 @@ import (
     "unbewohnte/wecr/worker"
 )
 
-const version = "v0.1.2"
+const version = "v0.1.3"
 
 const (
     defaultConfigFile string = "conf.json"
@@ -299,6 +299,7 @@ func main() {
             break
         }
 
+        // each entry in output file is a self-standing JSON object
         entryBytes, err := json.MarshalIndent(result, "", " ")
         if err != nil {
             continue
diff --git a/src/web/images.go b/src/web/images.go
index 850f1cb..cd120dc 100644
--- a/src/web/images.go
+++ b/src/web/images.go
@@ -20,7 +20,7 @@ package web
 
 import (
     "bytes"
-    "fmt"
+    "net/url"
     "strings"
 
     "golang.org/x/net/html"
@@ -31,9 +31,15 @@ func hasImageExtention(url string) bool {
         ".jpeg",
         ".jpg",
         ".jpe",
+        ".jfif",
         ".png",
         ".ppm",
         ".svg",
+        ".gif",
+        ".tiff",
+        ".bmp",
+        ".webp",
+        ".ico",
     }
 
     for _, extention := range extentions {
@@ -46,7 +52,7 @@ func hasImageExtention(url string) bool {
 }
 
 // Tries to find images' URLs on the page
-func FindPageImages(pageBody []byte, hostname string) []string {
+func FindPageImages(pageBody []byte, from *url.URL) []string {
     var urls []string
 
     tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
@@ -69,29 +75,21 @@ func FindPageImages(pageBody []byte, hostname string) []string {
                 continue
             }
 
-            var imageURL string = attribute.Val
-
-            if !strings.Contains(imageURL, hostname) {
-                // add hostname
-                if strings.HasPrefix(imageURL, "/") && strings.HasSuffix(hostname, "/") {
-                    imageURL = fmt.Sprintf("%s%s", hostname, imageURL[1:])
-                } else if !strings.HasPrefix(imageURL, "/") && !strings.HasSuffix(hostname, "/") {
-                    imageURL = fmt.Sprintf("%s/%s", hostname, imageURL)
-                } else {
-                    imageURL = fmt.Sprintf("%s%s", hostname, imageURL)
-                }
+            imageURL, err := url.Parse(attribute.Val)
+            if err != nil {
+                break
             }
 
-            imageURL = strings.TrimPrefix(imageURL, "//")
+            imageURLString := ResolveLink(imageURL, from.Host)
 
-            if !strings.HasPrefix(imageURL, "http://") && !strings.HasPrefix(imageURL, "https://") {
-                // add scheme
-                imageURL = "http://" + imageURL
-            }
-
-            // check for image extention
-            if hasImageExtention(imageURL) {
-                urls = append(urls, imageURL)
+            if attribute.Key == "src" {
+                // <img> tag -> don't check
+                urls = append(urls, imageURLString)
+            } else {
+                // <a> tag -> check for image extention
+                if hasImageExtention(imageURLString) {
+                    urls = append(urls, imageURLString)
+                }
             }
         }
     }
diff --git a/src/web/job.go b/src/web/job.go
index f98747d..4c66a7a 100644
--- a/src/web/job.go
+++ b/src/web/job.go
@@ -20,6 +20,7 @@ package web
 
 import "unbewohnte/wecr/config"
 
+// Job to pass around workers
 type Job struct {
     URL string
     Search config.Search
diff --git a/src/web/requests.go b/src/web/requests.go
index ffbd3d8..286a43f 100644
--- a/src/web/requests.go
+++ b/src/web/requests.go
@@ -24,6 +24,7 @@ import (
     "time"
 )
 
+// Get page data coming from url with optional user agent and timeout
 func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
     req, err := http.NewRequest("GET", url, nil)
     if err != nil {
diff --git a/src/web/result.go b/src/web/result.go
index d92388f..43c63ad 100644
--- a/src/web/result.go
+++ b/src/web/result.go
@@ -20,6 +20,7 @@ package web
 
 import "unbewohnte/wecr/config"
 
+// Result of page parsing
 type Result struct {
     PageURL string
     Search config.Search
diff --git a/src/web/text.go b/src/web/text.go
index 10b49c4..e2b0659 100644
--- a/src/web/text.go
+++ b/src/web/text.go
@@ -21,14 +21,33 @@ package web
 import (
     "bufio"
     "bytes"
-    "fmt"
+    "net/url"
     "regexp"
     "strings"
 
     "golang.org/x/net/html"
 )
 
-func FindPageLinks(pageBody []byte, hostname string) []string {
+// Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
+func ResolveLink(url *url.URL, fromHost string) string {
+    if !url.IsAbs() {
+        if url.Scheme == "" {
+            // add scheme
+            url.Scheme = "http"
+        }
+
+        if url.Host == "" {
+            // add host
+            url.Host = fromHost
+
+        }
+    }
+
+    return url.String()
+}
+
+// Find all links on page that are specified in <a> tag
+func FindPageLinks(pageBody []byte, from *url.URL) []string {
     var urls []string
 
     tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
@@ -52,27 +71,12 @@ func FindPageLinks(pageBody []byte, hostname string) []string {
                 continue
             }
 
-            var link string = attribute.Val
-
-            if !strings.Contains(link, hostname) {
-                // add hostname
-                if strings.HasPrefix(link, "/") && strings.HasSuffix(hostname, "/") {
-                    link = fmt.Sprintf("%s%s", hostname, link[1:])
-                } else if !strings.HasPrefix(link, "/") && !strings.HasSuffix(hostname, "/") {
-                    link = fmt.Sprintf("%s/%s", hostname, link)
-                } else {
-                    link = fmt.Sprintf("%s%s", hostname, link)
-                }
-            }
-
-            link = strings.TrimPrefix(link, "//")
-
-            if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
-                // add scheme
-                link = "http://" + link
+            link, err := url.Parse(attribute.Val)
+            if err != nil {
+                break
             }
 
-            urls = append(urls, link)
+            urls = append(urls, ResolveLink(link, from.Host))
         }
     }
 }
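ResolveLink is the function the other hunks now lean on, so a short behavioral sketch may help. The URLs below are made up, and the expected results follow directly from the function body above; note that a missing scheme always defaults to "http". The user agent and timeout passed to GetPage are illustrative values, not taken from the configuration:

package main

import (
    "fmt"
    "log"
    "net/url"

    "unbewohnte/wecr/web"
)

func main() {
    from, err := url.Parse("https://example.org/blog/post")
    if err != nil {
        log.Fatal(err)
    }

    // relative path -> scheme and host are filled in: "http://example.org/images/logo.png"
    relative, _ := url.Parse("/images/logo.png")
    fmt.Println(web.ResolveLink(relative, from.Host))

    // protocol-relative URL -> only the scheme is added: "http://cdn.example.org/pic.jpg"
    protoRelative, _ := url.Parse("//cdn.example.org/pic.jpg")
    fmt.Println(web.ResolveLink(protoRelative, from.Host))

    // absolute URL -> returned unchanged
    absolute, _ := url.Parse("https://example.org/about")
    fmt.Println(web.ResolveLink(absolute, from.Host))

    // FindPageLinks and FindPageImages now take the page URL itself and resolve links the same way
    pageData, err := web.GetPage(from.String(), "wecr", 3000)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(web.FindPageLinks(pageData, from))
    fmt.Println(web.FindPageImages(pageData, from))
}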
diff --git a/src/worker/pool.go b/src/worker/pool.go
index 9f23e48..9e83347 100644
--- a/src/worker/pool.go
+++ b/src/worker/pool.go
@@ -24,17 +24,20 @@ import (
     "unbewohnte/wecr/web"
 )
 
+// Already visited URLs
 type visited struct {
     URLs []string
     Lock sync.Mutex
 }
 
+// Whole worker pool's statistics
 type Statistics struct {
     PagesVisited uint64
     MatchesFound uint64
     StartTime time.Time
 }
 
+// Web-Worker pool
 type Pool struct {
     workersCount uint
     workers []*Worker
@@ -42,6 +45,7 @@ type Pool struct {
     Stats Statistics
 }
 
+// Create a new worker pool
 func NewWorkerPool(jobs chan web.Job, results chan web.Result, workerCount uint, workerConf WorkerConf) *Pool {
     var newPool Pool = Pool{
         workersCount: workerCount,
@@ -66,6 +70,7 @@ func NewWorkerPool(jobs chan web.Job, results chan web.Result, workerCount uint,
     return &newPool
 }
 
+// Notify all workers in pool to start scraping
 func (p *Pool) Work() {
     p.Stats.StartTime = time.Now()
 
@@ -75,6 +80,7 @@ func (p *Pool) Work() {
     }
 }
 
+// Notify all workers in pool to stop scraping
 func (p *Pool) Stop() {
     for _, worker := range p.workers {
         worker.Stopped = true
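For orientation, a rough sketch of how this pool is meant to be driven. The channel capacities, worker count and seed URL are invented for the example, the WorkerConf fields are the ones shown in the worker.go hunks below, and the assumption that Work() returns after starting the workers is based on its "notify" comment rather than on anything this patch shows:

package main

import (
    "fmt"

    "unbewohnte/wecr/config"
    "unbewohnte/wecr/web"
    "unbewohnte/wecr/worker"
)

func main() {
    // channel capacities are arbitrary for this sketch
    jobs := make(chan web.Job, 128)
    results := make(chan web.Result, 128)

    conf := config.Default()

    pool := worker.NewWorkerPool(jobs, results, 5, worker.WorkerConf{
        Requests: conf.Requests,
        // Save, BlacklistedDomains and AllowedDomains (see worker.go below) are left at their zero values here
    })

    // seed the first job; Depth is read by Work() but its exact semantics are not shown in this patch
    jobs <- web.Job{
        URL:    "https://example.org",
        Search: conf.Search,
        Depth:  1,
    }

    // assumed not to block, per the "Notify all workers..." comment above
    pool.Work()

    // consume a single result for illustration; main.go presumably loops over this channel
    result := <-results
    fmt.Println(result.PageURL, result.Data)

    pool.Stop()
}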
diff --git a/src/worker/worker.go b/src/worker/worker.go
index 3e5961c..8014e7b 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -33,6 +33,7 @@ import (
     "unbewohnte/wecr/web"
 )
 
+// Worker configuration
 type WorkerConf struct {
     Requests config.Requests
     Save config.Save
@@ -40,6 +41,7 @@ type WorkerConf struct {
     AllowedDomains []string
 }
 
+// Web worker
 type Worker struct {
     Jobs chan web.Job
     Results chan web.Result
@@ -49,6 +51,7 @@ type Worker struct {
     Stopped bool
 }
 
+// Create a new worker
 func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visited *visited, stats *Statistics) Worker {
     return Worker{
         Jobs: jobs,
@@ -60,6 +63,7 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
     }
 }
 
+// Save page to the disk with a corresponding name
 func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
     if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
         var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String()))
@@ -71,9 +75,12 @@ func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
         }
 
         pageFile.Close()
+
+        logger.Info("Saved \"%s\"", pageName)
     }
 }
 
+// Launch scraping process on this worker
 func (w *Worker) Work() {
     if w.Stopped {
         return
@@ -88,14 +95,14 @@ func (w *Worker) Work() {
         // see if the domain is allowed and is not blacklisted
         var skip bool = false
 
-        parsedURL, err := url.Parse(job.URL)
+        pageURL, err := url.Parse(job.URL)
         if err != nil {
             logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
             continue
         }
 
         for _, allowedDomain := range w.Conf.AllowedDomains {
-            if parsedURL.Hostname() != allowedDomain {
+            if pageURL.Hostname() != allowedDomain {
                 skip = true
                 logger.Info("Skipped non-allowed %s", job.URL)
                 break
@@ -107,7 +114,7 @@ func (w *Worker) Work() {
                 break
             }
 
-            if parsedURL.Hostname() == blacklistedDomain {
+            if pageURL.Hostname() == blacklistedDomain {
                 skip = true
                 logger.Info("Skipped blacklisted %s", job.URL)
                 break
@@ -129,6 +136,7 @@ func (w *Worker) Work() {
                 break
             }
         }
+
         if skip {
             continue
         }
@@ -147,7 +155,7 @@ func (w *Worker) Work() {
         }
 
         // find links
-        pageLinks := web.FindPageLinks(pageData, parsedURL.Host)
+        pageLinks := web.FindPageLinks(pageData, pageURL)
 
         go func() {
             if job.Depth > 1 {
@@ -178,31 +186,33 @@ func (w *Worker) Work() {
                     Search: job.Search,
                     Data: pageLinks,
                 }
+
                 w.stats.MatchesFound += uint64(len(pageLinks))
                 savePage = true
             }
         case config.QueryImages:
             // find image URLs, output images to the file while not saving already outputted ones
-            imageLinks := web.FindPageImages(pageData, parsedURL.Host)
+            imageLinks := web.FindPageImages(pageData, pageURL)
 
             var alreadyProcessedImgUrls []string
             for count, imageLink := range imageLinks {
                 // check if this URL has been processed already
                 var skipImage bool = false
+
                 for _, processedURL := range alreadyProcessedImgUrls {
                     if imageLink == processedURL {
                         skipImage = true
                         break
                     }
                 }
+
                 if skipImage {
                     skipImage = false
                     continue
-                } else {
-                    alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
                 }
+                alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
 
-                var imageName string = fmt.Sprintf("%s_%d_%s", parsedURL.Host, count, path.Base(imageLink))
+                var imageName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(imageLink))
 
                 response, err := http.Get(imageLink)
                 if err != nil {
@@ -266,14 +276,14 @@ func (w *Worker) Work() {
                     savePage = true
                 }
             }
+        }
 
-            // save page
-            if savePage {
-                w.savePage(parsedURL, pageData)
-            }
-
-            // sleep before the next request
-            time.Sleep(time.Duration(w.Conf.Requests.RequestPauseMs * uint64(time.Millisecond)))
+        // save page
+        if savePage {
+            w.savePage(pageURL, pageData)
         }
+
+        // sleep before the next request
+        time.Sleep(time.Duration(w.Conf.Requests.RequestPauseMs * uint64(time.Millisecond)))
     }
 }
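As a small illustration of the naming scheme in the hunks above, the two Sprintf patterns used by savePage and by the image branch produce names like the following; the page and image URLs are invented, the format strings are copied from the diff:

package main

import (
    "fmt"
    "net/url"
    "path"
)

func main() {
    pageURL, _ := url.Parse("https://example.org/blog/post1")

    // savePage: "%s_%s.html" -> "example.org_post1.html"
    pageName := fmt.Sprintf("%s_%s.html", pageURL.Host, path.Base(pageURL.String()))
    fmt.Println(pageName)

    // image branch: "%s_%d_%s" with the loop index -> "example.org_0_logo.png"
    imageLink := "https://example.org/images/logo.png"
    imageName := fmt.Sprintf("%s_%d_%s", pageURL.Host, 0, path.Base(imageLink))
    fmt.Println(imageName)
}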