From 7fa094d80fcfb0ad3daf4148f99014d4c2f4c6a4 Mon Sep 17 00:00:00 2001
From: Unbewohnte
Date: Wed, 28 Dec 2022 13:03:23 +0300
Subject: [PATCH] Fixed a terrible logical "bug" in worker's Work function;
 Added comments; Added more image extensions; Merged and improved ResolveLink
 function

---
 src/config/config.go |  6 ++++++
 src/main.go          |  3 ++-
 src/web/images.go    | 42 +++++++++++++++++++---------------------
 src/web/job.go       |  1 +
 src/web/requests.go  |  1 +
 src/web/result.go    |  1 +
 src/web/text.go      | 46 ++++++++++++++++++++++++--------------------
 src/worker/pool.go   |  6 ++++++
 src/worker/worker.go | 40 +++++++++++++++++++++++---------------
 9 files changed, 87 insertions(+), 59 deletions(-)

diff --git a/src/config/config.go b/src/config/config.go
index 77f8a22..1f0eed3 100644
--- a/src/config/config.go
+++ b/src/config/config.go
@@ -51,6 +51,7 @@ type Logging struct {
     LogsFile string `json:"logs_file"`
 }
 
+// Configuration file structure
 type Conf struct {
     Search Search `json:"search"`
     Requests Requests `json:"requests"`
@@ -63,6 +64,7 @@ type Conf struct {
     Logging Logging `json:"logging"`
 }
 
+// Default configuration file structure
 func Default() *Conf {
     return &Conf{
         Search: Search{
@@ -91,6 +93,7 @@ func Default() *Conf {
     }
 }
 
+// Write current configuration to w
 func (c *Conf) WriteTo(w io.Writer) error {
     jsonData, err := json.MarshalIndent(c, "", " ")
     if err != nil {
@@ -105,6 +108,7 @@ func (c *Conf) WriteTo(w io.Writer) error {
     return nil
 }
 
+// Read configuration from r
 func (c *Conf) ReadFrom(r io.Reader) error {
     jsonData, err := io.ReadAll(r)
     if err != nil {
@@ -119,6 +123,7 @@ func (c *Conf) ReadFrom(r io.Reader) error {
     return nil
 }
 
+// Creates configuration file at path
 func CreateConfigFile(conf Conf, path string) error {
     confFile, err := os.Create(path)
     if err != nil {
@@ -134,6 +139,7 @@ func CreateConfigFile(conf Conf, path string) error {
     return nil
 }
 
+// Tries to open configuration file at path. If it fails - returns default configuration
 func OpenConfigFile(path string) (*Conf, error) {
     confFile, err := os.Open(path)
     if err != nil {
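The helpers commented above form the whole configuration API of the config package. A minimal usage sketch, assuming only the signatures visible in these hunks (Default, CreateConfigFile, OpenConfigFile) and the "conf.json" name that main.go uses as defaultConfigFile:

package main

import (
    "log"

    "unbewohnte/wecr/config"
)

func main() {
    // write a default configuration file once
    if err := config.CreateConfigFile(*config.Default(), "conf.json"); err != nil {
        log.Fatalf("failed to create conf.json: %s", err)
    }

    // open it again; per the comment above, a failed open falls back to the default configuration
    conf, err := config.OpenConfigFile("conf.json")
    if err != nil {
        log.Fatalf("failed to open conf.json: %s", err)
    }

    log.Printf("request settings: %+v", conf.Requests)
}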
diff --git a/src/main.go b/src/main.go
index 10073df..9f294a5 100644
--- a/src/main.go
+++ b/src/main.go
@@ -35,7 +35,7 @@ import (
     "unbewohnte/wecr/worker"
 )
 
-const version = "v0.1.2"
+const version = "v0.1.3"
 
 const (
     defaultConfigFile string = "conf.json"
@@ -299,6 +299,7 @@ func main() {
             break
         }
 
+        // each entry in output file is a self-standing JSON object
         entryBytes, err := json.MarshalIndent(result, "", " ")
         if err != nil {
             continue
diff --git a/src/web/images.go b/src/web/images.go
index 850f1cb..cd120dc 100644
--- a/src/web/images.go
+++ b/src/web/images.go
@@ -20,7 +20,7 @@ package web
 
 import (
     "bytes"
-    "fmt"
+    "net/url"
     "strings"
 
     "golang.org/x/net/html"
@@ -31,9 +31,15 @@ func hasImageExtention(url string) bool {
         ".jpeg",
         ".jpg",
         ".jpe",
+        ".jfif",
         ".png",
         ".ppm",
         ".svg",
+        ".gif",
+        ".tiff",
+        ".bmp",
+        ".webp",
+        ".ico",
     }
 
     for _, extention := range extentions {
@@ -46,7 +52,7 @@ func hasImageExtention(url string) bool {
 }
 
 // Tries to find images' URLs on the page
-func FindPageImages(pageBody []byte, hostname string) []string {
+func FindPageImages(pageBody []byte, from *url.URL) []string {
     var urls []string
 
     tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
@@ -69,29 +75,21 @@ func FindPageImages(pageBody []byte, hostname string) []string {
                 continue
             }
 
-            var imageURL string = attribute.Val
-
-            if !strings.Contains(imageURL, hostname) {
-                // add hostname
-                if strings.HasPrefix(imageURL, "/") && strings.HasSuffix(hostname, "/") {
-                    imageURL = fmt.Sprintf("%s%s", hostname, imageURL[1:])
-                } else if !strings.HasPrefix(imageURL, "/") && !strings.HasSuffix(hostname, "/") {
-                    imageURL = fmt.Sprintf("%s/%s", hostname, imageURL)
-                } else {
-                    imageURL = fmt.Sprintf("%s%s", hostname, imageURL)
-                }
+            imageURL, err := url.Parse(attribute.Val)
+            if err != nil {
+                break
             }
 
-            imageURL = strings.TrimPrefix(imageURL, "//")
+            imageURLString := ResolveLink(imageURL, from.Host)
 
-            if !strings.HasPrefix(imageURL, "http://") && !strings.HasPrefix(imageURL, "https://") {
-                // add scheme
-                imageURL = "http://" + imageURL
-            }
-
-            // check for image extention
-            if hasImageExtention(imageURL) {
-                urls = append(urls, imageURL)
+            if attribute.Key == "src" {
+                // <img> tag -> don't check
+                urls = append(urls, imageURLString)
+            } else {
+                // <a> tag -> check for image extention
+                if hasImageExtention(imageURLString) {
+                    urls = append(urls, imageURLString)
+                }
             }
         }
     }
diff --git a/src/web/job.go b/src/web/job.go
index f98747d..4c66a7a 100644
--- a/src/web/job.go
+++ b/src/web/job.go
@@ -20,6 +20,7 @@ package web
 
 import "unbewohnte/wecr/config"
 
+// Job to pass around workers
 type Job struct {
     URL string
     Search config.Search
diff --git a/src/web/requests.go b/src/web/requests.go
index ffbd3d8..286a43f 100644
--- a/src/web/requests.go
+++ b/src/web/requests.go
@@ -24,6 +24,7 @@ import (
     "time"
 )
 
+// Get page data coming from url with optional user agent and timeout
 func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
     req, err := http.NewRequest("GET", url, nil)
     if err != nil {
diff --git a/src/web/result.go b/src/web/result.go
index d92388f..43c63ad 100644
--- a/src/web/result.go
+++ b/src/web/result.go
@@ -20,6 +20,7 @@ package web
 
 import "unbewohnte/wecr/config"
 
+// Result of page parsing
 type Result struct {
     PageURL string
     Search config.Search
diff --git a/src/web/text.go b/src/web/text.go
index 10b49c4..e2b0659 100644
--- a/src/web/text.go
+++ b/src/web/text.go
@@ -21,14 +21,33 @@ package web
 import (
     "bufio"
     "bytes"
-    "fmt"
+    "net/url"
     "regexp"
     "strings"
 
     "golang.org/x/net/html"
 )
 
-func FindPageLinks(pageBody []byte, hostname string) []string {
+// Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
+func ResolveLink(url *url.URL, fromHost string) string {
+    if !url.IsAbs() {
+        if url.Scheme == "" {
+            // add scheme
+            url.Scheme = "http"
+        }
+
+        if url.Host == "" {
+            // add host
+            url.Host = fromHost
+
+        }
+    }
+
+    return url.String()
+}
+
+// Find all links on page that are specified in <a> tag
+func FindPageLinks(pageBody []byte, from *url.URL) []string {
     var urls []string
 
     tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
@@ -52,27 +71,12 @@ func FindPageLinks(pageBody []byte, hostname string) []string {
                 continue
             }
 
-            var link string = attribute.Val
-
-            if !strings.Contains(link, hostname) {
-                // add hostname
-                if strings.HasPrefix(link, "/") && strings.HasSuffix(hostname, "/") {
-                    link = fmt.Sprintf("%s%s", hostname, link[1:])
-                } else if !strings.HasPrefix(link, "/") && !strings.HasSuffix(hostname, "/") {
-                    link = fmt.Sprintf("%s/%s", hostname, link)
-                } else {
-                    link = fmt.Sprintf("%s%s", hostname, link)
-                }
-            }
-
-            link = strings.TrimPrefix(link, "//")
-
-            if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
-                // add scheme
-                link = "http://" + link
+            link, err := url.Parse(attribute.Val)
+            if err != nil {
+                break
             }
 
-            urls = append(urls, link)
+            urls = append(urls, ResolveLink(link, from.Host))
         }
     }
 }
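ResolveLink is the function the other hunks now lean on, so a short behavioral sketch may help. The URLs below are made up, and the expected results follow directly from the function body above; note that a missing scheme always defaults to "http". The user agent and timeout passed to GetPage are illustrative values, not taken from the configuration:

package main

import (
    "fmt"
    "log"
    "net/url"

    "unbewohnte/wecr/web"
)

func main() {
    from, err := url.Parse("https://example.org/blog/post")
    if err != nil {
        log.Fatal(err)
    }

    // relative path -> scheme and host are filled in: "http://example.org/images/logo.png"
    relative, _ := url.Parse("/images/logo.png")
    fmt.Println(web.ResolveLink(relative, from.Host))

    // protocol-relative URL -> only the scheme is added: "http://cdn.example.org/pic.jpg"
    protoRelative, _ := url.Parse("//cdn.example.org/pic.jpg")
    fmt.Println(web.ResolveLink(protoRelative, from.Host))

    // absolute URL -> returned unchanged
    absolute, _ := url.Parse("https://example.org/about")
    fmt.Println(web.ResolveLink(absolute, from.Host))

    // FindPageLinks and FindPageImages now take the page URL itself and resolve links the same way
    pageData, err := web.GetPage(from.String(), "wecr", 3000)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(web.FindPageLinks(pageData, from))
    fmt.Println(web.FindPageImages(pageData, from))
}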
diff --git a/src/worker/pool.go b/src/worker/pool.go
index 9f23e48..9e83347 100644
--- a/src/worker/pool.go
+++ b/src/worker/pool.go
@@ -24,17 +24,20 @@ import (
     "unbewohnte/wecr/web"
 )
 
+// Already visited URLs
 type visited struct {
     URLs []string
     Lock sync.Mutex
 }
 
+// Whole worker pool's statistics
 type Statistics struct {
     PagesVisited uint64
     MatchesFound uint64
     StartTime time.Time
 }
 
+// Web-Worker pool
 type Pool struct {
     workersCount uint
     workers []*Worker
@@ -42,6 +45,7 @@ type Pool struct {
     Stats Statistics
 }
 
+// Create a new worker pool
 func NewWorkerPool(jobs chan web.Job, results chan web.Result, workerCount uint, workerConf WorkerConf) *Pool {
     var newPool Pool = Pool{
         workersCount: workerCount,
@@ -66,6 +70,7 @@ func NewWorkerPool(jobs chan web.Job, results chan web.Result, workerCount uint,
     return &newPool
 }
 
+// Notify all workers in pool to start scraping
 func (p *Pool) Work() {
     p.Stats.StartTime = time.Now()
 
@@ -75,6 +80,7 @@ func (p *Pool) Work() {
     }
 }
 
+// Notify all workers in pool to stop scraping
 func (p *Pool) Stop() {
     for _, worker := range p.workers {
         worker.Stopped = true
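For orientation, a rough sketch of how this pool is meant to be driven. The channel capacities, worker count and seed URL are invented for the example, the WorkerConf fields are the ones shown in the worker.go hunks below, and the assumption that Work() returns after starting the workers is based on its "notify" comment rather than on anything this patch shows:

package main

import (
    "fmt"

    "unbewohnte/wecr/config"
    "unbewohnte/wecr/web"
    "unbewohnte/wecr/worker"
)

func main() {
    // channel capacities are arbitrary for this sketch
    jobs := make(chan web.Job, 128)
    results := make(chan web.Result, 128)

    conf := config.Default()

    pool := worker.NewWorkerPool(jobs, results, 5, worker.WorkerConf{
        Requests: conf.Requests,
        // Save, BlacklistedDomains and AllowedDomains (see worker.go below) are left at their zero values here
    })

    // seed the first job; Depth is read by Work() but its exact semantics are not shown in this patch
    jobs <- web.Job{
        URL:    "https://example.org",
        Search: conf.Search,
        Depth:  1,
    }

    // assumed not to block, per the "Notify all workers..." comment above
    pool.Work()

    // consume a single result for illustration; main.go presumably loops over this channel
    result := <-results
    fmt.Println(result.PageURL, result.Data)

    pool.Stop()
}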
diff --git a/src/worker/worker.go b/src/worker/worker.go
index 3e5961c..8014e7b 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -33,6 +33,7 @@ import (
     "unbewohnte/wecr/web"
 )
 
+// Worker configuration
 type WorkerConf struct {
     Requests config.Requests
     Save config.Save
@@ -40,6 +41,7 @@ type WorkerConf struct {
     AllowedDomains []string
 }
 
+// Web worker
 type Worker struct {
     Jobs chan web.Job
     Results chan web.Result
@@ -49,6 +51,7 @@ type Worker struct {
     Stopped bool
 }
 
+// Create a new worker
 func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visited *visited, stats *Statistics) Worker {
     return Worker{
         Jobs: jobs,
@@ -60,6 +63,7 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
     }
 }
 
+// Save page to the disk with a corresponding name
 func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
     if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
         var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String()))
@@ -71,9 +75,12 @@ func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
         }
 
         pageFile.Close()
+
+        logger.Info("Saved \"%s\"", pageName)
     }
 }
 
+// Launch scraping process on this worker
 func (w *Worker) Work() {
     if w.Stopped {
         return
@@ -88,14 +95,14 @@ func (w *Worker) Work() {
         // see if the domain is allowed and is not blacklisted
         var skip bool = false
 
-        parsedURL, err := url.Parse(job.URL)
+        pageURL, err := url.Parse(job.URL)
         if err != nil {
             logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
             continue
         }
 
         for _, allowedDomain := range w.Conf.AllowedDomains {
-            if parsedURL.Hostname() != allowedDomain {
+            if pageURL.Hostname() != allowedDomain {
                 skip = true
                 logger.Info("Skipped non-allowed %s", job.URL)
                 break
@@ -107,7 +114,7 @@ func (w *Worker) Work() {
                 break
             }
 
-            if parsedURL.Hostname() == blacklistedDomain {
+            if pageURL.Hostname() == blacklistedDomain {
                 skip = true
                 logger.Info("Skipped blacklisted %s", job.URL)
                 break
@@ -129,6 +136,7 @@ func (w *Worker) Work() {
                 break
             }
         }
+
         if skip {
             continue
         }
@@ -147,7 +155,7 @@ func (w *Worker) Work() {
         }
 
         // find links
-        pageLinks := web.FindPageLinks(pageData, parsedURL.Host)
+        pageLinks := web.FindPageLinks(pageData, pageURL)
 
         go func() {
             if job.Depth > 1 {
@@ -178,31 +186,33 @@ func (w *Worker) Work() {
                     Search: job.Search,
                     Data: pageLinks,
                 }
+
                 w.stats.MatchesFound += uint64(len(pageLinks))
                 savePage = true
             }
         case config.QueryImages:
             // find image URLs, output images to the file while not saving already outputted ones
-            imageLinks := web.FindPageImages(pageData, parsedURL.Host)
+            imageLinks := web.FindPageImages(pageData, pageURL)
 
             var alreadyProcessedImgUrls []string
             for count, imageLink := range imageLinks {
                 // check if this URL has been processed already
                 var skipImage bool = false
+
                 for _, processedURL := range alreadyProcessedImgUrls {
                     if imageLink == processedURL {
                         skipImage = true
                         break
                     }
                 }
+
                 if skipImage {
                     skipImage = false
                     continue
-                } else {
-                    alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
                 }
+                alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
 
-                var imageName string = fmt.Sprintf("%s_%d_%s", parsedURL.Host, count, path.Base(imageLink))
+                var imageName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(imageLink))
 
                 response, err := http.Get(imageLink)
                 if err != nil {
@@ -266,14 +276,14 @@ func (w *Worker) Work() {
                     savePage = true
                 }
             }
+        }
 
-            // save page
-            if savePage {
-                w.savePage(parsedURL, pageData)
-            }
-
-            // sleep before the next request
-            time.Sleep(time.Duration(w.Conf.Requests.RequestPauseMs * uint64(time.Millisecond)))
+        // save page
+        if savePage {
+            w.savePage(pageURL, pageData)
         }
+
+        // sleep before the next request
+        time.Sleep(time.Duration(w.Conf.Requests.RequestPauseMs * uint64(time.Millisecond)))
     }
 }
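As a small illustration of the naming scheme in the hunks above, the two Sprintf patterns used by savePage and by the image branch produce names like the following; the page and image URLs are invented, the format strings are copied from the diff:

package main

import (
    "fmt"
    "net/url"
    "path"
)

func main() {
    pageURL, _ := url.Parse("https://example.org/blog/post1")

    // savePage: "%s_%s.html" -> "example.org_post1.html"
    pageName := fmt.Sprintf("%s_%s.html", pageURL.Host, path.Base(pageURL.String()))
    fmt.Println(pageName)

    // image branch: "%s_%d_%s" with the loop index -> "example.org_0_logo.png"
    imageLink := "https://example.org/images/logo.png"
    imageName := fmt.Sprintf("%s_%d_%s", pageURL.Host, 0, path.Base(imageLink))
    fmt.Println(imageName)
}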