Blacklisting domains

Commit 7c30787990 (branch: master)
14 changed files:

1. .gitignore (3 changes)
2. Makefile (2 changes)
3. README.md (5 changes)
4. src/config/config.go (24 changes)
5. src/go.mod (2 changes)
6. src/logger/logger.go (2 changes)
7. src/main.go (40 changes)
8. src/web/images.go (4 changes)
9. src/web/job.go (4 changes)
10. src/web/requests.go (2 changes)
11. src/web/result.go (4 changes)
12. src/web/text.go (2 changes)
13. src/worker/pool.go (4 changes)
14. src/worker/worker.go (40 changes)

.gitignore (3 changes)

@@ -3,4 +3,5 @@ conf.json
 logs.log
 output.json
 websurf
-conf_mega_ita.json
+conf_mega_ita.json
+wecr

Makefile (2 changes)

@@ -1,5 +1,5 @@
 SRCDIR:=src
-EXE:=websurf
+EXE:=wecr
 TESTDIR:=testing
 all:

README.md (5 changes)

@@ -1,8 +1,8 @@
-# Websurf
+# Wecr - simple web crawler
 ## Overview
-Just a simple HTML web spider with minimal dependencies. It is possible to search for pages with a text on them or for the text itself, extract images and save pages that satisfy the criteria along the way.
+Just a simple HTML web spider with minimal dependencies. It is possible to search for pages with a text on them or for the text itself, extract images and save pages that satisfy the criteria along the way.
 ## Configuration
@@ -10,6 +10,7 @@ The flow of work fully depends on the configuration file. By default `conf.json`
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them.
+The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains`. If all initial pages happen to be blacklisted, the program will end.
 ### Search query
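
To illustrate the renamed keys, here is a short Go sketch (not part of this commit; the URLs are invented) that fills in the two fields introduced in src/config/config.go below and prints the resulting conf.json, whose key names come from the new struct tags:

// Illustrative sketch only: assumes the module path introduced by this commit
// (unbewohnte/wecr) and uses invented example URLs.
package main

import (
	"encoding/json"
	"fmt"

	"unbewohnte/wecr/config"
)

func main() {
	conf := config.Default()
	// start crawling from one page and skip everything on the ads domain;
	// blacklist entries are later reduced to their host part in main.go,
	// so a full URL with a scheme is expected here
	conf.InitialPages = []string{"https://example.org/"}
	conf.BlacklistedDomains = []string{"https://ads.example.org/"}

	// the struct tags turn these into "initial_pages" and "blacklisted_domains"
	out, err := json.MarshalIndent(conf, "", "    ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}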

src/config/config.go (24 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify
@@ -52,13 +52,14 @@ type Logging struct {
 }
 type Conf struct {
-Search Search `json:"search"`
-Requests Requests `json:"requests"`
-Depth uint `json:"depth"`
-Workers uint `json:"workers"`
-InitialDomains []string `json:"initial_domains"`
-Save Save `json:"save"`
-Logging Logging `json:"logging"`
+Search Search `json:"search"`
+Requests Requests `json:"requests"`
+Depth uint `json:"depth"`
+Workers uint `json:"workers"`
+InitialPages []string `json:"initial_pages"`
+BlacklistedDomains []string `json:"blacklisted_domains"`
+Save Save `json:"save"`
+Logging Logging `json:"logging"`
 }
 func Default() *Conf {
@@ -77,9 +78,10 @@ func Default() *Conf {
 WaitTimeoutMs: 1500,
 RequestPauseMs: 100,
 },
-InitialDomains: []string{""},
-Depth: 5,
-Workers: 20,
+InitialPages: []string{""},
+Depth: 5,
+Workers: 20,
+BlacklistedDomains: []string{""},
 Logging: Logging{
 OutputLogs: true,
 LogsFile: "logs.log",

src/go.mod (2 changes)

@@ -1,4 +1,4 @@
-module unbewohnte/websurf
+module unbewohnte/wecr
 go 1.18

src/logger/logger.go (2 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify

src/main.go (40 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify
@@ -24,14 +24,15 @@ import (
 "fmt"
 "io"
 "log"
+"net/url"
 "os"
 "os/signal"
 "path/filepath"
 "time"
-"unbewohnte/websurf/config"
-"unbewohnte/websurf/logger"
-"unbewohnte/websurf/web"
-"unbewohnte/websurf/worker"
+"unbewohnte/wecr/config"
+"unbewohnte/wecr/logger"
+"unbewohnte/wecr/web"
+"unbewohnte/wecr/worker"
 )
 const (
@@ -78,7 +79,7 @@ func init() {
 if *version {
 fmt.Printf(
-"webscrape - scrape the web\n(c) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n\n",
+"Webcrawl - crawl the web for data\n(c) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n\n",
 )
 os.Exit(0)
 }
@@ -152,14 +153,24 @@ func main() {
 }
 // sanitize and correct inputs
-if len(conf.InitialDomains) == 0 {
-logger.Warning("No initial domain URLs have been set")
+if len(conf.InitialPages) == 0 {
+logger.Warning("No initial page URLs have been set")
 return
-} else if len(conf.InitialDomains) != 0 && conf.InitialDomains[0] == "" {
-logger.Warning("No initial domain URLs have been set")
+} else if len(conf.InitialPages) != 0 && conf.InitialPages[0] == "" {
+logger.Warning("No initial page URLs have been set")
 return
 }
+for index, blacklistedDomain := range conf.BlacklistedDomains {
+parsedURL, err := url.Parse(blacklistedDomain)
+if err != nil {
+logger.Warning("Failed to parse blacklisted %s: %s", blacklistedDomain, err)
+continue
+}
+conf.BlacklistedDomains[index] = parsedURL.Host
+}
 if conf.Depth <= 0 {
 conf.Depth = 1
 logger.Warning("Depth is <= 0. Set to %d", conf.Depth)
@@ -215,9 +226,9 @@ func main() {
 results := make(chan web.Result, conf.Workers*5)
 // create initial jobs
-for _, initialDomain := range conf.InitialDomains {
+for _, initialPage := range conf.InitialPages {
 jobs <- web.Job{
-URL: initialDomain,
+URL: initialPage,
 Search: conf.Search,
 Depth: conf.Depth,
 }
@@ -225,8 +236,9 @@ func main() {
 // form a worker pool
 workerPool := worker.NewWorkerPool(jobs, results, conf.Workers, worker.WorkerConf{
-Requests: conf.Requests,
-Save: conf.Save,
+Requests: conf.Requests,
+Save: conf.Save,
+BlacklistedDomains: conf.BlacklistedDomains,
 })
 logger.Info("Created a worker pool with %d workers", conf.Workers)

src/web/images.go (4 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify
@@ -22,7 +22,7 @@ import (
 "bytes"
 "net/url"
 "strings"
-"unbewohnte/websurf/logger"
+"unbewohnte/wecr/logger"
 "golang.org/x/net/html"
 )

src/web/job.go (4 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify
@@ -18,7 +18,7 @@
 package web
-import "unbewohnte/websurf/config"
+import "unbewohnte/wecr/config"
 type Job struct {
 URL string

src/web/requests.go (2 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify

src/web/result.go (4 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify
@@ -18,7 +18,7 @@
 package web
-import "unbewohnte/websurf/config"
+import "unbewohnte/wecr/config"
 type Result struct {
 PageURL string

src/web/text.go (2 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify

src/worker/pool.go (4 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify
@@ -21,7 +21,7 @@ package worker
 import (
 "sync"
 "time"
-"unbewohnte/websurf/web"
+"unbewohnte/wecr/web"
 )
 type visited struct {

src/worker/worker.go (40 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify
@@ -28,14 +28,15 @@ import (
 "path/filepath"
 "regexp"
 "time"
-"unbewohnte/websurf/config"
-"unbewohnte/websurf/logger"
-"unbewohnte/websurf/web"
+"unbewohnte/wecr/config"
+"unbewohnte/wecr/logger"
+"unbewohnte/wecr/web"
 )
 type WorkerConf struct {
-Requests config.Requests
-Save config.Save
+Requests config.Requests
+Save config.Save
+BlacklistedDomains []string
 }
 type Worker struct {
@@ -70,14 +71,31 @@ func (w *Worker) Work() {
 return
 }
+// see if the domain is not blacklisted
+var skip bool = false
+parsedURL, err := url.Parse(job.URL)
+if err != nil {
+logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
+continue
+}
+for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
+if parsedURL.Hostname() == blacklistedDomain {
+skip = true
+logger.Info("Skipping blacklisted %s", job.URL)
+break
+}
+}
+if skip {
+continue
+}
 // check if it is the first occurence
 w.visited.Lock.Lock()
-var skip bool = false
 for _, visitedURL := range w.visited.URLs {
 if job.URL == visitedURL {
 // okay, don't even bother. Move onto the next job
 skip = true
-logger.Info("Skipping %s", job.URL)
+logger.Info("Skipping visited %s", job.URL)
 w.visited.Lock.Unlock()
 break
 }
@@ -136,12 +154,6 @@ func (w *Worker) Work() {
 case config.QueryImages:
 // find image URLs, output data to the file
-parsedURL, err := url.Parse(job.URL)
-if err != nil {
-logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
-continue
-}
 imageLinks := web.FindPageImages(pageData, parsedURL.Host)
 for count, imageLink := range imageLinks {
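
One subtlety in the blacklist handling: src/main.go stores URL.Host, which keeps an explicit port, while the worker compares URL.Hostname(), which strips it, so a blacklist entry carrying a port would never match. A minimal sketch of the difference, using an invented URL:

// Illustrative sketch only; the URL is invented.
package main

import (
	"fmt"
	"net/url"
)

func main() {
	u, err := url.Parse("https://ads.example.org:8080/banner")
	if err != nil {
		panic(err)
	}
	fmt.Println(u.Host)       // "ads.example.org:8080" - what main.go stores in the blacklist
	fmt.Println(u.Hostname()) // "ads.example.org"      - what the worker compares against
}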
