diff --git a/src/config/config.go b/src/config/config.go
index 77f8a22..1f0eed3 100644
--- a/src/config/config.go
+++ b/src/config/config.go
@@ -51,6 +51,7 @@ type Logging struct {
LogsFile string `json:"logs_file"`
}
+// Conf describes the configuration file structure
type Conf struct {
Search Search `json:"search"`
Requests Requests `json:"requests"`
@@ -63,6 +64,7 @@ type Conf struct {
Logging Logging `json:"logging"`
}
+// Default returns the default configuration
func Default() *Conf {
return &Conf{
Search: Search{
@@ -91,6 +93,7 @@ func Default() *Conf {
}
}
+// WriteTo writes the current configuration to w
func (c *Conf) WriteTo(w io.Writer) error {
jsonData, err := json.MarshalIndent(c, "", " ")
if err != nil {
@@ -105,6 +108,7 @@ func (c *Conf) WriteTo(w io.Writer) error {
return nil
}
+// ReadFrom reads the configuration from r
func (c *Conf) ReadFrom(r io.Reader) error {
jsonData, err := io.ReadAll(r)
if err != nil {
@@ -119,6 +123,7 @@ func (c *Conf) ReadFrom(r io.Reader) error {
return nil
}
+// CreateConfigFile creates a configuration file at path
func CreateConfigFile(conf Conf, path string) error {
confFile, err := os.Create(path)
if err != nil {
@@ -134,6 +139,7 @@ func CreateConfigFile(conf Conf, path string) error {
return nil
}
+// OpenConfigFile tries to open the configuration file at path; if that fails, it returns the default configuration
func OpenConfigFile(path string) (*Conf, error) {
confFile, err := os.Open(path)
if err != nil {
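The four helpers documented above compose into a simple round trip. A minimal sketch, assuming only the signatures visible in this diff (the import path and the Logging.LogsFile field come from the hunks; the buffer usage is illustrative):

package main

import (
	"bytes"
	"fmt"

	"unbewohnte/wecr/config"
)

func main() {
	// serialize the default configuration into an in-memory buffer
	conf := config.Default()
	var buf bytes.Buffer
	if err := conf.WriteTo(&buf); err != nil {
		panic(err)
	}

	// read it back through the same API
	var restored config.Conf
	if err := restored.ReadFrom(&buf); err != nil {
		panic(err)
	}
	fmt.Println(restored.Logging.LogsFile)
}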
diff --git a/src/main.go b/src/main.go
index 10073df..9f294a5 100644
--- a/src/main.go
+++ b/src/main.go
@@ -35,7 +35,7 @@ import (
"unbewohnte/wecr/worker"
)
-const version = "v0.1.2"
+const version = "v0.1.3"
const (
defaultConfigFile string = "conf.json"
@@ -299,6 +299,7 @@ func main() {
break
}
+ // each entry in the output file is a self-standing JSON object
entryBytes, err := json.MarshalIndent(result, "", " ")
if err != nil {
continue
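Because each entry is a self-standing JSON object rather than an element of one big array, the output file is a stream of concatenated objects. json.Decoder consumes exactly that layout; a reading sketch (the file name and the mirrored field set are assumptions):

package main

import (
	"encoding/json"
	"fmt"
	"io"
	"os"
)

// entry mirrors web.Result; only PageURL is confirmed by this diff,
// and the Data shape matches the pageLinks case in worker.go
type entry struct {
	PageURL string
	Data    []string
}

func main() {
	f, err := os.Open("output.json") // hypothetical output file name
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// a json.Decoder consumes one self-standing object per Decode call,
	// matching the concatenated layout the crawler writes
	dec := json.NewDecoder(f)
	for {
		var e entry
		if err := dec.Decode(&e); err == io.EOF {
			break
		} else if err != nil {
			panic(err)
		}
		fmt.Println(e.PageURL)
	}
}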
diff --git a/src/web/images.go b/src/web/images.go
index 850f1cb..cd120dc 100644
--- a/src/web/images.go
+++ b/src/web/images.go
@@ -20,7 +20,7 @@ package web
import (
"bytes"
- "fmt"
+ "net/url"
"strings"
"golang.org/x/net/html"
@@ -31,9 +31,15 @@ func hasImageExtention(url string) bool {
".jpeg",
".jpg",
".jpe",
+ ".jfif",
".png",
".ppm",
".svg",
+ ".gif",
+ ".tiff",
+ ".bmp",
+ ".webp",
+ ".ico",
}
for _, extention := range extentions {
@@ -46,7 +52,7 @@ func hasImageExtention(url string) bool {
}
// Tries to find images' URLs on the page
-func FindPageImages(pageBody []byte, hostname string) []string {
+func FindPageImages(pageBody []byte, from *url.URL) []string {
var urls []string
tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
@@ -69,29 +75,21 @@ func FindPageImages(pageBody []byte, hostname string) []string {
continue
}
- var imageURL string = attribute.Val
-
- if !strings.Contains(imageURL, hostname) {
- // add hostname
- if strings.HasPrefix(imageURL, "/") && strings.HasSuffix(hostname, "/") {
- imageURL = fmt.Sprintf("%s%s", hostname, imageURL[1:])
- } else if !strings.HasPrefix(imageURL, "/") && !strings.HasSuffix(hostname, "/") {
- imageURL = fmt.Sprintf("%s/%s", hostname, imageURL)
- } else {
- imageURL = fmt.Sprintf("%s%s", hostname, imageURL)
- }
+ imageURL, err := url.Parse(attribute.Val)
+ if err != nil {
+ break
}
- imageURL = strings.TrimPrefix(imageURL, "//")
+ imageURLString := ResolveLink(imageURL, from.Host)
- if !strings.HasPrefix(imageURL, "http://") && !strings.HasPrefix(imageURL, "https://") {
- // add scheme
- imageURL = "http://" + imageURL
- }
-
- // check for image extention
- if hasImageExtention(imageURL) {
- urls = append(urls, imageURL)
+ if attribute.Key == "src" {
+ // <img> tag -> don't check
+ urls = append(urls, imageURLString)
+ } else {
+ // <a> tag -> check for image extension
+ if hasImageExtention(imageURLString) {
+ urls = append(urls, imageURLString)
+ }
}
}
}
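The net effect of this rewrite: src attributes are trusted as image URLs outright, other attributes must carry a recognized image extension, and every candidate is normalized through ResolveLink. A usage sketch of the new signature (the HTML snippet and URLs are made up; it assumes the surrounding tokenizer loop also inspects href attributes, which the else branch implies):

package main

import (
	"fmt"
	"net/url"

	"unbewohnte/wecr/web"
)

func main() {
	from, _ := url.Parse("http://example.com/gallery")
	page := []byte(`<img src="/static/logo"><a href="photos/cat.webp">cat</a>`)
	for _, u := range web.FindPageImages(page, from) {
		fmt.Println(u)
	}
	// /static/logo is kept without an extension check because it came from src;
	// photos/cat.webp is kept because of its .webp extension
}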
diff --git a/src/web/job.go b/src/web/job.go
index f98747d..4c66a7a 100644
--- a/src/web/job.go
+++ b/src/web/job.go
@@ -20,6 +20,7 @@ package web
import "unbewohnte/wecr/config"
+// Job is a unit of work to pass around workers
type Job struct {
URL string
Search config.Search
diff --git a/src/web/requests.go b/src/web/requests.go
index ffbd3d8..286a43f 100644
--- a/src/web/requests.go
+++ b/src/web/requests.go
@@ -24,6 +24,7 @@ import (
"time"
)
+// GetPage fetches the page at url, sending the given user agent and honoring the timeout in milliseconds
func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
req, err := http.NewRequest("GET", url, nil)
if err != nil {
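Only the opening lines of GetPage appear in this hunk. For context, a minimal sketch of how such a helper is typically completed (the body below is an assumption, not necessarily the project's exact code):

package web

import (
	"io"
	"net/http"
	"time"
)

// sketch of a complete GetPage; the hunk above shows only its first lines
func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", userAgent)

	// the timeout is enforced client-side; 0 would mean no timeout
	client := &http.Client{
		Timeout: time.Duration(timeOutMs) * time.Millisecond,
	}
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	return io.ReadAll(resp.Body)
}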
diff --git a/src/web/result.go b/src/web/result.go
index d92388f..43c63ad 100644
--- a/src/web/result.go
+++ b/src/web/result.go
@@ -20,6 +20,7 @@ package web
import "unbewohnte/wecr/config"
+// Result of page parsing
type Result struct {
PageURL string
Search config.Search
diff --git a/src/web/text.go b/src/web/text.go
index 10b49c4..e2b0659 100644
--- a/src/web/text.go
+++ b/src/web/text.go
@@ -21,14 +21,33 @@ package web
import (
"bufio"
"bytes"
- "fmt"
+ "net/url"
"regexp"
"strings"
"golang.org/x/net/html"
)
-func FindPageLinks(pageBody []byte, hostname string) []string {
+// ResolveLink fixes a relative link, constructing an absolute one; the URL is returned unchanged if it is already absolute
+func ResolveLink(url *url.URL, fromHost string) string {
+ if !url.IsAbs() {
+ if url.Scheme == "" {
+ // add scheme
+ url.Scheme = "http"
+ }
+
+ if url.Host == "" {
+ // add host
+ url.Host = fromHost
+ }
+ }
+
+ return url.String()
+}
+
+// FindPageLinks finds all links on the page that are specified in <a> tags
+func FindPageLinks(pageBody []byte, from *url.URL) []string {
var urls []string
tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
@@ -52,27 +71,12 @@ func FindPageLinks(pageBody []byte, hostname string) []string {
continue
}
- var link string = attribute.Val
-
- if !strings.Contains(link, hostname) {
- // add hostname
- if strings.HasPrefix(link, "/") && strings.HasSuffix(hostname, "/") {
- link = fmt.Sprintf("%s%s", hostname, link[1:])
- } else if !strings.HasPrefix(link, "/") && !strings.HasSuffix(hostname, "/") {
- link = fmt.Sprintf("%s/%s", hostname, link)
- } else {
- link = fmt.Sprintf("%s%s", hostname, link)
- }
- }
-
- link = strings.TrimPrefix(link, "//")
-
- if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
- // add scheme
- link = "http://" + link
+ link, err := url.Parse(attribute.Val)
+ if err != nil {
+ break
}
- urls = append(urls, link)
+ urls = append(urls, ResolveLink(link, from.Host))
}
}
}
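ResolveLink only fills in a missing scheme and host; it does not resolve a relative path against the page's own path, and the fallback scheme is always http. Its behavior on a few representative inputs (an illustrative sketch):

package main

import (
	"fmt"
	"net/url"

	"unbewohnte/wecr/web"
)

func main() {
	for _, raw := range []string{
		"https://other.org/a.png", // already absolute: returned unchanged
		"//cdn.example.com/b.js",  // protocol-relative: only the scheme is filled in
		"/about",                  // host-relative: scheme and host are filled in
	} {
		u, err := url.Parse(raw)
		if err != nil {
			continue
		}
		fmt.Println(web.ResolveLink(u, "example.com"))
	}
	// Output:
	// https://other.org/a.png
	// http://cdn.example.com/b.js
	// http://example.com/about
}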
diff --git a/src/worker/pool.go b/src/worker/pool.go
index 9f23e48..9e83347 100644
--- a/src/worker/pool.go
+++ b/src/worker/pool.go
@@ -24,17 +24,20 @@ import (
"unbewohnte/wecr/web"
)
+// Already visited URLs
type visited struct {
URLs []string
Lock sync.Mutex
}
+// Statistics of the whole worker pool
type Statistics struct {
PagesVisited uint64
MatchesFound uint64
StartTime time.Time
}
+// Pool of web workers
type Pool struct {
workersCount uint
workers []*Worker
@@ -42,6 +45,7 @@ type Pool struct {
Stats Statistics
}
+// NewWorkerPool creates a new worker pool
func NewWorkerPool(jobs chan web.Job, results chan web.Result, workerCount uint, workerConf WorkerConf) *Pool {
var newPool Pool = Pool{
workersCount: workerCount,
@@ -66,6 +70,7 @@ func NewWorkerPool(jobs chan web.Job, results chan web.Result, workerCount uint,
return &newPool
}
+// Work notifies all workers in the pool to start scraping
func (p *Pool) Work() {
p.Stats.StartTime = time.Now()
@@ -75,6 +80,7 @@ func (p *Pool) Work() {
}
}
+// Stop notifies all workers in the pool to stop scraping
func (p *Pool) Stop() {
for _, worker := range p.workers {
worker.Stopped = true
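Wiring the pool together, as main.go presumably does (channel capacities, the worker count, and the WorkerConf literal are illustrative; whether Work blocks is not visible in this hunk, so it is launched in a goroutine here):

package main

import (
	"unbewohnte/wecr/web"
	"unbewohnte/wecr/worker"
)

func main() {
	jobs := make(chan web.Job, 100)
	results := make(chan web.Result, 100)

	pool := worker.NewWorkerPool(jobs, results, 20, worker.WorkerConf{
		// Requests, Save, AllowedDomains... would come from the parsed Conf
	})

	go pool.Work()
	jobs <- web.Job{URL: "http://example.com", Depth: 1}
	<-results // consume results here
	pool.Stop()
}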
diff --git a/src/worker/worker.go b/src/worker/worker.go
index 3e5961c..8014e7b 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -33,6 +33,7 @@ import (
"unbewohnte/wecr/web"
)
+// Worker configuration
type WorkerConf struct {
Requests config.Requests
Save config.Save
@@ -40,6 +41,7 @@ type WorkerConf struct {
AllowedDomains []string
}
+// Web worker
type Worker struct {
Jobs chan web.Job
Results chan web.Result
@@ -49,6 +51,7 @@ type Worker struct {
Stopped bool
}
+// NewWorker creates a new worker
func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visited *visited, stats *Statistics) Worker {
return Worker{
Jobs: jobs,
@@ -60,6 +63,7 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
}
}
+// savePage saves the page to disk under a name derived from its URL
func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String()))
@@ -71,9 +75,12 @@ func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
}
pageFile.Close()
+
+ logger.Info("Saved \"%s\"", pageName)
}
}
+// Work launches the scraping process on this worker
func (w *Worker) Work() {
if w.Stopped {
return
@@ -88,14 +95,14 @@ func (w *Worker) Work() {
// see if the domain is allowed and is not blacklisted
var skip bool = false
- parsedURL, err := url.Parse(job.URL)
+ pageURL, err := url.Parse(job.URL)
if err != nil {
logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
continue
}
for _, allowedDomain := range w.Conf.AllowedDomains {
- if parsedURL.Hostname() != allowedDomain {
+ if pageURL.Hostname() != allowedDomain {
skip = true
logger.Info("Skipped non-allowed %s", job.URL)
break
@@ -107,7 +114,7 @@ func (w *Worker) Work() {
break
}
- if parsedURL.Hostname() == blacklistedDomain {
+ if pageURL.Hostname() == blacklistedDomain {
skip = true
logger.Info("Skipped blacklisted %s", job.URL)
break
@@ -129,6 +136,7 @@ func (w *Worker) Work() {
break
}
}
+
if skip {
continue
}
@@ -147,7 +155,7 @@ func (w *Worker) Work() {
}
// find links
- pageLinks := web.FindPageLinks(pageData, parsedURL.Host)
+ pageLinks := web.FindPageLinks(pageData, pageURL)
go func() {
if job.Depth > 1 {
@@ -178,31 +186,33 @@ func (w *Worker) Work() {
Search: job.Search,
Data: pageLinks,
}
+ w.stats.MatchesFound += uint64(len(pageLinks))
savePage = true
}
case config.QueryImages:
// find image URLs, output images to the file while not saving already outputted ones
- imageLinks := web.FindPageImages(pageData, parsedURL.Host)
+ imageLinks := web.FindPageImages(pageData, pageURL)
var alreadyProcessedImgUrls []string
for count, imageLink := range imageLinks {
// check if this URL has been processed already
var skipImage bool = false
+
for _, processedURL := range alreadyProcessedImgUrls {
if imageLink == processedURL {
skipImage = true
break
}
}
+
if skipImage {
skipImage = false
continue
- } else {
- alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
}
+ alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
- var imageName string = fmt.Sprintf("%s_%d_%s", parsedURL.Host, count, path.Base(imageLink))
+ var imageName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(imageLink))
response, err := http.Get(imageLink)
if err != nil {
@@ -266,14 +276,14 @@ func (w *Worker) Work() {
savePage = true
}
}
+ }
- // save page
- if savePage {
- w.savePage(parsedURL, pageData)
- }
-
- // sleep before the next request
- time.Sleep(time.Duration(w.Conf.Requests.RequestPauseMs * uint64(time.Millisecond)))
+ // save page
+ if savePage {
+ w.savePage(pageURL, pageData)
}
+
+ // sleep before the next request
+ time.Sleep(time.Duration(w.Conf.Requests.RequestPauseMs * uint64(time.Millisecond)))
}
}
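One note on the de-duplication loop rewritten in the images case above: alreadyProcessedImgUrls is scanned linearly for every image, which goes quadratic on image-heavy pages. A map-backed set is the usual alternative; a sketch of the same logic, dropping into the existing loop context with imageLinks and pageURL from the surrounding function (not the project's code):

	seen := make(map[string]struct{}, len(imageLinks))
	for count, imageLink := range imageLinks {
		if _, done := seen[imageLink]; done {
			continue // this URL has been processed already
		}
		seen[imageLink] = struct{}{}

		imageName := fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(imageLink))
		// ... fetch the image and write it out as in the original loop ...
		_ = imageName
	}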