diff --git a/.gitignore b/.gitignore
index 7dcc942..8f7efde 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,5 @@ conf.json
 logs.log
 output.json
 websurf
-conf_mega_ita.json
\ No newline at end of file
+conf_mega_ita.json
+wecr
\ No newline at end of file
diff --git a/Makefile b/Makefile
index a16aaa1..67ed9f8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 SRCDIR:=src
-EXE:=websurf
+EXE:=wecr
 TESTDIR:=testing
 
 all:
diff --git a/README.md b/README.md
index f2ac480..658df28 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
-# Websurf
+# Wecr - simple web crawler
 
 ## Overview
 
-Just a simple HTML web spider with minimal dependencies. It is possible to search for pages with a text on them or for the text itself, extract images and save pages that satisfy the criteria along the way.
+Just a simple HTML web spider with minimal dependencies. It is possible to search for pages that contain certain text or for the text itself, to extract images, and to save the pages that satisfy the criteria along the way.
 
 ## Configuration
 
@@ -10,6 +10,7 @@ The flow of work fully depends on the configuration file. By default `conf.json`
 
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string), each of which contains tweakable parameters. There are global ones as well, such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory, so no attribute-by-attribute explanation is needed for most of them.
+The parsing starts from `initial_pages` and goes deeper, ignoring pages on domains that are listed in `blacklisted_domains`. If all initial pages happen to be blacklisted, the program exits.
 
 ### Search query
 
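To make the two new keys concrete, a minimal `conf.json` fragment could look like the following; the URLs are placeholders, and the `requests`, `save`, `logging` and `search` branches are omitted for brevity (only the key names and the `workers`/`depth` defaults come from this diff):

```json
{
    "workers": 20,
    "depth": 5,
    "initial_pages": ["https://example.org/"],
    "blacklisted_domains": ["ads.example.org"]
}
```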
diff --git a/src/config/config.go b/src/config/config.go
index db82be4..aed0e11 100644
--- a/src/config/config.go
+++ b/src/config/config.go
@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 	This program is free software: you can redistribute it and/or modify
@@ -52,13 +52,14 @@ type Logging struct {
 }
 
 type Conf struct {
-	Search         Search   `json:"search"`
-	Requests       Requests `json:"requests"`
-	Depth          uint     `json:"depth"`
-	Workers        uint     `json:"workers"`
-	InitialDomains []string `json:"initial_domains"`
-	Save           Save     `json:"save"`
-	Logging        Logging  `json:"logging"`
+	Search             Search   `json:"search"`
+	Requests           Requests `json:"requests"`
+	Depth              uint     `json:"depth"`
+	Workers            uint     `json:"workers"`
+	InitialPages       []string `json:"initial_pages"`
+	BlacklistedDomains []string `json:"blacklisted_domains"`
+	Save               Save     `json:"save"`
+	Logging            Logging  `json:"logging"`
 }
 
 func Default() *Conf {
@@ -77,9 +78,10 @@ func Default() *Conf {
 			WaitTimeoutMs:  1500,
 			RequestPauseMs: 100,
 		},
-		InitialDomains: []string{""},
-		Depth:          5,
-		Workers:        20,
+		InitialPages:       []string{""},
+		Depth:              5,
+		Workers:            20,
+		BlacklistedDomains: []string{""},
 		Logging: Logging{
 			OutputLogs: true,
 			LogsFile:   "logs.log",
diff --git a/src/go.mod b/src/go.mod
index 277fca7..dc03b88 100644
--- a/src/go.mod
+++ b/src/go.mod
@@ -1,4 +1,4 @@
-module unbewohnte/websurf
+module unbewohnte/wecr
 
 go 1.18
 
diff --git a/src/logger/logger.go b/src/logger/logger.go
index ab13bbb..3ba1e83 100644
--- a/src/logger/logger.go
+++ b/src/logger/logger.go
@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 	This program is free software: you can redistribute it and/or modify
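The renamed `InitialPages` field and the new `BlacklistedDomains` field of `Conf` above are wired to the `initial_pages` and `blacklisted_domains` JSON keys through struct tags. As a self-contained sketch of how they decode, using a trimmed-down mirror of the struct rather than the real `config` package:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// conf mirrors only the renamed and the newly added fields of
// config.Conf; the real struct carries more branches
type conf struct {
	InitialPages       []string `json:"initial_pages"`
	BlacklistedDomains []string `json:"blacklisted_domains"`
}

func main() {
	raw := []byte(`{
		"initial_pages": ["https://example.org/"],
		"blacklisted_domains": ["ads.example.org"]
	}`)

	var c conf
	if err := json.Unmarshal(raw, &c); err != nil {
		panic(err)
	}
	fmt.Println(c.InitialPages, c.BlacklistedDomains)
	// Output: [https://example.org/] [ads.example.org]
}
```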
diff --git a/src/main.go b/src/main.go
index f9056f5..b3ca9b7 100644
--- a/src/main.go
+++ b/src/main.go
@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 	This program is free software: you can redistribute it and/or modify
@@ -24,14 +24,15 @@ import (
 	"fmt"
 	"io"
 	"log"
+	"net/url"
 	"os"
 	"os/signal"
 	"path/filepath"
 	"time"
-	"unbewohnte/websurf/config"
-	"unbewohnte/websurf/logger"
-	"unbewohnte/websurf/web"
-	"unbewohnte/websurf/worker"
+	"unbewohnte/wecr/config"
+	"unbewohnte/wecr/logger"
+	"unbewohnte/wecr/web"
+	"unbewohnte/wecr/worker"
 )
 
 const (
@@ -78,7 +79,7 @@ func init() {
 
 	if *version {
 		fmt.Printf(
-			"webscrape - scrape the web\n(c) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n\n",
+			"Wecr - crawl the web for data\n(c) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n\n",
 		)
 		os.Exit(0)
 	}
@@ -152,14 +153,24 @@ func main() {
 	}
 
 	// sanitize and correct inputs
-	if len(conf.InitialDomains) == 0 {
-		logger.Warning("No initial domain URLs have been set")
+	if len(conf.InitialPages) == 0 {
+		logger.Warning("No initial page URLs have been set")
 		return
-	} else if len(conf.InitialDomains) != 0 && conf.InitialDomains[0] == "" {
-		logger.Warning("No initial domain URLs have been set")
+	} else if len(conf.InitialPages) != 0 && conf.InitialPages[0] == "" {
+		logger.Warning("No initial page URLs have been set")
 		return
 	}
 
+	for index, blacklistedDomain := range conf.BlacklistedDomains {
+		parsedURL, err := url.Parse(blacklistedDomain)
+		if err != nil {
+			logger.Warning("Failed to parse blacklisted %s: %s", blacklistedDomain, err)
+			continue
+		}
+
+		conf.BlacklistedDomains[index] = parsedURL.Host
+	}
+
 	if conf.Depth <= 0 {
 		conf.Depth = 1
 		logger.Warning("Depth is <= 0. Set to %d", conf.Depth)
@@ -215,9 +226,9 @@ func main() {
 	results := make(chan web.Result, conf.Workers*5)
 
 	// create initial jobs
-	for _, initialDomain := range conf.InitialDomains {
+	for _, initialPage := range conf.InitialPages {
 		jobs <- web.Job{
-			URL:    initialDomain,
+			URL:    initialPage,
 			Search: conf.Search,
 			Depth:  conf.Depth,
 		}
@@ -225,8 +236,9 @@ func main() {
 	}
 
 	// form a worker pool
 	workerPool := worker.NewWorkerPool(jobs, results, conf.Workers, worker.WorkerConf{
-		Requests: conf.Requests,
-		Save:     conf.Save,
+		Requests:           conf.Requests,
+		Save:               conf.Save,
+		BlacklistedDomains: conf.BlacklistedDomains,
 	})
 	logger.Info("Created a worker pool with %d workers", conf.Workers)
diff --git a/src/web/images.go b/src/web/images.go
index 03e260b..88f2e8d 100644
--- a/src/web/images.go
+++ b/src/web/images.go
@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 	This program is free software: you can redistribute it and/or modify
@@ -22,7 +22,7 @@ import (
 	"bytes"
 	"net/url"
 	"strings"
-	"unbewohnte/websurf/logger"
+	"unbewohnte/wecr/logger"
 
 	"golang.org/x/net/html"
 )
diff --git a/src/web/job.go b/src/web/job.go
index 28edd1e..f98747d 100644
--- a/src/web/job.go
+++ b/src/web/job.go
@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 	This program is free software: you can redistribute it and/or modify
@@ -18,7 +18,7 @@
 
 package web
 
-import "unbewohnte/websurf/config"
+import "unbewohnte/wecr/config"
 
 type Job struct {
 	URL    string
diff --git a/src/web/requests.go b/src/web/requests.go
index 5d643ee..ffbd3d8 100644
--- a/src/web/requests.go
+++ b/src/web/requests.go
@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 	This program is free software: you can redistribute it and/or modify
diff --git a/src/web/result.go b/src/web/result.go
index f32e233..d92388f 100644
--- a/src/web/result.go
+++ b/src/web/result.go
@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 	This program is free software: you can redistribute it and/or modify
@@ -18,7 +18,7 @@
 
 package web
 
-import "unbewohnte/websurf/config"
+import "unbewohnte/wecr/config"
 
 type Result struct {
 	PageURL string
diff --git a/src/web/text.go b/src/web/text.go
index c1b15cc..2102902 100644
--- a/src/web/text.go
+++ b/src/web/text.go
@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 	This program is free software: you can redistribute it and/or modify
diff --git a/src/worker/pool.go b/src/worker/pool.go
index 0132605..9f23e48 100644
--- a/src/worker/pool.go
+++ b/src/worker/pool.go
@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 	This program is free software: you can redistribute it and/or modify
@@ -21,7 +21,7 @@ package worker
 import (
 	"sync"
 	"time"
-	"unbewohnte/websurf/web"
+	"unbewohnte/wecr/web"
 )
 
 type visited struct {
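One subtlety in the `main.go` sanitization loop above: `url.Parse` only populates `Host` when the input carries a scheme. A bare entry like `ads.example.org` is parsed as a path, so the loop would normalize it to an empty string, while `https://ads.example.org` yields the expected hostname. A standalone check of that behavior (illustrative, not part of this change):

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	// with a scheme, Host carries the domain as the loop expects
	withScheme, _ := url.Parse("https://ads.example.org/banner")
	fmt.Printf("%q\n", withScheme.Host) // "ads.example.org"

	// without a scheme the whole string is treated as a path,
	// so Host comes back empty
	bare, _ := url.Parse("ads.example.org")
	fmt.Printf("%q\n", bare.Host) // ""
}
```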
diff --git a/src/worker/worker.go b/src/worker/worker.go
index db22efc..c000b43 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 
 	This program is free software: you can redistribute it and/or modify
@@ -28,14 +28,15 @@ import (
 	"path/filepath"
 	"regexp"
 	"time"
-	"unbewohnte/websurf/config"
-	"unbewohnte/websurf/logger"
-	"unbewohnte/websurf/web"
+	"unbewohnte/wecr/config"
+	"unbewohnte/wecr/logger"
+	"unbewohnte/wecr/web"
 )
 
 type WorkerConf struct {
-	Requests config.Requests
-	Save     config.Save
+	Requests           config.Requests
+	Save               config.Save
+	BlacklistedDomains []string
 }
 
 type Worker struct {
@@ -70,14 +71,31 @@ func (w *Worker) Work() {
 			return
 		}
+		// skip the job if its domain is blacklisted
+		var skip bool = false
+		parsedURL, err := url.Parse(job.URL)
+		if err != nil {
+			logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
+			continue
+		}
+		for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
+			if parsedURL.Hostname() == blacklistedDomain {
+				skip = true
+				logger.Info("Skipping blacklisted %s", job.URL)
+				break
+			}
+		}
+		if skip {
+			continue
+		}
+
 		// check if it is the first occurence
 		w.visited.Lock.Lock()
-		var skip bool = false
 		for _, visitedURL := range w.visited.URLs {
 			if job.URL == visitedURL {
 				// okay, don't even bother. Move onto the next job
 				skip = true
-				logger.Info("Skipping %s", job.URL)
+				logger.Info("Skipping visited %s", job.URL)
 				w.visited.Lock.Unlock()
 				break
 			}
 		}
@@ -136,12 +154,6 @@ func (w *Worker) Work() {
 
 		case config.QueryImages:
 			// find image URLs, output data to the file
-			parsedURL, err := url.Parse(job.URL)
-			if err != nil {
-				logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
-				continue
-			}
-
 			imageLinks := web.FindPageImages(pageData, parsedURL.Host)
 
 			for count, imageLink := range imageLinks {
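Factored out of `Work()`, the new worker-side check is an exact comparison of the job URL's hostname against the configured list. A minimal sketch of the equivalent logic (the helper is hypothetical; the worker inlines this in its job loop):

```go
package main

import (
	"fmt"
	"net/url"
)

// isBlacklisted reports whether rawURL's hostname exactly matches
// one of the blacklisted domains
func isBlacklisted(rawURL string, blacklisted []string) bool {
	parsed, err := url.Parse(rawURL)
	if err != nil {
		// the worker logs such URLs and moves on to the next job
		return false
	}
	for _, domain := range blacklisted {
		if parsed.Hostname() == domain {
			return true
		}
	}
	return false
}

func main() {
	blacklist := []string{"ads.example.org"}
	fmt.Println(isBlacklisted("https://ads.example.org/x", blacklist))    // true
	fmt.Println(isBlacklisted("https://sub.ads.example.org/", blacklist)) // false
}
```

Note that the match is exact: blacklisting `ads.example.org` does not cover `sub.ads.example.org`, and since `Hostname()` strips any port while the `.Host` normalization in `main.go` keeps one, entries written with a port would never match.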