From fc7f9f3c70c26c23d0ca2286fb661b12dfd83d01 Mon Sep 17 00:00:00 2001
From: Unbewohnte
Date: Tue, 27 Dec 2022 16:18:42 +0300
Subject: [PATCH] Fixed relative links not being resolved

---
 src/config/config.go |  2 ++
 src/main.go          | 18 ++++++++++++++----
 src/web/requests.go  |  4 +---
 src/web/text.go      | 25 ++++++++++++++++++++++---
 src/worker/worker.go | 21 ++++++++++++++++++---
 5 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/src/config/config.go b/src/config/config.go
index aed0e11..77f8a22 100644
--- a/src/config/config.go
+++ b/src/config/config.go
@@ -57,6 +57,7 @@ type Conf struct {
 	Depth uint `json:"depth"`
 	Workers uint `json:"workers"`
 	InitialPages []string `json:"initial_pages"`
+	AllowedDomains []string `json:"allowed_domains"`
 	BlacklistedDomains []string `json:"blacklisted_domains"`
 	Save Save `json:"save"`
 	Logging Logging `json:"logging"`
@@ -81,6 +82,7 @@ func Default() *Conf {
 		InitialPages: []string{""},
 		Depth: 5,
 		Workers: 20,
+		AllowedDomains: []string{""},
 		BlacklistedDomains: []string{""},
 		Logging: Logging{
 			OutputLogs: true,
diff --git a/src/main.go b/src/main.go
index e89169d..10073df 100644
--- a/src/main.go
+++ b/src/main.go
@@ -35,7 +35,7 @@ import (
 	"unbewohnte/wecr/worker"
 )
 
-const version = "v0.1.1"
+const version = "v0.1.2"
 
 const (
 	defaultConfigFile string = "conf.json"
@@ -157,10 +157,10 @@ func main() {
 
 	// sanitize and correct inputs
 	if len(conf.InitialPages) == 0 {
-		logger.Warning("No initial page URLs have been set")
+		logger.Error("No initial page URLs have been set")
 		return
 	} else if len(conf.InitialPages) != 0 && conf.InitialPages[0] == "" {
-		logger.Warning("No initial page URLs have been set")
+		logger.Error("No initial page URLs have been set")
 		return
 	}
 
@@ -171,7 +171,17 @@ func main() {
 			continue
 		}
 
-		conf.BlacklistedDomains[index] = parsedURL.Host
+		conf.BlacklistedDomains[index] = parsedURL.Hostname()
+	}
+
+	for index, allowedDomain := range conf.AllowedDomains {
+		parsedURL, err := url.Parse(allowedDomain)
+		if err != nil {
+			logger.Warning("Failed to parse allowed %s: %s", allowedDomain, err)
+			continue
+		}
+
+		conf.AllowedDomains[index] = parsedURL.Hostname()
 	}
 
 	if conf.Depth <= 0 {
diff --git a/src/web/requests.go b/src/web/requests.go
index 3c267a5..ffbd3d8 100644
--- a/src/web/requests.go
+++ b/src/web/requests.go
@@ -31,9 +31,7 @@ func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
 	}
 	req.Header.Set("User-Agent", userAgent)
 
-	if timeOutMs != 0 {
-		http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
-	}
+	http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
 
 	response, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return nil, err
diff --git a/src/web/text.go b/src/web/text.go
index 2102902..10b49c4 100644
--- a/src/web/text.go
+++ b/src/web/text.go
@@ -21,13 +21,14 @@ package web
 import (
 	"bufio"
 	"bytes"
+	"fmt"
 	"regexp"
 	"strings"
 
 	"golang.org/x/net/html"
 )
 
-func FindPageLinks(pageBody []byte) []string {
+func FindPageLinks(pageBody []byte, hostname string) []string {
 	var urls []string
 
 	tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
@@ -51,9 +52,27 @@ func FindPageLinks(pageBody []byte) []string {
 					continue
 				}
 
-				if strings.HasPrefix(attribute.Val, "http") {
-					urls = append(urls, attribute.Val)
+				var link string = attribute.Val
+
+				if !strings.Contains(link, hostname) {
+					// add hostname
+					if strings.HasPrefix(link, "/") && strings.HasSuffix(hostname, "/") {
+						link = fmt.Sprintf("%s%s", hostname, link[1:])
+					} else if !strings.HasPrefix(link, "/") && !strings.HasSuffix(hostname, "/") {
+						link = fmt.Sprintf("%s/%s", hostname, link)
+					} else {
+						link = fmt.Sprintf("%s%s", hostname, link)
+					}
 				}
+
+				link = strings.TrimPrefix(link, "//")
+
+				if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
+					// add scheme
+					link = "http://" + link
+				}
+
+				urls = append(urls, link)
 			}
 		}
 	}
diff --git a/src/worker/worker.go b/src/worker/worker.go
index e7cf3e8..a129f0c 100644
--- a/src/worker/worker.go
+++ b/src/worker/worker.go
@@ -37,6 +37,7 @@ type WorkerConf struct {
 	Requests config.Requests
 	Save config.Save
 	BlacklistedDomains []string
+	AllowedDomains []string
 }
 
 type Worker struct {
@@ -127,20 +128,34 @@ func (w *Worker) Work() {
 			return
 		}
 
-		// see if the domain is not blacklisted
+		// see if the domain is allowed and is not blacklisted
 		var skip bool = false
 		parsedURL, err := url.Parse(job.URL)
 		if err != nil {
 			logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
 			continue
 		}
+
+		for _, allowedDomain := range w.Conf.AllowedDomains {
+			if parsedURL.Hostname() != allowedDomain {
+				skip = true
+				logger.Info("Skipped non-allowed %s", job.URL)
+				break
+			}
+		}
+
 		for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
+			if skip {
+				break
+			}
+
 			if parsedURL.Hostname() == blacklistedDomain {
 				skip = true
-				logger.Info("Skipping blacklisted %s", job.URL)
+				logger.Info("Skipped blacklisted %s", job.URL)
 				break
 			}
 		}
+
 		if skip {
 			continue
 		}
@@ -174,7 +189,7 @@ func (w *Worker) Work() {
 		}
 
 		// find links
-		pageLinks := web.FindPageLinks(pageData)
+		pageLinks := web.FindPageLinks(pageData, parsedURL.Host)
 
 		go func() {
 			if job.Depth > 1 {
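
Note on the FindPageLinks change in src/web/text.go: the patch absolutizes href values by string manipulation rather than by URL parsing. Below is a minimal, self-contained sketch that pulls that hostname/scheme-prepending logic out into a standalone helper so it can be run in isolation. The name resolveLink and the example hostnames are illustrative only and are not part of the codebase; at the real call site, worker.go passes parsedURL.Host (which may still include a port) as the hostname argument.

package main

import (
	"fmt"
	"strings"
)

// resolveLink mirrors the logic the patch adds to FindPageLinks: links that
// do not mention the page's hostname get the hostname prepended, a leading
// "//" is trimmed, and a missing scheme defaults to "http://".
// The function name is hypothetical, used here only for illustration.
func resolveLink(link string, hostname string) string {
	if !strings.Contains(link, hostname) {
		if strings.HasPrefix(link, "/") && strings.HasSuffix(hostname, "/") {
			link = fmt.Sprintf("%s%s", hostname, link[1:])
		} else if !strings.HasPrefix(link, "/") && !strings.HasSuffix(hostname, "/") {
			link = fmt.Sprintf("%s/%s", hostname, link)
		} else {
			link = fmt.Sprintf("%s%s", hostname, link)
		}
	}

	link = strings.TrimPrefix(link, "//")

	if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
		link = "http://" + link
	}

	return link
}

func main() {
	fmt.Println(resolveLink("/about", "example.com"))                // http://example.com/about
	fmt.Println(resolveLink("docs/readme.html", "example.com"))      // http://example.com/docs/readme.html
	fmt.Println(resolveLink("https://example.com/x", "example.com")) // https://example.com/x (already absolute)
}

For fully general relative references (such as ../page.html or query-only links), Go's net/url package offers (*url.URL).ResolveReference, which resolves a reference against the complete page URL; the string-based approach sketched above only covers host and scheme prefixing.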
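On the domain filtering added to src/worker/worker.go: the worker now consults both an allow list (AllowedDomains) and a blacklist before fetching a job's URL. Below is a minimal, self-contained sketch of that kind of host filtering written as a membership check; the helper name hostAllowed and the sample domains are hypothetical, and treating an empty allow list as "no restriction" is an assumption of this sketch rather than something the patch specifies (the hunk above compares the hostname against each allowed entry in turn).

package main

import "fmt"

// hostAllowed reports whether a hostname passes an allow list and a blacklist.
// Sketch only, not code from the patch: an empty allow list is treated as
// "allow everything", and the allow check is a membership test.
func hostAllowed(host string, allowed []string, blacklisted []string) bool {
	for _, b := range blacklisted {
		if host == b {
			return false // explicitly blacklisted
		}
	}
	if len(allowed) == 0 {
		return true // no allow list configured
	}
	for _, a := range allowed {
		if host == a {
			return true
		}
	}
	return false // allow list present, host not on it
}

func main() {
	allowed := []string{"example.com"}
	blacklisted := []string{"ads.example.com"}

	fmt.Println(hostAllowed("example.com", allowed, blacklisted))     // true
	fmt.Println(hostAllowed("other.org", allowed, blacklisted))       // false
	fmt.Println(hostAllowed("ads.example.com", allowed, blacklisted)) // false
}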