
Fixed relative links not being resolved

Branch: master
Commit: fc7f9f3c70
Changed files (lines changed):
  1. src/config/config.go     2
  2. src/main.go             18
  3. src/web/requests.go      2
  4. src/web/text.go         25
  5. src/worker/worker.go    21

src/config/config.go (2 changed lines)

@@ -57,6 +57,7 @@ type Conf struct {
 	Depth              uint     `json:"depth"`
 	Workers            uint     `json:"workers"`
 	InitialPages       []string `json:"initial_pages"`
+	AllowedDomains     []string `json:"allowed_domains"`
 	BlacklistedDomains []string `json:"blacklisted_domains"`
 	Save               Save     `json:"save"`
 	Logging            Logging  `json:"logging"`

@@ -81,6 +82,7 @@ func Default() *Conf {
 		InitialPages:       []string{""},
 		Depth:              5,
 		Workers:            20,
+		AllowedDomains:     []string{""},
 		BlacklistedDomains: []string{""},
 		Logging: Logging{
 			OutputLogs: true,
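
With allowed_domains added to the Conf struct and to Default(), the option also becomes part of conf.json. Below is a rough, hypothetical sketch of how the new key serializes, using a throwaway stand-in for just the fields touched by this hunk (the real struct lives in src/config/config.go; the example domain is made up):

package main

import (
	"encoding/json"
	"fmt"
)

// Stand-in mirroring only the Conf fields shown in the diff above.
type conf struct {
	Depth              uint     `json:"depth"`
	Workers            uint     `json:"workers"`
	InitialPages       []string `json:"initial_pages"`
	AllowedDomains     []string `json:"allowed_domains"`
	BlacklistedDomains []string `json:"blacklisted_domains"`
}

func main() {
	c := conf{
		Depth:          5,
		Workers:        20,
		InitialPages:   []string{"http://example.org"},
		AllowedDomains: []string{"example.org"}, // new in this commit
	}
	out, err := json.MarshalIndent(c, "", "    ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out)) // shows how allowed_domains appears alongside the other keys
}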

src/main.go (18 changed lines)

@@ -35,7 +35,7 @@ import (
 	"unbewohnte/wecr/worker"
 )
 
-const version = "v0.1.1"
+const version = "v0.1.2"
 
 const (
 	defaultConfigFile string = "conf.json"

@@ -157,10 +157,10 @@ func main() {
 	// sanitize and correct inputs
 	if len(conf.InitialPages) == 0 {
-		logger.Warning("No initial page URLs have been set")
+		logger.Error("No initial page URLs have been set")
 		return
 	} else if len(conf.InitialPages) != 0 && conf.InitialPages[0] == "" {
-		logger.Warning("No initial page URLs have been set")
+		logger.Error("No initial page URLs have been set")
 		return
 	}

@@ -171,7 +171,17 @@ func main() {
 			continue
 		}
 
-		conf.BlacklistedDomains[index] = parsedURL.Host
+		conf.BlacklistedDomains[index] = parsedURL.Hostname()
+	}
+
+	for index, allowedDomain := range conf.AllowedDomains {
+		parsedURL, err := url.Parse(allowedDomain)
+		if err != nil {
+			logger.Warning("Failed to parse allowed %s: %s", allowedDomain, err)
+			continue
+		}
+
+		conf.AllowedDomains[index] = parsedURL.Hostname()
 	}
 
 	if conf.Depth <= 0 {
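
The sanitization loops now store parsedURL.Hostname() rather than parsedURL.Host. As a minimal illustration of why that matters (the URL below is a made-up example), Hostname() strips the port, so configured domains later compare cleanly against hostnames parsed from job URLs:

package main

import (
	"fmt"
	"net/url"
)

func main() {
	// Host keeps the port, Hostname() drops it.
	u, err := url.Parse("http://example.org:8080/path")
	if err != nil {
		panic(err)
	}
	fmt.Println(u.Host)       // example.org:8080
	fmt.Println(u.Hostname()) // example.org
}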

src/web/requests.go (2 changed lines)

@@ -31,9 +31,7 @@ func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
 	}
 	req.Header.Set("User-Agent", userAgent)
 
-	if timeOutMs != 0 {
-		http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
-	}
+	http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
 
 	response, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return nil, err
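
The timeout is now assigned unconditionally instead of only when timeOutMs is non-zero. A small sketch of the resulting behaviour (the value below is arbitrary); a zero timeOutMs yields a zero Timeout, which net/http documents as meaning no timeout:

package main

import (
	"fmt"
	"net/http"
	"time"
)

func main() {
	// Mirrors the now-unconditional assignment in GetPage above.
	var timeOutMs uint64 = 2500
	http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
	fmt.Println(http.DefaultClient.Timeout) // 2.5s; a value of 0 would mean no timeout
}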

src/web/text.go (25 changed lines)

@@ -21,13 +21,14 @@ package web
 import (
 	"bufio"
 	"bytes"
+	"fmt"
 	"regexp"
 	"strings"
 
 	"golang.org/x/net/html"
 )
 
-func FindPageLinks(pageBody []byte) []string {
+func FindPageLinks(pageBody []byte, hostname string) []string {
 	var urls []string
 
 	tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))

@@ -51,10 +52,28 @@ func FindPageLinks(pageBody []byte) []string {
 				continue
 			}
 
-			if strings.HasPrefix(attribute.Val, "http") {
-				urls = append(urls, attribute.Val)
+			var link string = attribute.Val
+
+			if !strings.Contains(link, hostname) {
+				// add hostname
+				if strings.HasPrefix(link, "/") && strings.HasSuffix(hostname, "/") {
+					link = fmt.Sprintf("%s%s", hostname, link[1:])
+				} else if !strings.HasPrefix(link, "/") && !strings.HasSuffix(hostname, "/") {
+					link = fmt.Sprintf("%s/%s", hostname, link)
+				} else {
+					link = fmt.Sprintf("%s%s", hostname, link)
+				}
 			}
+
+			link = strings.TrimPrefix(link, "//")
+
+			if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
+				// add scheme
+				link = "http://" + link
+			}
+
+			urls = append(urls, link)
 		}
 	}
 }
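
This block is the heart of the commit: hrefs that lack the page's hostname get it prepended, protocol-relative links are normalized, and a missing scheme defaults to http. The following self-contained sketch replays the same rules outside the tokenizer loop, against a made-up hostname and hrefs:

package main

import (
	"fmt"
	"strings"
)

// resolve applies the link-normalization rules added to FindPageLinks above.
func resolve(link, hostname string) string {
	if !strings.Contains(link, hostname) {
		// add hostname
		if strings.HasPrefix(link, "/") && strings.HasSuffix(hostname, "/") {
			link = fmt.Sprintf("%s%s", hostname, link[1:])
		} else if !strings.HasPrefix(link, "/") && !strings.HasSuffix(hostname, "/") {
			link = fmt.Sprintf("%s/%s", hostname, link)
		} else {
			link = fmt.Sprintf("%s%s", hostname, link)
		}
	}

	link = strings.TrimPrefix(link, "//")

	if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
		// add scheme
		link = "http://" + link
	}

	return link
}

func main() {
	fmt.Println(resolve("/about", "example.org"))                // http://example.org/about
	fmt.Println(resolve("docs/index.html", "example.org"))       // http://example.org/docs/index.html
	fmt.Println(resolve("https://example.org/x", "example.org")) // https://example.org/x (already absolute)
}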

src/worker/worker.go (21 changed lines)

@@ -37,6 +37,7 @@ type WorkerConf struct {
 	Requests           config.Requests
 	Save               config.Save
 	BlacklistedDomains []string
+	AllowedDomains     []string
 }
 
 type Worker struct {

@@ -127,20 +128,34 @@ func (w *Worker) Work() {
 			return
 		}
 
-		// see if the domain is not blacklisted
+		// see if the domain is allowed and is not blacklisted
 		var skip bool = false
 		parsedURL, err := url.Parse(job.URL)
 		if err != nil {
 			logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
 			continue
 		}
+
+		for _, allowedDomain := range w.Conf.AllowedDomains {
+			if parsedURL.Hostname() != allowedDomain {
+				skip = true
+				logger.Info("Skipped non-allowed %s", job.URL)
+				break
+			}
+		}
+
 		for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
+			if skip {
+				break
+			}
+
 			if parsedURL.Hostname() == blacklistedDomain {
 				skip = true
-				logger.Info("Skipping blacklisted %s", job.URL)
+				logger.Info("Skipped blacklisted %s", job.URL)
 				break
 			}
 		}
+
 		if skip {
 			continue
 		}

@@ -174,7 +189,7 @@ func (w *Worker) Work() {
 		}
 
 		// find links
-		pageLinks := web.FindPageLinks(pageData)
+		pageLinks := web.FindPageLinks(pageData, parsedURL.Host)
 
 		go func() {
 			if job.Depth > 1 {
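
For reference, the new allow/blacklist decision in Work() boils down to the check sketched below (domain lists and URLs are made up). As written in the diff, the allowed-domains loop marks a URL as skipped as soon as its hostname differs from any entry in AllowedDomains:

package main

import (
	"fmt"
	"net/url"
)

// skipURL mirrors the skip logic added to Work() above.
func skipURL(rawURL string, allowed, blacklisted []string) bool {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return true
	}

	for _, allowedDomain := range allowed {
		if parsedURL.Hostname() != allowedDomain {
			// skipped as non-allowed
			return true
		}
	}

	for _, blacklistedDomain := range blacklisted {
		if parsedURL.Hostname() == blacklistedDomain {
			// skipped as blacklisted
			return true
		}
	}

	return false
}

func main() {
	allowed := []string{"example.org"}
	fmt.Println(skipURL("http://example.org/page", allowed, nil)) // false: allowed, not blacklisted
	fmt.Println(skipURL("http://other.net/page", allowed, nil))   // true: hostname is not in the allowed list
}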
