Fixed relative links not being resolved

Branch: master · Commit fc7f9f3c70
5 changed files:

  src/config/config.go  (2 changes)
  src/main.go           (18 changes)
  src/web/requests.go   (4 changes)
  src/web/text.go       (25 changes)
  src/worker/worker.go  (21 changes)

src/config/config.go (2 changes)

@@ -57,6 +57,7 @@ type Conf struct {
 	Depth              uint     `json:"depth"`
 	Workers            uint     `json:"workers"`
 	InitialPages       []string `json:"initial_pages"`
+	AllowedDomains     []string `json:"allowed_domains"`
 	BlacklistedDomains []string `json:"blacklisted_domains"`
 	Save               Save     `json:"save"`
 	Logging            Logging  `json:"logging"`
@@ -81,6 +82,7 @@ func Default() *Conf {
 		InitialPages:       []string{""},
 		Depth:              5,
 		Workers:            20,
+		AllowedDomains:     []string{""},
 		BlacklistedDomains: []string{""},
 		Logging: Logging{
 			OutputLogs: true,
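For reference, the new field slots into conf.json like this (a partial, hypothetical example built from the json tags above; the values are illustrative, not from the commit):

    {
        "depth": 5,
        "workers": 20,
        "initial_pages": ["https://example.org"],
        "allowed_domains": ["example.org"],
        "blacklisted_domains": []
    }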

src/main.go (18 changes)

@@ -35,7 +35,7 @@ import (
 	"unbewohnte/wecr/worker"
 )
 
-const version = "v0.1.1"
+const version = "v0.1.2"
 
 const (
 	defaultConfigFile string = "conf.json"
@@ -157,10 +157,10 @@ func main() {
 	// sanitize and correct inputs
 	if len(conf.InitialPages) == 0 {
-		logger.Warning("No initial page URLs have been set")
+		logger.Error("No initial page URLs have been set")
 		return
 	} else if len(conf.InitialPages) != 0 && conf.InitialPages[0] == "" {
-		logger.Warning("No initial page URLs have been set")
+		logger.Error("No initial page URLs have been set")
 		return
 	}
@@ -171,7 +171,17 @@ func main() {
 			continue
 		}
-		conf.BlacklistedDomains[index] = parsedURL.Host
+		conf.BlacklistedDomains[index] = parsedURL.Hostname()
 	}
 
+	for index, allowedDomain := range conf.AllowedDomains {
+		parsedURL, err := url.Parse(allowedDomain)
+		if err != nil {
+			logger.Warning("Failed to parse allowed %s: %s", allowedDomain, err)
+			continue
+		}
+		conf.AllowedDomains[index] = parsedURL.Hostname()
+	}
+
 	if conf.Depth <= 0 {
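The switch from Host to Hostname() matters whenever a URL carries a port: Host keeps it, Hostname() strips it. A quick standalone sketch (not part of the commit):

    package main

    import (
        "fmt"
        "net/url"
    )

    func main() {
        // error ignored for brevity in this illustration
        u, _ := url.Parse("http://example.org:8080/page")
        fmt.Println(u.Host)       // example.org:8080 (port included)
        fmt.Println(u.Hostname()) // example.org (port stripped)
    }

Normalizing the configured domains with Hostname() keeps them comparable against the hostnames extracted from crawled URLs.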

src/web/requests.go (4 changes)

@@ -31,9 +31,7 @@ func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
 	}
 	req.Header.Set("User-Agent", userAgent)
 
-	if timeOutMs != 0 {
-		http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
-	}
+	http.DefaultClient.Timeout = time.Duration(timeOutMs * uint64(time.Millisecond))
 
 	response, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return nil, err
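Note that setting http.DefaultClient.Timeout mutates a client shared by the entire process. A dedicated client keeps the timeout local to the crawler; a minimal sketch of such a variant (hypothetical, not part of the commit):

    package web

    import (
        "io"
        "net/http"
        "time"
    )

    // getPageWithClient is a hypothetical variant of GetPage that uses its
    // own client instead of mutating the shared http.DefaultClient.
    func getPageWithClient(pageURL, userAgent string, timeOutMs uint64) ([]byte, error) {
        req, err := http.NewRequest(http.MethodGet, pageURL, nil)
        if err != nil {
            return nil, err
        }
        req.Header.Set("User-Agent", userAgent)

        client := &http.Client{
            Timeout: time.Duration(timeOutMs) * time.Millisecond,
        }

        response, err := client.Do(req)
        if err != nil {
            return nil, err
        }
        defer response.Body.Close()

        return io.ReadAll(response.Body)
    }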

src/web/text.go (25 changes)

@@ -21,13 +21,14 @@ package web
 import (
 	"bufio"
 	"bytes"
+	"fmt"
 	"regexp"
 	"strings"
 
 	"golang.org/x/net/html"
 )
 
-func FindPageLinks(pageBody []byte) []string {
+func FindPageLinks(pageBody []byte, hostname string) []string {
 	var urls []string
 
 	tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
@@ -51,9 +52,27 @@ func FindPageLinks(pageBody []byte) []string {
 				continue
 			}
 
-			if strings.HasPrefix(attribute.Val, "http") {
-				urls = append(urls, attribute.Val)
+			var link string = attribute.Val
+			if !strings.Contains(link, hostname) {
+				// add hostname
+				if strings.HasPrefix(link, "/") && strings.HasSuffix(hostname, "/") {
+					link = fmt.Sprintf("%s%s", hostname, link[1:])
+				} else if !strings.HasPrefix(link, "/") && !strings.HasSuffix(hostname, "/") {
+					link = fmt.Sprintf("%s/%s", hostname, link)
+				} else {
+					link = fmt.Sprintf("%s%s", hostname, link)
+				}
 			}
+
+			link = strings.TrimPrefix(link, "//")
+			if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
+				// add scheme
+				link = "http://" + link
+			}
+
+			urls = append(urls, link)
 		}
 	}
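To see what the new normalization produces, here is a standalone copy of the same logic with sample inputs (a hypothetical helper for illustration only, not part of the commit):

    package main

    import (
        "fmt"
        "strings"
    )

    // normalize mirrors the link-resolution logic added to FindPageLinks above.
    func normalize(link, hostname string) string {
        if !strings.Contains(link, hostname) {
            // add hostname, avoiding a doubled or missing slash
            if strings.HasPrefix(link, "/") && strings.HasSuffix(hostname, "/") {
                link = fmt.Sprintf("%s%s", hostname, link[1:])
            } else if !strings.HasPrefix(link, "/") && !strings.HasSuffix(hostname, "/") {
                link = fmt.Sprintf("%s/%s", hostname, link)
            } else {
                link = fmt.Sprintf("%s%s", hostname, link)
            }
        }
        link = strings.TrimPrefix(link, "//")
        if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
            // add scheme
            link = "http://" + link
        }
        return link
    }

    func main() {
        fmt.Println(normalize("/about", "example.org"))                // http://example.org/about
        fmt.Println(normalize("page.html", "example.org"))             // http://example.org/page.html
        fmt.Println(normalize("https://example.org/x", "example.org")) // unchanged
    }

This is a string-based approach; the standard library alternative would be url.Parse plus (*url.URL).ResolveReference, which also handles query strings and "../" segments.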

src/worker/worker.go (21 changes)

@@ -37,6 +37,7 @@ type WorkerConf struct {
 	Requests           config.Requests
 	Save               config.Save
 	BlacklistedDomains []string
+	AllowedDomains     []string
 }
 
 type Worker struct {
@@ -127,20 +128,34 @@ func (w *Worker) Work() {
 			return
 		}
 
-		// see if the domain is not blacklisted
+		// see if the domain is allowed and is not blacklisted
 		var skip bool = false
 		parsedURL, err := url.Parse(job.URL)
 		if err != nil {
 			logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
 			continue
 		}
 
+		for _, allowedDomain := range w.Conf.AllowedDomains {
+			if parsedURL.Hostname() != allowedDomain {
+				skip = true
+				logger.Info("Skipped non-allowed %s", job.URL)
+				break
+			}
+		}
+
 		for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
+			if skip {
+				break
+			}
+
 			if parsedURL.Hostname() == blacklistedDomain {
 				skip = true
-				logger.Info("Skipping blacklisted %s", job.URL)
+				logger.Info("Skipped blacklisted %s", job.URL)
 				break
 			}
 		}
 
 		if skip {
 			continue
 		}
@@ -174,7 +189,7 @@ func (w *Worker) Work() {
 		}
 
 		// find links
-		pageLinks := web.FindPageLinks(pageData)
+		pageLinks := web.FindPageLinks(pageData, parsedURL.Host)
 
 		go func() {
 			if job.Depth > 1 {
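The allow-list loop above marks a job as skipped on the first entry that does not equal its hostname, which works for a single allowed domain; an allow list with several entries is more commonly handled as a membership test, sketched here as a hypothetical helper (not part of the commit):

    // isAllowed reports whether hostname appears in the allow list,
    // treating an empty list as "allow any domain". Unlike the loop above,
    // it only skips a URL when it matches none of the entries.
    func isAllowed(hostname string, allowedDomains []string) bool {
        if len(allowedDomains) == 0 {
            return true
        }
        for _, domain := range allowedDomains {
            if hostname == domain {
                return true
            }
        }
        return false
    }

Note also that the FindPageLinks call above passes parsedURL.Host, which keeps any port, while the config sanitization in main.go normalizes domains with Hostname().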
