
Fixed allowed_domains not being used

Branch: master · commit b84af90d08
Changed files:
1. README.md (2 changes)
2. src/logger/logger.go (5 changes)
3. src/main.go (39 changes)
4. src/web/images.go (2 changes)
5. src/worker/pool.go (1 change)
6. src/worker/worker.go (38 changes)

README.md (2 changes)

@@ -10,7 +10,7 @@ The flow of work fully depends on the configuration file. By default `conf.json`
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string), each of which contains tweakable parameters. There are global ones as well, such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory, so no attribute-by-attribute explanation is needed for most of them.
-The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains` or are NOT in `allowed_domains`. If all initial pages happen to be on blacklisted domains or are not in the allowed list, the program will get stuck.
+The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains` or are NOT in `allowed_domains`. If all initial pages happen to be on blacklisted domains or are not in the allowed list, the program will get stuck. It is important to note that `*_domains` should be specified with an existing scheme (ie: https://en.wikipedia.org). Subdomains and ports **matter**: `https://unbewohnte.su:3000/` and `https://unbewohnte.su/` are **different**.
 
 ### Search query
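
The new README note about schemes, subdomains and ports maps directly onto how Go's `net/url` reports hosts, and it is also why the commit switches from `Hostname()` to `Host` when normalising the domain lists. A small standalone sketch (not part of the commit) showing why `https://unbewohnte.su:3000/` and `https://unbewohnte.su/` count as different domains once compared by `Host`:

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	a, _ := url.Parse("https://unbewohnte.su:3000/")
	b, _ := url.Parse("https://unbewohnte.su/")

	// Host keeps the port, so the two URLs are treated as different domains;
	// Hostname() would strip the port and make them compare equal.
	fmt.Println(a.Host)                       // unbewohnte.su:3000
	fmt.Println(b.Host)                       // unbewohnte.su
	fmt.Println(a.Host == b.Host)             // false
	fmt.Println(a.Hostname() == b.Hostname()) // true
}
```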

src/logger/logger.go (5 changes)

@@ -50,6 +50,11 @@ func SetOutput(writer io.Writer) {
 	errorLog.SetOutput(writer)
 }
 
+// Get current logger's output writer
+func GetOutput() io.Writer {
+	return infoLog.Writer()
+}
+
 // Log information
 func Info(format string, a ...interface{}) {
 	infoLog.Printf(format, a...)
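
The added `GetOutput` accessor simply exposes the info logger's current `io.Writer`, so callers can write raw bytes (such as the logo and version banner printed in `main.go` below) to the same destination the logs go to. A hypothetical fragment, assuming the repository's `logger` package and a `version` constant like the one in `main.go`:

```go
// Hypothetical usage of the new accessor: write a raw banner to whatever
// writer the logger currently targets (stdout, a log file, etc.).
if out := logger.GetOutput(); out != nil {
	out.Write([]byte("wecr " + version + "\n\n"))
}
```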

src/main.go (39 changes)

@@ -35,7 +35,7 @@ import (
 	"unbewohnte/wecr/worker"
 )
 
-const version = "v0.1.3"
+const version = "v0.1.4"
 
 const (
 	defaultConfigFile string = "conf.json"
@@ -87,6 +87,17 @@ func init() {
 		os.Exit(0)
 	}
 
+	// print logo
+	logger.GetOutput().Write([]byte(
+		`
+`),
+	)
+	logger.GetOutput().Write([]byte(version + "\n\n"))
+
 	// work out working directory path
 	if *wDir != "" {
 		workingDirectory = *wDir
@@ -151,7 +162,7 @@ func main() {
 		}
 	} else {
 		// no logging needed
-		logger.Info("No logs will be outputted")
+		logger.Info("No further logs will be outputted")
 		logger.SetOutput(nil)
 	}
@@ -167,21 +178,33 @@ func main() {
 	for index, blacklistedDomain := range conf.BlacklistedDomains {
 		parsedURL, err := url.Parse(blacklistedDomain)
 		if err != nil {
-			logger.Warning("Failed to parse blacklisted %s: %s", blacklistedDomain, err)
+			logger.Warning("Failed to parse blacklisted \"%s\": %s", blacklistedDomain, err)
 			continue
 		}
+		if parsedURL.Scheme == "" {
+			// parsing is invalid, as stdlib says
+			logger.Warning("Failed to parse blacklisted \"%s\": no scheme specified", blacklistedDomain)
+			continue
+		}
-		conf.BlacklistedDomains[index] = parsedURL.Hostname()
+		conf.BlacklistedDomains[index] = parsedURL.Host
 	}
 
 	for index, allowedDomain := range conf.AllowedDomains {
 		parsedURL, err := url.Parse(allowedDomain)
 		if err != nil {
-			logger.Warning("Failed to parse allowed %s: %s", allowedDomain, err)
+			logger.Warning("Failed to parse allowed \"%s\": %s", allowedDomain, err)
 			continue
 		}
+		if parsedURL.Scheme == "" {
+			// parsing is invalid, as stdlib says
+			logger.Warning("Failed to parse allowed \"%s\": no scheme specified", allowedDomain)
+			continue
+		}
-		conf.AllowedDomains[index] = parsedURL.Hostname()
+		conf.AllowedDomains[index] = parsedURL.Host
 	}
 
 	if conf.Depth <= 0 {
@@ -252,6 +275,7 @@ func main() {
 		Requests: conf.Requests,
 		Save: conf.Save,
 		BlacklistedDomains: conf.BlacklistedDomains,
+		AllowedDomains: conf.AllowedDomains,
 	})
 
 	logger.Info("Created a worker pool with %d workers", conf.Workers)
@@ -282,9 +306,10 @@ func main() {
 			timeSince := time.Since(workerPool.Stats.StartTime).Round(time.Second)
 
-			fmt.Fprintf(os.Stdout, "\r[%s] %d pages; %d matches (%d pages/sec)",
+			fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d saved; %d matches (%d pages/sec)",
 				timeSince.String(),
 				workerPool.Stats.PagesVisited,
+				workerPool.Stats.PagesSaved,
 				workerPool.Stats.MatchesFound,
 				workerPool.Stats.PagesVisited/uint64(timeSince.Seconds()),
 			)
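
The new `Scheme == ""` guard exists because `url.Parse` accepts a bare host such as `en.wikipedia.org` without returning an error: it files the whole string under `Path` and leaves `Host` empty, so without the check such a config entry would be silently rewritten to an empty string and never match anything. A standalone illustration (not part of the commit) of that behaviour:

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	// A bare domain parses without error, but everything lands in Path
	// and Host stays empty, which is what the new scheme check guards against.
	u, err := url.Parse("en.wikipedia.org")
	fmt.Println(err)                                    // <nil>
	fmt.Printf("%q %q %q\n", u.Scheme, u.Host, u.Path)  // "" "" "en.wikipedia.org"

	// With a scheme, Host (including any port) is populated as expected.
	v, _ := url.Parse("https://en.wikipedia.org:8080/wiki")
	fmt.Println(v.Host) // en.wikipedia.org:8080
}
```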

src/web/images.go (2 changes)

@@ -82,7 +82,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
imageURLString := ResolveLink(imageURL, from.Host)
if attribute.Key == "src" {
if token.Data == "img" {
// <img> tag -> don't check
urls = append(urls, imageURLString)
} else {

src/worker/pool.go (1 change)

@@ -34,6 +34,7 @@ type visited struct {
 type Statistics struct {
 	PagesVisited uint64
 	MatchesFound uint64
+	PagesSaved uint64
 	StartTime time.Time
 }

src/worker/worker.go (38 changes)

@@ -93,7 +93,6 @@ func (w *Worker) Work() {
 			return
 		}
-		// see if the domain is allowed and is not blacklisted
 		var skip bool = false
 
 		pageURL, err := url.Parse(job.URL)
 		if err != nil {
@@ -101,30 +100,34 @@ func (w *Worker) Work() {
 			continue
 		}
-		for _, allowedDomain := range w.Conf.AllowedDomains {
-			if pageURL.Hostname() != allowedDomain {
-				skip = true
-				logger.Info("Skipped non-allowed %s", job.URL)
-				break
-			}
-		}
-		for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
-			if skip {
-				break
-			}
-			if pageURL.Hostname() == blacklistedDomain {
-				skip = true
-				logger.Info("Skipped blacklisted %s", job.URL)
-				break
-			}
-		}
-		if skip {
-			continue
-		}
+		// see if the domain is allowed and is not blacklisted
+		if len(w.Conf.AllowedDomains) > 0 {
+			skip = true
+			for _, allowedDomain := range w.Conf.AllowedDomains {
+				if pageURL.Host == allowedDomain {
+					skip = false
+					break
+				}
+			}
+			if skip {
+				logger.Info("Skipped non-allowed %s", job.URL)
+				continue
+			}
+		}
+		if len(w.Conf.BlacklistedDomains) > 0 {
+			for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
+				if pageURL.Host == blacklistedDomain {
+					skip = true
+					logger.Info("Skipped blacklisted %s", job.URL)
+					break
+				}
+			}
+			if skip {
+				continue
+			}
+		}
 
 		// check if it is the first occurence
 		w.visited.Lock.Lock()
 		for _, visitedURL := range w.visited.URLs {
@@ -281,6 +284,7 @@ func (w *Worker) Work() {
 			// save page
 			if savePage {
 				w.savePage(pageURL, pageData)
+				w.stats.PagesSaved++
 			}
 
 			// sleep before the next request
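
Pulled out of the worker, the new allow/blacklist decision reads as a small predicate. The sketch below is a hypothetical standalone helper (not the worker's actual API) that mirrors the logic shown in the diff: an empty `allowed_domains` list permits every host, a non-empty one requires an exact `Host` match, and a blacklist match always skips the page.

```go
// hostAllowed reports whether a page's Host may be crawled given the two
// normalised domain lists (hypothetical helper, names are illustrative).
func hostAllowed(host string, allowed, blacklisted []string) bool {
	// A non-empty allowed list acts as a whitelist of exact Host matches.
	if len(allowed) > 0 {
		ok := false
		for _, a := range allowed {
			if host == a {
				ok = true
				break
			}
		}
		if !ok {
			return false
		}
	}
	// The blacklist always wins on an exact Host match.
	for _, b := range blacklisted {
		if host == b {
			return false
		}
	}
	return true
}
```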
