
Fixed allowed_domains not being used

Branch: master
Commit: b84af90d08
Changed files:

1. README.md (2)
2. src/logger/logger.go (5)
3. src/main.go (39)
4. src/web/images.go (2)
5. src/worker/pool.go (1)
6. src/worker/worker.go (22)

README.md

@@ -10,7 +10,7 @@ The flow of work fully depends on the configuration file. By default `conf.json`
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them.
 
-The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains` or are NOT in `allowed_domains`. If all initial pages are happen to be on blacklisted domains or are not in the allowed list - the program will get stuck.
+The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains` or are NOT in `allowed_domains`. If all initial pages are happen to be on blacklisted domains or are not in the allowed list - the program will get stuck. It is important to note that `*_domains` should be specified with an existing scheme (ie: https://en.wikipedia.org). Subdomains and ports **matter**: `https://unbewohnte.su:3000/` and `https://unbewohnte.su/` are **different**.
 
 ### Search query
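The scheme requirement and the "subdomains and ports matter" note follow directly from how Go's net/url parses these entries. A minimal standalone sketch (not code from the repository) illustrating the behaviour this commit relies on:

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	for _, raw := range []string{
		"https://unbewohnte.su:3000/",
		"https://unbewohnte.su/",
		"en.wikipedia.org", // no scheme: Host stays empty, so the entry cannot match anything
	} {
		parsed, err := url.Parse(raw)
		if err != nil {
			fmt.Println(raw, "->", err)
			continue
		}
		// Host keeps the port, Hostname() strips it; the commit compares Host,
		// so the :3000 variant and the default-port variant are different domains.
		fmt.Printf("%-30s Host=%q Hostname=%q\n", raw, parsed.Host, parsed.Hostname())
	}
}
```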

src/logger/logger.go

@@ -50,6 +50,11 @@ func SetOutput(writer io.Writer) {
 	errorLog.SetOutput(writer)
 }
 
+// Get current logger's output writer
+func GetOutput() io.Writer {
+	return infoLog.Writer()
+}
+
 // Log information
 func Info(format string, a ...interface{}) {
 	infoLog.Printf(format, a...)

src/main.go

@@ -35,7 +35,7 @@ import (
 	"unbewohnte/wecr/worker"
 )
 
-const version = "v0.1.3"
+const version = "v0.1.4"
 
 const (
 	defaultConfigFile string = "conf.json"

@@ -87,6 +87,17 @@ func init() {
 		os.Exit(0)
 	}
 
+	// print logo
+	logger.GetOutput().Write([]byte(
+		`
+`),
+	)
+	logger.GetOutput().Write([]byte(version + "\n\n"))
+
 	// work out working directory path
 	if *wDir != "" {
 		workingDirectory = *wDir

@@ -151,7 +162,7 @@ func main() {
 		}
 	} else {
 		// no logging needed
-		logger.Info("No logs will be outputted")
+		logger.Info("No further logs will be outputted")
 		logger.SetOutput(nil)
 	}

@@ -167,21 +178,33 @@ func main() {
 	for index, blacklistedDomain := range conf.BlacklistedDomains {
 		parsedURL, err := url.Parse(blacklistedDomain)
 		if err != nil {
-			logger.Warning("Failed to parse blacklisted %s: %s", blacklistedDomain, err)
+			logger.Warning("Failed to parse blacklisted \"%s\": %s", blacklistedDomain, err)
+			continue
+		}
+
+		if parsedURL.Scheme == "" {
+			// parsing is invalid, as stdlib says
+			logger.Warning("Failed to parse blacklisted \"%s\": no scheme specified", blacklistedDomain)
 			continue
 		}
-		conf.BlacklistedDomains[index] = parsedURL.Hostname()
+		conf.BlacklistedDomains[index] = parsedURL.Host
 	}
 
 	for index, allowedDomain := range conf.AllowedDomains {
 		parsedURL, err := url.Parse(allowedDomain)
 		if err != nil {
-			logger.Warning("Failed to parse allowed %s: %s", allowedDomain, err)
+			logger.Warning("Failed to parse allowed \"%s\": %s", allowedDomain, err)
+			continue
+		}
+
+		if parsedURL.Scheme == "" {
+			// parsing is invalid, as stdlib says
+			logger.Warning("Failed to parse allowed \"%s\": no scheme specified", allowedDomain)
 			continue
 		}
-		conf.AllowedDomains[index] = parsedURL.Hostname()
+		conf.AllowedDomains[index] = parsedURL.Host
 	}
 
 	if conf.Depth <= 0 {

@@ -252,6 +275,7 @@ func main() {
 		Requests:           conf.Requests,
 		Save:               conf.Save,
 		BlacklistedDomains: conf.BlacklistedDomains,
+		AllowedDomains:     conf.AllowedDomains,
 	})
 
 	logger.Info("Created a worker pool with %d workers", conf.Workers)

@@ -282,9 +306,10 @@ func main() {
 			timeSince := time.Since(workerPool.Stats.StartTime).Round(time.Second)
 
-			fmt.Fprintf(os.Stdout, "\r[%s] %d pages; %d matches (%d pages/sec)",
+			fmt.Fprintf(os.Stdout, "\r[%s] %d pages visited; %d saved; %d matches (%d pages/sec)",
 				timeSince.String(),
 				workerPool.Stats.PagesVisited,
+				workerPool.Stats.PagesSaved,
 				workerPool.Stats.MatchesFound,
 				workerPool.Stats.PagesVisited/uint64(timeSince.Seconds()),
 			)

src/web/images.go

@@ -82,7 +82,7 @@ func FindPageImages(pageBody []byte, from *url.URL) []string {
 				imageURLString := ResolveLink(imageURL, from.Host)
 
-				if attribute.Key == "src" {
+				if token.Data == "img" {
 					// <img> tag -> don't check
 					urls = append(urls, imageURLString)
 				} else {
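For context on the one-line fix above: with golang.org/x/net/html, `token.Data` carries the tag name while `attribute.Key` only names the attribute, so keying on `attribute.Key == "src"` alone treats a `src` on any tag as an image. A minimal sketch (an assumed simplification, not the project's actual FindPageImages) that makes the distinction concrete by combining both checks:

```go
package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	const page = `<img src="/logo.png"><iframe src="/embed.html"></iframe>`

	tokenizer := html.NewTokenizer(strings.NewReader(page))
	for tokenizer.Next() != html.ErrorToken {
		token := tokenizer.Token()
		for _, attribute := range token.Attr {
			// token.Data is the tag name ("img", "iframe", ...);
			// attribute.Key alone would also match the iframe's src.
			if token.Data == "img" && attribute.Key == "src" {
				fmt.Println("image:", attribute.Val)
			}
		}
	}
}
```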

src/worker/pool.go

@@ -34,6 +34,7 @@ type visited struct {
 type Statistics struct {
 	PagesVisited uint64
 	MatchesFound uint64
+	PagesSaved   uint64
 	StartTime    time.Time
 }

src/worker/worker.go

@@ -93,7 +93,6 @@ func (w *Worker) Work() {
 				return
 			}
 
-			// see if the domain is allowed and is not blacklisted
 			var skip bool = false
 			pageURL, err := url.Parse(job.URL)
 			if err != nil {

@@ -101,29 +100,33 @@ func (w *Worker) Work() {
 				continue
 			}
 
-			for _, allowedDomain := range w.Conf.AllowedDomains {
-				if pageURL.Hostname() != allowedDomain {
-					skip = true
-					logger.Info("Skipped non-allowed %s", job.URL)
-					break
-				}
-			}
-
-			for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
-				if skip {
-					break
-				}
-
-				if pageURL.Hostname() == blacklistedDomain {
-					skip = true
-					logger.Info("Skipped blacklisted %s", job.URL)
-					break
-				}
-			}
-
-			if skip {
-				continue
-			}
+			// see if the domain is allowed and is not blacklisted
+			if len(w.Conf.AllowedDomains) > 0 {
+				skip = true
+				for _, allowedDomain := range w.Conf.AllowedDomains {
+					if pageURL.Host == allowedDomain {
+						skip = false
+						break
+					}
+				}
+				if skip {
+					logger.Info("Skipped non-allowed %s", job.URL)
+					continue
+				}
+			}
+
+			if len(w.Conf.BlacklistedDomains) > 0 {
+				for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
+					if pageURL.Host == blacklistedDomain {
+						skip = true
+						logger.Info("Skipped blacklisted %s", job.URL)
+						break
+					}
+				}
+				if skip {
+					continue
+				}
+			}
 
 			// check if it is the first occurence
 			w.visited.Lock.Lock()

@@ -281,6 +284,7 @@ func (w *Worker) Work() {
 			// save page
 			if savePage {
 				w.savePage(pageURL, pageData)
+				w.stats.PagesSaved++
 			}
 
 			// sleep before the next request
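The net effect of the reworked block above, condensed into a standalone sketch (the `skipURL` helper is illustrative, not a function from the repository): an empty `allowed_domains` list permits every host, a non-empty one requires an exact `Host` match, and a `blacklisted_domains` match always skips the page.

```go
package main

import (
	"fmt"
	"net/url"
)

// skipURL mirrors the worker's new check: allowed domains act as a whitelist
// only when the list is non-empty, blacklisted domains always exclude matches.
func skipURL(pageURL *url.URL, allowed, blacklisted []string) bool {
	if len(allowed) > 0 {
		found := false
		for _, domain := range allowed {
			if pageURL.Host == domain {
				found = true
				break
			}
		}
		if !found {
			return true // non-allowed
		}
	}

	for _, domain := range blacklisted {
		if pageURL.Host == domain {
			return true // blacklisted
		}
	}
	return false
}

func main() {
	page, _ := url.Parse("https://en.wikipedia.org/wiki/Web_crawler")
	fmt.Println(skipURL(page, []string{"en.wikipedia.org"}, nil)) // false: host is allowed
	fmt.Println(skipURL(page, nil, []string{"en.wikipedia.org"})) // true: host is blacklisted
}
```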
