Blacklisting domains

Commit 7c30787990 (branch: master)
14 changed files:

1. .gitignore (3 changes)
2. Makefile (2 changes)
3. README.md (5 changes)
4. src/config/config.go (24 changes)
5. src/go.mod (2 changes)
6. src/logger/logger.go (2 changes)
7. src/main.go (40 changes)
8. src/web/images.go (4 changes)
9. src/web/job.go (4 changes)
10. src/web/requests.go (2 changes)
11. src/web/result.go (4 changes)
12. src/web/text.go (2 changes)
13. src/worker/pool.go (4 changes)
14. src/worker/worker.go (40 changes)

.gitignore (3 changes)

@@ -3,4 +3,5 @@ conf.json
 logs.log
 output.json
 websurf
-conf_mega_ita.json
+conf_mega_ita.json
+wecr

Makefile (2 changes)

@@ -1,5 +1,5 @@
 SRCDIR:=src
-EXE:=websurf
+EXE:=wecr
 TESTDIR:=testing
 all:

README.md (5 changes)

@@ -1,8 +1,8 @@
-# Websurf
+# Wecr - simple web crawler
 ## Overview
-Just a simple HTML web spider with minimal dependencies. It is possible to search for pages with a text on them or for the text itself, extract images and save pages that satisfy the criteria along the way.
+Just a simple HTML web spider with minimal dependencies. It is possible to search for pages with a text on them or for the text itself, extract images and save pages that satisfy the criteria along the way.
 ## Configuration
@@ -10,6 +10,7 @@ The flow of work fully depends on the configuration file. By default `conf.json`
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them.
+The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains`. If all initial pages happen to be blacklisted, the program will end.
 ### Search query
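
To illustrate the renamed keys, here is a short Go sketch (not part of this commit; the URLs are invented) that fills in the two fields introduced in src/config/config.go below and prints the resulting conf.json, whose key names come from the new struct tags:

// Illustrative sketch only: assumes the module path introduced by this commit
// (unbewohnte/wecr) and uses invented example URLs.
package main

import (
	"encoding/json"
	"fmt"

	"unbewohnte/wecr/config"
)

func main() {
	conf := config.Default()
	// start crawling from one page and skip everything on the ads domain;
	// blacklist entries are later reduced to their host part in main.go,
	// so a full URL with a scheme is expected here
	conf.InitialPages = []string{"https://example.org/"}
	conf.BlacklistedDomains = []string{"https://ads.example.org/"}

	// the struct tags turn these into "initial_pages" and "blacklisted_domains"
	out, err := json.MarshalIndent(conf, "", "    ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}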

src/config/config.go (24 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify
@@ -52,13 +52,14 @@ type Logging struct {
 }
 type Conf struct {
-Search Search `json:"search"`
-Requests Requests `json:"requests"`
-Depth uint `json:"depth"`
-Workers uint `json:"workers"`
-InitialDomains []string `json:"initial_domains"`
-Save Save `json:"save"`
-Logging Logging `json:"logging"`
+Search Search `json:"search"`
+Requests Requests `json:"requests"`
+Depth uint `json:"depth"`
+Workers uint `json:"workers"`
+InitialPages []string `json:"initial_pages"`
+BlacklistedDomains []string `json:"blacklisted_domains"`
+Save Save `json:"save"`
+Logging Logging `json:"logging"`
 }
 func Default() *Conf {
@@ -77,9 +78,10 @@ func Default() *Conf {
 WaitTimeoutMs: 1500,
 RequestPauseMs: 100,
 },
-InitialDomains: []string{""},
-Depth: 5,
-Workers: 20,
+InitialPages: []string{""},
+Depth: 5,
+Workers: 20,
+BlacklistedDomains: []string{""},
 Logging: Logging{
 OutputLogs: true,
 LogsFile: "logs.log",

src/go.mod (2 changes)

@@ -1,4 +1,4 @@
-module unbewohnte/websurf
+module unbewohnte/wecr
 go 1.18

src/logger/logger.go (2 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify

src/main.go (40 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify
@@ -24,14 +24,15 @@ import (
 "fmt"
 "io"
 "log"
+"net/url"
 "os"
 "os/signal"
 "path/filepath"
 "time"
-"unbewohnte/websurf/config"
-"unbewohnte/websurf/logger"
-"unbewohnte/websurf/web"
-"unbewohnte/websurf/worker"
+"unbewohnte/wecr/config"
+"unbewohnte/wecr/logger"
+"unbewohnte/wecr/web"
+"unbewohnte/wecr/worker"
 )
 const (
@@ -78,7 +79,7 @@ func init() {
 if *version {
 fmt.Printf(
-"webscrape - scrape the web\n(c) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n\n",
+"Webcrawl - crawl the web for data\n(c) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n\n",
 )
 os.Exit(0)
 }
@@ -152,14 +153,24 @@ func main() {
 }
 // sanitize and correct inputs
-if len(conf.InitialDomains) == 0 {
-logger.Warning("No initial domain URLs have been set")
+if len(conf.InitialPages) == 0 {
+logger.Warning("No initial page URLs have been set")
 return
-} else if len(conf.InitialDomains) != 0 && conf.InitialDomains[0] == "" {
-logger.Warning("No initial domain URLs have been set")
+} else if len(conf.InitialPages) != 0 && conf.InitialPages[0] == "" {
+logger.Warning("No initial page URLs have been set")
 return
 }
+for index, blacklistedDomain := range conf.BlacklistedDomains {
+parsedURL, err := url.Parse(blacklistedDomain)
+if err != nil {
+logger.Warning("Failed to parse blacklisted %s: %s", blacklistedDomain, err)
+continue
+}
+conf.BlacklistedDomains[index] = parsedURL.Host
+}
 if conf.Depth <= 0 {
 conf.Depth = 1
 logger.Warning("Depth is <= 0. Set to %d", conf.Depth)
@@ -215,9 +226,9 @@ func main() {
 results := make(chan web.Result, conf.Workers*5)
 // create initial jobs
-for _, initialDomain := range conf.InitialDomains {
+for _, initialPage := range conf.InitialPages {
 jobs <- web.Job{
-URL: initialDomain,
+URL: initialPage,
 Search: conf.Search,
 Depth: conf.Depth,
 }
@@ -225,8 +236,9 @@ func main() {
 // form a worker pool
 workerPool := worker.NewWorkerPool(jobs, results, conf.Workers, worker.WorkerConf{
-Requests: conf.Requests,
-Save: conf.Save,
+Requests: conf.Requests,
+Save: conf.Save,
+BlacklistedDomains: conf.BlacklistedDomains,
 })
 logger.Info("Created a worker pool with %d workers", conf.Workers)

src/web/images.go (4 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify
@@ -22,7 +22,7 @@ import (
 "bytes"
 "net/url"
 "strings"
-"unbewohnte/websurf/logger"
+"unbewohnte/wecr/logger"
 "golang.org/x/net/html"
 )

src/web/job.go (4 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify
@@ -18,7 +18,7 @@
 package web
-import "unbewohnte/websurf/config"
+import "unbewohnte/wecr/config"
 type Job struct {
 URL string

src/web/requests.go (2 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify

src/web/result.go (4 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify
@@ -18,7 +18,7 @@
 package web
-import "unbewohnte/websurf/config"
+import "unbewohnte/wecr/config"
 type Result struct {
 PageURL string

src/web/text.go (2 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify

src/worker/pool.go (4 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify
@@ -21,7 +21,7 @@ package worker
 import (
 "sync"
 "time"
-"unbewohnte/websurf/web"
+"unbewohnte/wecr/web"
 )
 type visited struct {

src/worker/worker.go (40 changes)

@@ -1,5 +1,5 @@
 /*
-websurf - surf the web for data recursively
+Wecr - crawl the web for data
 Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
 This program is free software: you can redistribute it and/or modify
@@ -28,14 +28,15 @@ import (
 "path/filepath"
 "regexp"
 "time"
-"unbewohnte/websurf/config"
-"unbewohnte/websurf/logger"
-"unbewohnte/websurf/web"
+"unbewohnte/wecr/config"
+"unbewohnte/wecr/logger"
+"unbewohnte/wecr/web"
 )
 type WorkerConf struct {
-Requests config.Requests
-Save config.Save
+Requests config.Requests
+Save config.Save
+BlacklistedDomains []string
 }
 type Worker struct {
@@ -70,14 +71,31 @@ func (w *Worker) Work() {
 return
 }
+// see if the domain is not blacklisted
+var skip bool = false
+parsedURL, err := url.Parse(job.URL)
+if err != nil {
+logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
+continue
+}
+for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
+if parsedURL.Hostname() == blacklistedDomain {
+skip = true
+logger.Info("Skipping blacklisted %s", job.URL)
+break
+}
+}
+if skip {
+continue
+}
 // check if it is the first occurence
 w.visited.Lock.Lock()
-var skip bool = false
 for _, visitedURL := range w.visited.URLs {
 if job.URL == visitedURL {
 // okay, don't even bother. Move onto the next job
 skip = true
-logger.Info("Skipping %s", job.URL)
+logger.Info("Skipping visited %s", job.URL)
 w.visited.Lock.Unlock()
 break
 }
@@ -136,12 +154,6 @@ func (w *Worker) Work() {
 case config.QueryImages:
 // find image URLs, output data to the file
-parsedURL, err := url.Parse(job.URL)
-if err != nil {
-logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
-continue
-}
 imageLinks := web.FindPageImages(pageData, parsedURL.Host)
 for count, imageLink := range imageLinks {
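
One subtlety in the blacklist handling: src/main.go stores URL.Host, which keeps an explicit port, while the worker compares URL.Hostname(), which strips it, so a blacklist entry carrying a port would never match. A minimal sketch of the difference, using an invented URL:

// Illustrative sketch only; the URL is invented.
package main

import (
	"fmt"
	"net/url"
)

func main() {
	u, err := url.Parse("https://ads.example.org:8080/banner")
	if err != nil {
		panic(err)
	}
	fmt.Println(u.Host)       // "ads.example.org:8080" - what main.go stores in the blacklist
	fmt.Println(u.Hostname()) // "ads.example.org"      - what the worker compares against
}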
