
Fixed a terrible logical "bug" in worker's Work function; added comments; added more image extensions; merged and improved the ResolveLink function

branch: master
commit: 7fa094d80f
1. src/config/config.go (6 changes)
2. src/main.go (3 changes)
3. src/web/images.go (40 changes)
4. src/web/job.go (1 change)
5. src/web/requests.go (1 change)
6. src/web/result.go (1 change)
7. src/web/text.go (46 changes)
8. src/worker/pool.go (6 changes)
9. src/worker/worker.go (30 changes)

src/config/config.go (6 changes)

@@ -51,6 +51,7 @@ type Logging struct {
     LogsFile string `json:"logs_file"`
 }
+// Configuration file structure
 type Conf struct {
     Search   Search   `json:"search"`
     Requests Requests `json:"requests"`
@@ -63,6 +64,7 @@ type Conf struct {
     Logging Logging `json:"logging"`
 }
+// Default configuration file structure
 func Default() *Conf {
     return &Conf{
         Search: Search{
@@ -91,6 +93,7 @@ func Default() *Conf {
     }
 }
+// Write current configuration to w
 func (c *Conf) WriteTo(w io.Writer) error {
     jsonData, err := json.MarshalIndent(c, "", " ")
     if err != nil {
@@ -105,6 +108,7 @@ func (c *Conf) WriteTo(w io.Writer) error {
     return nil
 }
+// Read configuration from r
 func (c *Conf) ReadFrom(r io.Reader) error {
     jsonData, err := io.ReadAll(r)
     if err != nil {
@@ -119,6 +123,7 @@ func (c *Conf) ReadFrom(r io.Reader) error {
     return nil
 }
+// Creates configuration file at path
 func CreateConfigFile(conf Conf, path string) error {
     confFile, err := os.Create(path)
     if err != nil {
@@ -134,6 +139,7 @@ func CreateConfigFile(conf Conf, path string) error {
     return nil
 }
+// Tries to open configuration file at path. If it fails - returns default configuration
 func OpenConfigFile(path string) (*Conf, error) {
     confFile, err := os.Open(path)
     if err != nil {

src/main.go (3 changes)

@@ -35,7 +35,7 @@ import (
     "unbewohnte/wecr/worker"
 )
-const version = "v0.1.2"
+const version = "v0.1.3"
 const (
     defaultConfigFile string = "conf.json"
@@ -299,6 +299,7 @@ func main() {
             break
         }
+        // each entry in output file is a self-standing JSON object
         entryBytes, err := json.MarshalIndent(result, "", " ")
         if err != nil {
             continue
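The new comment spells out the output format: every crawl result is written as its own indented JSON object, one after another, rather than as one JSON array. Such a file can be read back by decoding values in a loop; this sketch is illustrative, and the "results.json" name and the fields picked from each entry are assumptions:

package main

import (
	"encoding/json"
	"fmt"
	"io"
	"log"
	"os"
)

// entry mirrors the fields expected in each self-standing JSON object;
// the concrete field set is an assumption for illustration
type entry struct {
	PageURL string
	Data    interface{}
}

func main() {
	f, err := os.Open("results.json") // assumed output file name
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// json.Decoder reads one complete JSON value at a time,
	// so concatenated objects can be consumed in a loop
	dec := json.NewDecoder(f)
	for {
		var e entry
		if err := dec.Decode(&e); err == io.EOF {
			break
		} else if err != nil {
			log.Fatal(err)
		}
		fmt.Println(e.PageURL)
	}
}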

src/web/images.go (40 changes)

@@ -20,7 +20,7 @@ package web
 import (
     "bytes"
-    "fmt"
+    "net/url"
     "strings"
     "golang.org/x/net/html"
@@ -31,9 +31,15 @@ func hasImageExtention(url string) bool {
         ".jpeg",
         ".jpg",
         ".jpe",
+        ".jfif",
         ".png",
         ".ppm",
         ".svg",
+        ".gif",
+        ".tiff",
+        ".bmp",
+        ".webp",
+        ".ico",
     }
     for _, extention := range extentions {
@@ -46,7 +52,7 @@ func hasImageExtention(url string) bool {
 }
 // Tries to find images' URLs on the page
-func FindPageImages(pageBody []byte, hostname string) []string {
+func FindPageImages(pageBody []byte, from *url.URL) []string {
     var urls []string
     tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
@@ -69,29 +75,21 @@ func FindPageImages(pageBody []byte, hostname string) []string {
             continue
         }
-        var imageURL string = attribute.Val
-        if !strings.Contains(imageURL, hostname) {
-            // add hostname
-            if strings.HasPrefix(imageURL, "/") && strings.HasSuffix(hostname, "/") {
-                imageURL = fmt.Sprintf("%s%s", hostname, imageURL[1:])
-            } else if !strings.HasPrefix(imageURL, "/") && !strings.HasSuffix(hostname, "/") {
-                imageURL = fmt.Sprintf("%s/%s", hostname, imageURL)
-            } else {
-                imageURL = fmt.Sprintf("%s%s", hostname, imageURL)
-            }
-        }
-        imageURL = strings.TrimPrefix(imageURL, "//")
-        if !strings.HasPrefix(imageURL, "http://") && !strings.HasPrefix(imageURL, "https://") {
-            // add scheme
-            imageURL = "http://" + imageURL
-        }
-        // check for image extention
-        if hasImageExtention(imageURL) {
-            urls = append(urls, imageURL)
-        }
+        imageURL, err := url.Parse(attribute.Val)
+        if err != nil {
+            break
+        }
+        imageURLString := ResolveLink(imageURL, from.Host)
+        if attribute.Key == "src" {
+            // <img> tag -> don't check
+            urls = append(urls, imageURLString)
+        } else {
+            // <a> tag -> check for image extention
+            if hasImageExtention(imageURLString) {
+                urls = append(urls, imageURLString)
+            }
+        }
     }
 }
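With the new signature the caller hands over the parsed page URL instead of a bare hostname string, and FindPageImages resolves every candidate through ResolveLink, keeping <img> src values unconditionally and <a> href values only when they carry an image extension. A hypothetical call, with a made-up page body and URL:

package main

import (
	"fmt"
	"net/url"

	"unbewohnte/wecr/web"
)

func main() {
	// a made-up page with one relative <img> src and one absolute <a> href
	pageBody := []byte(`<html><body>
		<img src="/static/logo.png">
		<a href="https://example.org/photos/cat.jpeg">cat</a>
	</body></html>`)

	from, err := url.Parse("http://example.org/gallery")
	if err != nil {
		panic(err)
	}

	// relative links are resolved against from.Host
	for _, imageURL := range web.FindPageImages(pageBody, from) {
		fmt.Println(imageURL)
	}
}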

src/web/job.go (1 change)

@@ -20,6 +20,7 @@ package web
 import "unbewohnte/wecr/config"
+// Job to pass around workers
 type Job struct {
     URL    string
     Search config.Search
src/web/requests.go (1 change)

@@ -24,6 +24,7 @@ import (
     "time"
 )
+// Get page data coming from url with optional user agent and timeout
 func GetPage(url string, userAgent string, timeOutMs uint64) ([]byte, error) {
     req, err := http.NewRequest("GET", url, nil)
     if err != nil {

src/web/result.go (1 change)

@@ -20,6 +20,7 @@ package web
 import "unbewohnte/wecr/config"
+// Result of page parsing
 type Result struct {
     PageURL string
     Search  config.Search
src/web/text.go (46 changes)

@@ -21,14 +21,33 @@ package web
 import (
     "bufio"
     "bytes"
-    "fmt"
+    "net/url"
     "regexp"
     "strings"
     "golang.org/x/net/html"
 )
-func FindPageLinks(pageBody []byte, hostname string) []string {
+// Fix relative link and construct an absolute one. Does nothing if the URL already looks alright
+func ResolveLink(url *url.URL, fromHost string) string {
+    if !url.IsAbs() {
+        if url.Scheme == "" {
+            // add scheme
+            url.Scheme = "http"
+        }
+        if url.Host == "" {
+            // add host
+            url.Host = fromHost
+        }
+    }
+    return url.String()
+}
+// Find all links on page that are specified in <a> tag
+func FindPageLinks(pageBody []byte, from *url.URL) []string {
     var urls []string
     tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
@@ -52,27 +71,12 @@ func FindPageLinks(pageBody []byte, hostname string) []string {
             continue
         }
-        var link string = attribute.Val
-        if !strings.Contains(link, hostname) {
-            // add hostname
-            if strings.HasPrefix(link, "/") && strings.HasSuffix(hostname, "/") {
-                link = fmt.Sprintf("%s%s", hostname, link[1:])
-            } else if !strings.HasPrefix(link, "/") && !strings.HasSuffix(hostname, "/") {
-                link = fmt.Sprintf("%s/%s", hostname, link)
-            } else {
-                link = fmt.Sprintf("%s%s", hostname, link)
-            }
-        }
-        link = strings.TrimPrefix(link, "//")
-        if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
-            // add scheme
-            link = "http://" + link
-        }
-        urls = append(urls, link)
+        link, err := url.Parse(attribute.Val)
+        if err != nil {
+            break
+        }
+        urls = append(urls, ResolveLink(link, from.Host))
     }
 }
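ResolveLink only touches URLs that are not absolute: a missing scheme becomes http and a missing host is taken from the page being crawled. A small illustration of the three interesting cases (example.com is a made-up host):

package main

import (
	"fmt"
	"net/url"

	"unbewohnte/wecr/web"
)

func main() {
	for _, raw := range []string{
		"/about",                   // relative path: scheme and host get filled in
		"//cdn.example.com/a.png",  // scheme-relative: only the scheme gets filled in
		"https://example.com/done", // already absolute: returned unchanged
	} {
		u, err := url.Parse(raw)
		if err != nil {
			continue
		}
		fmt.Println(web.ResolveLink(u, "example.com"))
	}
}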

src/worker/pool.go (6 changes)

@@ -24,17 +24,20 @@ import (
     "unbewohnte/wecr/web"
 )
+// Already visited URLs
 type visited struct {
     URLs []string
     Lock sync.Mutex
 }
+// Whole worker pool's statistics
 type Statistics struct {
     PagesVisited uint64
     MatchesFound uint64
     StartTime    time.Time
 }
+// Web-Worker pool
 type Pool struct {
     workersCount uint
     workers      []*Worker
@@ -42,6 +45,7 @@ type Pool struct {
     Stats Statistics
 }
+// Create a new worker pool
 func NewWorkerPool(jobs chan web.Job, results chan web.Result, workerCount uint, workerConf WorkerConf) *Pool {
     var newPool Pool = Pool{
         workersCount: workerCount,
@@ -66,6 +70,7 @@ func NewWorkerPool(jobs chan web.Job, results chan web.Result, workerCount uint, workerConf WorkerConf) *Pool {
     return &newPool
 }
+// Notify all workers in pool to start scraping
 func (p *Pool) Work() {
     p.Stats.StartTime = time.Now()
@@ -75,6 +80,7 @@ func (p *Pool) Work() {
     }
 }
+// Notify all workers in pool to stop scraping
 func (p *Pool) Stop() {
     for _, worker := range p.workers {
         worker.Stopped = true
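The newly commented types read top-down: a Pool owns workers that share a visited list and a Statistics struct, and Work/Stop fan the start and stop signals out to every worker. A rough wiring sketch; the channel capacities, worker count, zero-valued WorkerConf and the assumption that Work does not block are all illustrative:

package main

import (
	"unbewohnte/wecr/web"
	"unbewohnte/wecr/worker"
)

func main() {
	jobs := make(chan web.Job, 100)
	results := make(chan web.Result, 100)

	// 8 workers sharing one jobs channel and one results channel
	pool := worker.NewWorkerPool(jobs, results, 8, worker.WorkerConf{})
	pool.Work() // notify every worker to start pulling jobs

	jobs <- web.Job{URL: "http://example.com"}
	// ... results are drained from the results channel elsewhere ...

	pool.Stop() // notify every worker to stop scraping
}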

src/worker/worker.go (30 changes)

@@ -33,6 +33,7 @@ import (
     "unbewohnte/wecr/web"
 )
+// Worker configuration
 type WorkerConf struct {
     Requests config.Requests
     Save     config.Save
@@ -40,6 +41,7 @@ type WorkerConf struct {
     AllowedDomains []string
 }
+// Web worker
 type Worker struct {
     Jobs    chan web.Job
     Results chan web.Result
@@ -49,6 +51,7 @@ type Worker struct {
     Stopped bool
 }
+// Create a new worker
 func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visited *visited, stats *Statistics) Worker {
     return Worker{
         Jobs: jobs,
@@ -60,6 +63,7 @@ func NewWorker(jobs chan web.Job, results chan web.Result, conf WorkerConf, visi
     }
 }
+// Save page to the disk with a corresponding name
 func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
     if w.Conf.Save.SavePages && w.Conf.Save.OutputDir != "" {
         var pageName string = fmt.Sprintf("%s_%s.html", baseURL.Host, path.Base(baseURL.String()))
@@ -71,9 +75,12 @@ func (w *Worker) savePage(baseURL *url.URL, pageData []byte) {
         }
         pageFile.Close()
+        logger.Info("Saved \"%s\"", pageName)
     }
 }
+// Launch scraping process on this worker
 func (w *Worker) Work() {
     if w.Stopped {
         return
@@ -88,14 +95,14 @@ func (w *Worker) Work() {
         // see if the domain is allowed and is not blacklisted
         var skip bool = false
-        parsedURL, err := url.Parse(job.URL)
+        pageURL, err := url.Parse(job.URL)
         if err != nil {
             logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
             continue
         }
         for _, allowedDomain := range w.Conf.AllowedDomains {
-            if parsedURL.Hostname() != allowedDomain {
+            if pageURL.Hostname() != allowedDomain {
                 skip = true
                 logger.Info("Skipped non-allowed %s", job.URL)
                 break
@@ -107,7 +114,7 @@ func (w *Worker) Work() {
                 break
             }
-            if parsedURL.Hostname() == blacklistedDomain {
+            if pageURL.Hostname() == blacklistedDomain {
                 skip = true
                 logger.Info("Skipped blacklisted %s", job.URL)
                 break
@@ -129,6 +136,7 @@ func (w *Worker) Work() {
                 break
             }
         }
+
         if skip {
             continue
         }
@@ -147,7 +155,7 @@ func (w *Worker) Work() {
         }
         // find links
-        pageLinks := web.FindPageLinks(pageData, parsedURL.Host)
+        pageLinks := web.FindPageLinks(pageData, pageURL)
         go func() {
             if job.Depth > 1 {
@@ -178,31 +186,33 @@ func (w *Worker) Work() {
                     Search: job.Search,
                     Data:   pageLinks,
                 }
+                w.stats.MatchesFound += uint64(len(pageLinks))
                 savePage = true
             }
         case config.QueryImages:
             // find image URLs, output images to the file while not saving already outputted ones
-            imageLinks := web.FindPageImages(pageData, parsedURL.Host)
+            imageLinks := web.FindPageImages(pageData, pageURL)
             var alreadyProcessedImgUrls []string
             for count, imageLink := range imageLinks {
                 // check if this URL has been processed already
                 var skipImage bool = false
                 for _, processedURL := range alreadyProcessedImgUrls {
                     if imageLink == processedURL {
                         skipImage = true
                         break
                     }
                 }
                 if skipImage {
                     skipImage = false
                     continue
+                } else {
+                    alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
                 }
-                alreadyProcessedImgUrls = append(alreadyProcessedImgUrls, imageLink)
-                var imageName string = fmt.Sprintf("%s_%d_%s", parsedURL.Host, count, path.Base(imageLink))
+                var imageName string = fmt.Sprintf("%s_%d_%s", pageURL.Host, count, path.Base(imageLink))
                 response, err := http.Get(imageLink)
                 if err != nil {
@@ -266,14 +276,14 @@ func (w *Worker) Work() {
                 savePage = true
             }
         }
-        }
         // save page
         if savePage {
-            w.savePage(parsedURL, pageData)
+            w.savePage(pageURL, pageData)
         }
         // sleep before the next request
         time.Sleep(time.Duration(w.Conf.Requests.RequestPauseMs * uint64(time.Millisecond)))
     }
+    }
 }
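One small note on the deduplication in the images branch: the committed code keeps a slice of already processed URLs and rescans it for every link, which is quadratic in the number of images. For illustration only (this is a generic Go pattern, not part of the commit), the same bookkeeping with a map used as a set looks like this:

package main

import "fmt"

func main() {
	imageLinks := []string{
		"http://example.com/a.png",
		"http://example.com/b.png",
		"http://example.com/a.png", // duplicate
	}

	// set-based deduplication: an empty-struct map avoids rescanning a
	// slice for every URL and costs no extra memory per entry
	seen := make(map[string]struct{})
	for _, link := range imageLinks {
		if _, ok := seen[link]; ok {
			continue // already processed
		}
		seen[link] = struct{}{}
		fmt.Println("processing", link)
	}
}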
