
Blacklisting domains

Branch: master
Commit: 7c30787990
14 changed files:

1. .gitignore (3)
2. Makefile (2)
3. README.md (5)
4. src/config/config.go (24)
5. src/go.mod (2)
6. src/logger/logger.go (2)
7. src/main.go (40)
8. src/web/images.go (4)
9. src/web/job.go (4)
10. src/web/requests.go (2)
11. src/web/result.go (2)
12. src/web/text.go (2)
13. src/worker/pool.go (4)
14. src/worker/worker.go (40)

.gitignore vendored (3)

@@ -3,4 +3,5 @@ conf.json
 logs.log
 output.json
 websurf
 conf_mega_ita.json
+wecr

Makefile (2)

@@ -1,5 +1,5 @@
 SRCDIR:=src
-EXE:=websurf
+EXE:=wecr
 TESTDIR:=testing

 all:

README.md (5)

@@ -1,8 +1,8 @@
-# Websurf
+# Wecr - simple web crawler

 ## Overview
 Just a simple HTML web spider with minimal dependencies. It is possible to search for pages with a text on them or for the text itself, extract images and save pages that satisfy the criteria along the way.

 ## Configuration
@@ -10,6 +10,7 @@ The flow of work fully depends on the configuration file. By default `conf.json`
 The configuration is split into different branches like `requests` (how requests are made, ie: request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string) each of which contain tweakable parameters. There are global ones as well such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory so no attribute-by-attribute explanation needed for most of them.
+The parsing starts from `initial_pages` and goes deeper while ignoring the pages on domains that are in `blacklisted_domains`. If all initial pages happen to be blacklisted, the program will end.

 ### Search query
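To make the new keys concrete, here is a minimal, hypothetical conf.json fragment exercised through a trimmed-down Go struct; the field names mirror the renamed and added JSON tags from this commit, while the URLs and values are purely illustrative and not taken from the repository:

package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed-down mirror of the renamed/added fields in config.Conf (illustrative only).
type conf struct {
	InitialPages       []string `json:"initial_pages"`
	BlacklistedDomains []string `json:"blacklisted_domains"`
	Depth              uint     `json:"depth"`
	Workers            uint     `json:"workers"`
}

func main() {
	// Hypothetical conf.json fragment using the new keys.
	raw := `{
		"initial_pages": ["https://example.org/"],
		"blacklisted_domains": ["https://ads.example.org/"],
		"depth": 5,
		"workers": 20
	}`

	var c conf
	if err := json.Unmarshal([]byte(raw), &c); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", c)
}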

src/config/config.go (24)

@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)

 	This program is free software: you can redistribute it and/or modify
@@ -52,13 +52,14 @@ type Logging struct {
 }

 type Conf struct {
 	Search Search `json:"search"`
 	Requests Requests `json:"requests"`
 	Depth uint `json:"depth"`
 	Workers uint `json:"workers"`
-	InitialDomains []string `json:"initial_domains"`
+	InitialPages []string `json:"initial_pages"`
+	BlacklistedDomains []string `json:"blacklisted_domains"`
 	Save Save `json:"save"`
 	Logging Logging `json:"logging"`
 }

 func Default() *Conf {
@@ -77,9 +78,10 @@ func Default() *Conf {
 		WaitTimeoutMs: 1500,
 		RequestPauseMs: 100,
 	},
-	InitialDomains: []string{""},
+	InitialPages: []string{""},
 	Depth: 5,
 	Workers: 20,
+	BlacklistedDomains: []string{""},
 	Logging: Logging{
 		OutputLogs: true,
 		LogsFile: "logs.log",

src/go.mod (2)

@@ -1,4 +1,4 @@
-module unbewohnte/websurf
+module unbewohnte/wecr

 go 1.18

src/logger/logger.go (2)

@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)

 	This program is free software: you can redistribute it and/or modify

src/main.go (40)

@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)

 	This program is free software: you can redistribute it and/or modify
@@ -24,14 +24,15 @@ import (
 	"fmt"
 	"io"
 	"log"
+	"net/url"
 	"os"
 	"os/signal"
 	"path/filepath"
 	"time"
-	"unbewohnte/websurf/config"
-	"unbewohnte/websurf/logger"
-	"unbewohnte/websurf/web"
-	"unbewohnte/websurf/worker"
+	"unbewohnte/wecr/config"
+	"unbewohnte/wecr/logger"
+	"unbewohnte/wecr/web"
+	"unbewohnte/wecr/worker"
 )

 const (
@@ -78,7 +79,7 @@ func init() {
 	if *version {
 		fmt.Printf(
-			"webscrape - scrape the web\n(c) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n\n",
+			"Webcrawl - crawl the web for data\n(c) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)\n\n",
 		)
 		os.Exit(0)
 	}
@@ -152,14 +153,24 @@ func main() {
 	}

 	// sanitize and correct inputs
-	if len(conf.InitialDomains) == 0 {
-		logger.Warning("No initial domain URLs have been set")
+	if len(conf.InitialPages) == 0 {
+		logger.Warning("No initial page URLs have been set")
 		return
-	} else if len(conf.InitialDomains) != 0 && conf.InitialDomains[0] == "" {
-		logger.Warning("No initial domain URLs have been set")
+	} else if len(conf.InitialPages) != 0 && conf.InitialPages[0] == "" {
+		logger.Warning("No initial page URLs have been set")
 		return
 	}

+	for index, blacklistedDomain := range conf.BlacklistedDomains {
+		parsedURL, err := url.Parse(blacklistedDomain)
+		if err != nil {
+			logger.Warning("Failed to parse blacklisted %s: %s", blacklistedDomain, err)
+			continue
+		}
+		conf.BlacklistedDomains[index] = parsedURL.Host
+	}
+
 	if conf.Depth <= 0 {
 		conf.Depth = 1
 		logger.Warning("Depth is <= 0. Set to %d", conf.Depth)
@@ -215,9 +226,9 @@ func main() {
 	results := make(chan web.Result, conf.Workers*5)

 	// create initial jobs
-	for _, initialDomain := range conf.InitialDomains {
+	for _, initialPage := range conf.InitialPages {
 		jobs <- web.Job{
-			URL: initialDomain,
+			URL: initialPage,
 			Search: conf.Search,
 			Depth: conf.Depth,
 		}
@@ -225,8 +236,9 @@ func main() {
 	// form a worker pool
 	workerPool := worker.NewWorkerPool(jobs, results, conf.Workers, worker.WorkerConf{
 		Requests: conf.Requests,
 		Save: conf.Save,
+		BlacklistedDomains: conf.BlacklistedDomains,
 	})

 	logger.Info("Created a worker pool with %d workers", conf.Workers)
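One thing worth noting about the sanitizing loop above: url.Parse only fills in Host when the entry carries a scheme, so this step effectively assumes blacklist entries are written as full URLs. A small standalone sketch (not part of the commit, domains illustrative) showing both cases:

package main

import (
	"fmt"
	"net/url"
)

func main() {
	// The sanitizing loop keeps only the host part of each blacklist entry.
	entries := []string{
		"https://ads.example.org/some/path", // with a scheme: Host is "ads.example.org"
		"ads.example.org",                   // bare domain: url.Parse puts it into Path, Host stays ""
	}
	for _, entry := range entries {
		parsed, err := url.Parse(entry)
		if err != nil {
			fmt.Printf("%q: parse error: %v\n", entry, err)
			continue
		}
		fmt.Printf("%q -> host %q\n", entry, parsed.Host)
	}
}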

src/web/images.go (4)

@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)

 	This program is free software: you can redistribute it and/or modify
@@ -22,7 +22,7 @@ import (
 	"bytes"
 	"net/url"
 	"strings"
-	"unbewohnte/websurf/logger"
+	"unbewohnte/wecr/logger"

 	"golang.org/x/net/html"
 )

src/web/job.go (4)

@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)

 	This program is free software: you can redistribute it and/or modify
@@ -18,7 +18,7 @@
 package web

-import "unbewohnte/websurf/config"
+import "unbewohnte/wecr/config"

 type Job struct {
 	URL string

src/web/requests.go (2)

@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)

 	This program is free software: you can redistribute it and/or modify

src/web/result.go (4)

@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)

 	This program is free software: you can redistribute it and/or modify
@@ -18,7 +18,7 @@
 package web

-import "unbewohnte/websurf/config"
+import "unbewohnte/wecr/config"

 type Result struct {
 	PageURL string

src/web/text.go (2)

@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)

 	This program is free software: you can redistribute it and/or modify

src/worker/pool.go (4)

@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)

 	This program is free software: you can redistribute it and/or modify
@@ -21,7 +21,7 @@ package worker
 import (
 	"sync"
 	"time"
-	"unbewohnte/websurf/web"
+	"unbewohnte/wecr/web"
 )

 type visited struct {

src/worker/worker.go (40)

@@ -1,5 +1,5 @@
 /*
-	websurf - surf the web for data recursively
+	Wecr - crawl the web for data
 	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)

 	This program is free software: you can redistribute it and/or modify
@@ -28,14 +28,15 @@ import (
 	"path/filepath"
 	"regexp"
 	"time"
-	"unbewohnte/websurf/config"
-	"unbewohnte/websurf/logger"
-	"unbewohnte/websurf/web"
+	"unbewohnte/wecr/config"
+	"unbewohnte/wecr/logger"
+	"unbewohnte/wecr/web"
 )

 type WorkerConf struct {
 	Requests config.Requests
 	Save config.Save
+	BlacklistedDomains []string
 }

 type Worker struct {
@@ -70,14 +71,31 @@ func (w *Worker) Work() {
 			return
 		}

+		// see if the domain is not blacklisted
+		var skip bool = false
+		parsedURL, err := url.Parse(job.URL)
+		if err != nil {
+			logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
+			continue
+		}
+		for _, blacklistedDomain := range w.Conf.BlacklistedDomains {
+			if parsedURL.Hostname() == blacklistedDomain {
+				skip = true
+				logger.Info("Skipping blacklisted %s", job.URL)
+				break
+			}
+		}
+		if skip {
+			continue
+		}
+
 		// check if it is the first occurence
 		w.visited.Lock.Lock()
-		var skip bool = false
 		for _, visitedURL := range w.visited.URLs {
 			if job.URL == visitedURL {
 				// okay, don't even bother. Move onto the next job
 				skip = true
-				logger.Info("Skipping %s", job.URL)
+				logger.Info("Skipping visited %s", job.URL)
 				w.visited.Lock.Unlock()
 				break
 			}
@@ -136,12 +154,6 @@ func (w *Worker) Work() {
 		case config.QueryImages:
 			// find image URLs, output data to the file
-			parsedURL, err := url.Parse(job.URL)
-			if err != nil {
-				logger.Error("Failed to parse URL \"%s\" to get hostname: %s", job.URL, err)
-				continue
-			}
-
 			imageLinks := web.FindPageImages(pageData, parsedURL.Host)
 			for count, imageLink := range imageLinks {
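A side note on the check above: the worker compares parsedURL.Hostname() against entries that main.go normalized with parsedURL.Host, and those two accessors differ only when a port is present. A standalone sketch (not part of the commit, URL illustrative) of the difference:

package main

import (
	"fmt"
	"net/url"
)

func main() {
	parsed, err := url.Parse("http://example.org:8080/page")
	if err != nil {
		panic(err)
	}
	// Host keeps the port (what the sanitizing loop in main.go stores),
	// Hostname strips it (what the worker compares against).
	fmt.Println(parsed.Host)       // example.org:8080
	fmt.Println(parsed.Hostname()) // example.org
}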
