extractData flag

master · v0.2.1
commit 7dbf55cb32
7 files changed:

 .gitignore                     |  1
 Makefile                       |  9
 README.md                      | 11
 src/config/config.go           |  2
 src/main.go                    | 19
 src/utilities/dataextractor.go | 78
 src/web/text.go                |  8
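In short: this commit adds a one-shot `-extractData` mode. Pointing it at a previously produced `output.json` makes wecr write every collected data entry, one per line and de-duplicated, into `extracted_data.txt` and exit without crawling. Along the way it raises the default request wait timeout from 1500 ms to 2500 ms, swaps the e-mail regexp for a stricter RFC-flavored one backed by a live MX-record check, teaches the Makefile's `clean` and `release` targets to manage the release directory, and ignores the new output file.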

.gitignore (vendored): 1 change

@@ -7,3 +7,4 @@ conf_mega_ita.json
 wecr
 release/
 scraped/
+extracted_data.txt

Makefile: 9 changes

@@ -4,6 +4,9 @@ TESTDIR:=testing
 RELEASEDIR:=release
 LICENSE:=COPYING
 README:=README.md
+UTILITIESDIR:=utilities
+UTILITYEXTRACTOR:=extractor
+BUILDDIR:=build
 LINUXDIR:=$(EXE)_linux
 WINDIR:=$(EXE)_windows

@@ -17,7 +20,7 @@ WINDIR64:=$(WINDIR)_x64
 DARWINDIR64:=$(DARWINDIR)_x64
-all:
+all: clean
 	cd $(SRCDIR) && go build && mv $(EXE) ..
 test: all

@@ -27,9 +30,9 @@ test: all
 	cp conf.json $(TESTDIR)
 clean:
-	rm -rf $(TESTDIR) $(EXE)
+	rm -rf $(TESTDIR) $(EXE) $(RELEASEDIR)
-release:
+release: clean
 	rm -rf $(RELEASEDIR)
 	mkdir -p $(RELEASEDIR)/$(LINUXDIR64)
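A note on the build changes: `all` and `release` now depend on `clean`, and `clean` additionally removes `$(RELEASEDIR)`, so both regular builds and release bundles always start from a scrubbed tree. The new `UTILITIESDIR` and `UTILITYEXTRACTOR` variables hint that the data extractor could also ship as a standalone utility, though no target using them is visible in the captured hunks.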

README.md: 11 changes

@@ -6,7 +6,7 @@ Just a simple HTML web spider with minimal dependencies. It is possible to searc
 ## Configuration
-The flow of work fully depends on the configuration file. By default `conf.json` is used as the configuration file, but the name can be changed via the `-conf` flag. The default configuration is embedded in the program, so on the first launch, or if the file is simply deleted, a new `conf.json` will be created in the same directory as the executable itself, unless the `-wDir` (working directory) flag is set to some other value.
+The flow of work fully depends on the configuration file. By default `conf.json` is used as the configuration file, but the name can be changed via the `-conf` flag. The default configuration is embedded in the program, so on the first launch, or if the file is simply deleted, a new `conf.json` will be created in the same directory as the executable itself, unless the `-wDir` (working directory) flag is set to some other value. To see all available flags, run `wecr -h`.
 The configuration is split into different branches like `requests` (how requests are made, i.e. request timeout, wait time, user agent), `logging` (use logs, output to a file), `save` (output file|directory, save pages or not) or `search` (use regexp, query string), each of which contains tweakable parameters. There are global ones as well, such as `workers` (working threads that make requests in parallel) and `depth` (literally, how deep the recursive search should go). The names are simple and self-explanatory, so no attribute-by-attribute explanation is needed for most of them.

@@ -28,12 +28,11 @@ When `is_regexp` is enabled, the `query` is treated as a regexp string and pages
 By default, if the query is not one of the special values, all matches and other data are written to the `output.json` file as separate, concatenated JSON objects, but if `save_pages` is set to `true` and/or `query` is set to `images`, `videos`, `audio`, etc., the additional contents are put in the corresponding directories inside `output_dir`, which is created by the executable itself.
-## TODO
+## Build
-- **PARSE HTML WITH REGEXP (_EVIL LAUGH_)** - Done
-- Search for videos - Done
-- Search for audio - Done
-- Search for documents
+If you're on *nix - it's as easy as `make`.
+Otherwise - `go build` in the `src` directory to build `wecr`.
 ## License
 AGPLv3

src/config/config.go: 2 changes

@@ -89,7 +89,7 @@ func Default() *Conf {
 		},
 		Requests: Requests{
 			UserAgent:             "",
-			RequestWaitTimeoutMs:  1500,
+			RequestWaitTimeoutMs:  2500,
 			RequestPauseMs:        100,
 			ContentFetchTimeoutMs: 0,
 		},
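The only functional change here is a more patient default: wecr now waits up to 2.5 s (up from 1.5 s) for a response before giving up. A minimal, self-contained sketch of how millisecond fields like these are commonly applied, assuming the struct layout implied by the diff; the uint64 field types and the http.Client wiring are my illustration, since wecr's actual client code is not part of this commit:

package main

import (
	"fmt"
	"net/http"
	"time"
)

// Requests mirrors the config branch visible in the diff above;
// the uint64 millisecond types are an assumption for illustration.
type Requests struct {
	UserAgent             string
	RequestWaitTimeoutMs  uint64
	RequestPauseMs        uint64
	ContentFetchTimeoutMs uint64
}

// newClient shows one common way such millisecond fields are used:
// as an overall per-request deadline on an http.Client (0 disables it).
func newClient(conf Requests) *http.Client {
	return &http.Client{
		Timeout: time.Duration(conf.RequestWaitTimeoutMs) * time.Millisecond,
	}
}

func main() {
	client := newClient(Requests{RequestWaitTimeoutMs: 2500, RequestPauseMs: 100})
	fmt.Println(client.Timeout) // 2.5s
}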

src/main.go: 19 additions

@@ -32,6 +32,7 @@ import (
 	"time"
 	"unbewohnte/wecr/config"
 	"unbewohnte/wecr/logger"
+	"unbewohnte/wecr/utilities"
 	"unbewohnte/wecr/web"
 	"unbewohnte/wecr/worker"
 )

@@ -41,6 +42,7 @@ const version = "v0.2.1"
 const (
 	defaultConfigFile           string = "conf.json"
 	defaultOutputFile           string = "output.json"
+	defaultPrettifiedOutputFile string = "extracted_data.txt"
 )
 var (

@@ -64,6 +66,11 @@ var (
 		"Output file name to output information into",
 	)
+	extractDataFilename = flag.String(
+		"extractData", "",
+		"Set filename for output JSON file and extract data from it, put each entry nicely on a new line in a new file, then exit",
+	)
 	workingDirectory string
 	configFilePath   string
 	outputFilePath   string

@@ -113,6 +120,18 @@ func init() {
 	logger.Info("Working in \"%s\"", workingDirectory)
+	// extract data if needed
+	if strings.TrimSpace(*extractDataFilename) != "" {
+		logger.Info("Extracting data from %s...", *extractDataFilename)
+		err := utilities.ExtractDataFromOutput(*extractDataFilename, defaultPrettifiedOutputFile, "\n", false)
+		if err != nil {
+			logger.Error("Failed to extract data from %s: %s", *extractDataFilename, err)
+			os.Exit(1)
+		}
+		logger.Info("Outputted \"%s\"", defaultPrettifiedOutputFile)
+		os.Exit(0)
+	}
 	// global path to configuration file
 	configFilePath = filepath.Join(workingDirectory, *configFile)
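Taken together, the new flag turns wecr into a one-shot formatter: running `wecr -extractData output.json` decodes the given results file, writes each unique data entry on its own line to `extracted_data.txt` (the new `defaultPrettifiedOutputFile`), and exits before any crawling machinery starts. Duplicates are dropped because `ExtractDataFromOutput` is called with `keepDuplicates` set to `false`.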

src/utilities/dataextractor.go: 78 additions (new file)

@@ -0,0 +1,78 @@
/*
	Wecr - crawl the web for data
	Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte)

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU Affero General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU Affero General Public License for more details.

	You should have received a copy of the GNU Affero General Public License
	along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package utilities

import (
	"encoding/json"
	"fmt"
	"io"
	"os"

	"unbewohnte/wecr/web"
)

// Extracts data from the output JSON file and puts it in a new file with separators between each entry
func ExtractDataFromOutput(inputFilename string, outputFilename string, separator string, keepDuplicates bool) error {
	inputFile, err := os.Open(inputFilename)
	if err != nil {
		return err
	}
	defer inputFile.Close()

	outputFile, err := os.Create(outputFilename)
	if err != nil {
		return err
	}
	defer outputFile.Close()

	var processedData []string

	decoder := json.NewDecoder(inputFile)
	for {
		var result web.Result

		err := decoder.Decode(&result)
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}

		for _, dataEntry := range result.Data {
			var skip = false
			if !keepDuplicates {
				for _, processedEntry := range processedData {
					if dataEntry == processedEntry {
						skip = true
						break
					}
				}

				if skip {
					continue
				}

				processedData = append(processedData, dataEntry)
			}

			outputFile.WriteString(fmt.Sprintf("%s%s", dataEntry, separator))
		}
	}

	return nil
}
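The function leans on a convenient property of `encoding/json`: a `json.Decoder` consumes a stream of concatenated JSON objects, one `Decode` call per object, which matches the "separate, concatenated JSON objects" format the README describes for `output.json`. Below is a self-contained sketch of the same pattern; `Result` is a stand-in for `web.Result`, whose full definition is not part of this commit (only its `Data` field is used above), and the `data` JSON key is an assumption:

package main

import (
	"encoding/json"
	"fmt"
	"io"
	"strings"
)

// Result stands in for web.Result; only the Data field is implied by the diff.
type Result struct {
	Data []string `json:"data"`
}

func main() {
	// Two concatenated JSON objects, the shape wecr appends to output.json.
	stream := `{"data":["a@example.com","b@example.com"]}
{"data":["a@example.com"]}`

	decoder := json.NewDecoder(strings.NewReader(stream))
	seen := map[string]bool{}
	for {
		var r Result
		if err := decoder.Decode(&r); err == io.EOF {
			break
		} else if err != nil {
			panic(err)
		}
		for _, entry := range r.Data {
			if seen[entry] { // same effect as keepDuplicates=false above
				continue
			}
			seen[entry] = true
			fmt.Println(entry)
		}
	}
}

One design note: the committed code checks for duplicates by scanning a slice, which is quadratic in the number of entries; the sketch uses a map for constant-time lookups, a worthwhile swap if outputs grow large.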

src/web/text.go: 8 changes

@@ -34,9 +34,9 @@ var tagHrefRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(href)[\s]*=[\s]*("|'
 // matches src="link" or even something along the lines of SrC = 'link'
 var tagSrcRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(src)[\s]*=[\s]*("|')(.*?)("|')`)
 // var emailRegexp *regexp.Regexp = regexp.MustCompile(`[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}`)
-var emailRegexp *regexp.Regexp = regexp.MustCompile(`[A-Za-z0-9._%+\-!%&?~^#$]+@[A-Za-z0-9.\-]+\.[a-zA-Z]{2,4}`)
+var emailRegexp *regexp.Regexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
 // var emailRegexp *regexp.Regexp = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")

@@ -158,8 +158,8 @@ func FindPageEmailsWithCheck(pageBody []byte) []string {
 	emailAddresses := FindPageEmails(pageBody)
 	for _, email := range emailAddresses {
-		_, err := net.LookupMX(strings.Split(email, "@")[1])
-		if err != nil {
+		mx, err := net.LookupMX(strings.Split(email, "@")[1])
+		if err != nil || len(mx) == 0 {
 			continue
 		}
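Two separate tightenings happen here: the e-mail regexp moves from an ad-hoc character class to an RFC-flavored pattern, and `FindPageEmailsWithCheck` now requires the domain to actually advertise at least one mail exchanger rather than merely not erroring on lookup. A standalone sketch of that second check; the helper name `hasMailServer` is mine, not wecr's API:

package main

import (
	"fmt"
	"net"
	"strings"
)

// hasMailServer reports whether the domain part of an email address
// advertises at least one MX record. Illustrative helper, not wecr code.
func hasMailServer(email string) bool {
	parts := strings.Split(email, "@")
	if len(parts) != 2 {
		return false
	}
	mx, err := net.LookupMX(parts[1])
	// both conditions from the diff: lookup succeeds AND returns records
	return err == nil && len(mx) > 0
}

func main() {
	fmt.Println(hasMailServer("user@gmail.com"))       // likely true
	fmt.Println(hasMailServer("user@invalid.invalid")) // false
}

Note that `net.LookupMX` performs a live DNS query, so this check needs network access and adds latency per unique domain encountered during a crawl.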
