wecr/src/web/text.go

/*
	Wecr - crawl the web for data
	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU Affero General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU Affero General Public License for more details.

	You should have received a copy of the GNU Affero General Public License
	along with this program.  If not, see <https://www.gnu.org/licenses/>.
*/

package web

import (
	"bufio"
	"bytes"
	"net/url"
	"regexp"
	"strings"

	"golang.org/x/net/html"
)

// ResolveLink returns url as an absolute link in string form.
//
// A URL that already carries a scheme is returned unchanged. Otherwise the
// scheme is set to "http" and, when the URL has no host of its own, the host
// is set to fromHost. Note that these fields are written on the URL that was
// passed in, so the caller's value is modified.
func ResolveLink(url *url.URL, fromHost string) string {
	// IsAbs is true exactly when the scheme is non-empty: nothing to fix.
	if url.IsAbs() {
		return url.String()
	}

	// No scheme at this point, so fall back to plain http.
	url.Scheme = "http"

	// Keep an explicit host (e.g. "//cdn.example.org/x"); borrow one otherwise.
	if url.Host == "" {
		url.Host = fromHost
	}

	return url.String()
}

// FindPageLinks collects the href targets of all <a> tags in pageBody and
// returns them as absolute URL strings, in document order.
//
// from is the URL the page was fetched from. Every href is resolved against it
// following the usual reference-resolution rules, so relative links inherit the
// page's scheme and are interpreted relative to the page's path (for example
// "next.html" found on "https://host/dir/index.html" becomes
// "https://host/dir/next.html"). If from itself lacks a scheme or host, the
// result is completed by ResolveLink. A tag whose href cannot be parsed as a
// URL is skipped. The returned slice is nil when the page has no links.
func FindPageLinks(pageBody []byte, from *url.URL) []string {
	var urls []string

	tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			// The tokenizer is done (end of input or an error): return what we have.
			return urls

		case html.StartTagToken, html.SelfClosingTagToken:
			// Self-closing anchors (<a href="..."/>) carry links too.
			token := tokenizer.Token()
			if token.Data != "a" {
				continue
			}

			for _, attribute := range token.Attr {
				if attribute.Key != "href" {
					continue
				}

				link, err := url.Parse(attribute.Val)
				if err != nil {
					// Unparsable href: stop looking at this tag.
					break
				}

				// Resolve against the full page URL first; ResolveReference
				// returns a fresh URL, so ResolveLink may safely patch it when
				// from did not provide a scheme or host.
				resolved := from.ResolveReference(link)
				urls = append(urls, ResolveLink(resolved, from.Host))
			}
		}
	}
}

// IsTextOnPage reports whether text occurs within a single line of pageBody.
//
// The page is examined line by line, so a text that itself spans a line break
// is never reported as found. When ignoreCase is true, both the line and text
// are lowercased before they are compared; otherwise the comparison is exact.
// Lines of any length are searched.
func IsTextOnPage(text string, ignoreCase bool, pageBody []byte) bool {
	scanner := bufio.NewScanner(bytes.NewReader(pageBody))
	// By default a Scanner stops (Scan returns false) at the first line longer
	// than 64 KiB, which silently hid matches on minified pages. Let a single
	// line be as large as the whole body; the extra byte keeps the limit
	// strictly above the longest possible line.
	scanner.Buffer(nil, len(pageBody)+1)

	if ignoreCase {
		// Lowercase the needle once instead of once per line.
		lowered := strings.ToLower(text)
		for scanner.Scan() {
			if strings.Contains(strings.ToLower(scanner.Text()), lowered) {
				return true
			}
		}
		return false
	}

	needle := []byte(text)
	for scanner.Scan() {
		if bytes.Contains(scanner.Bytes(), needle) {
			return true
		}
	}

	return false
}

// FindPageRegexp returns every successive non-overlapping match of re in
// pageBody, in the order the matches occur. The result is nil when nothing
// on the page matches.
func FindPageRegexp(re *regexp.Regexp, pageBody []byte) []string {
	found := re.FindAll(pageBody, -1)
	if found == nil {
		return nil
	}

	matches := make([]string, len(found))
	for i, match := range found {
		matches[i] = string(match)
	}

	return matches
}
Initial commit 2 years ago			`/*`
Blacklisting domains 2 years ago			`Wecr - crawl the web for data`
Initial commit 2 years ago			`Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)`

			`This program is free software: you can redistribute it and/or modify`
			`it under the terms of the GNU Affero General Public License as published by`
			`the Free Software Foundation, either version 3 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU Affero General Public License for more details.`

			`You should have received a copy of the GNU Affero General Public License`
			`along with this program. If not, see <https://www.gnu.org/licenses/>.`
			`*/`

			`package web`

			`import (`
			`"bufio"`
			`"bytes"`
Fixed a terrible logical "bug" in worker`s Work function; Added comments; Added more image extentions; Merged and improved ResolveLink function 2 years ago			`"net/url"`
Initial commit 2 years ago			`"regexp"`
			`"strings"`

			`"golang.org/x/net/html"`
			`)`

Fixed a terrible logical "bug" in worker`s Work function; Added comments; Added more image extentions; Merged and improved ResolveLink function 2 years ago			`// Fix relative link and construct an absolute one. Does nothing if the URL already looks alright`
			`func ResolveLink(url *url.URL, fromHost string) string {`
			`if !url.IsAbs() {`
			`if url.Scheme == "" {`
			`// add scheme`
			`url.Scheme = "http"`
			`}`

			`if url.Host == "" {`
			`// add host`
			`url.Host = fromHost`

			`}`
			`}`

			`return url.String()`
			`}`

			`// Find all links on page that are specified in <a> tag`
			`func FindPageLinks(pageBody []byte, from *url.URL) []string {`
Initial commit 2 years ago			`var urls []string`

			`tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))`
			`for {`
			`tokenType := tokenizer.Next()`

			`switch tokenType {`
			`case html.ErrorToken:`
			`return urls`

			`case html.StartTagToken:`
			`token := tokenizer.Token()`

			`if token.Data != "a" {`
			`continue`
			`}`

			`// recheck`
			`for _, attribute := range token.Attr {`
			`if attribute.Key != "href" {`
			`continue`
			`}`

Fixed a terrible logical "bug" in worker`s Work function; Added comments; Added more image extentions; Merged and improved ResolveLink function 2 years ago			`link, err := url.Parse(attribute.Val)`
			`if err != nil {`
			`break`
Fixed relative links not being resolved 2 years ago			`}`

Fixed a terrible logical "bug" in worker`s Work function; Added comments; Added more image extentions; Merged and improved ResolveLink function 2 years ago			`urls = append(urls, ResolveLink(link, from.Host))`
Initial commit 2 years ago			`}`
			`}`
			`}`
			`}`

			`// Tries to find a certain string in page. Returns true if such string has been found`
			`func IsTextOnPage(text string, ignoreCase bool, pageBody []byte) bool {`
			`scanner := bufio.NewScanner(bytes.NewReader(pageBody))`

			`for scanner.Scan() {`
			`lineBytes := scanner.Bytes()`

			`if !ignoreCase {`
			`if bytes.Contains(lineBytes, []byte(text)) {`
			`return true`
			`}`
			`} else {`
			`if strings.Contains(strings.ToLower(string(lineBytes)), strings.ToLower(text)) {`
			`return true`
			`}`
			`}`
			`}`

			`return false`
			`}`

			`// Tries to find a string matching given regexp in page. Returns an array of found`
			`func FindPageRegexp(re *regexp.Regexp, pageBody []byte) []string {`
			`return re.FindAllString(string(pageBody), -1)`
			`}`