/*
	Wecr - crawl the web for data
	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU Affero General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU Affero General Public License for more details.

	You should have received a copy of the GNU Affero General Public License
	along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package web

import (
	"bufio"
	"bytes"
	"fmt"
	"regexp"
	"strings"

	"golang.org/x/net/html"
)

// FindPageLinks looks for anchor tags in the page body and returns the
// discovered links, resolving relative ones against the given hostname
// and prepending a default scheme where one is missing
func FindPageLinks(pageBody []byte, hostname string) []string {
	var urls []string

	tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
	for {
		tokenType := tokenizer.Next()

		switch tokenType {
		case html.ErrorToken:
			// end of the page body (or a tokenization error) - done
			return urls

		case html.StartTagToken:
			token := tokenizer.Token()

			// only anchor tags carry the links we are interested in
			if token.Data != "a" {
				continue
			}

			// look for the href attribute
			for _, attribute := range token.Attr {
				if attribute.Key != "href" {
					continue
				}

				link := attribute.Val

				if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") && !strings.HasPrefix(link, "//") {
					// a relative link - prepend the hostname
					if strings.HasPrefix(link, "/") && strings.HasSuffix(hostname, "/") {
						link = fmt.Sprintf("%s%s", hostname, link[1:])
					} else if !strings.HasPrefix(link, "/") && !strings.HasSuffix(hostname, "/") {
						link = fmt.Sprintf("%s/%s", hostname, link)
					} else {
						link = fmt.Sprintf("%s%s", hostname, link)
					}
				}

				// scheme-relative links only lack the scheme itself
				link = strings.TrimPrefix(link, "//")

				if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
					// add a default scheme
					link = "http://" + link
				}

				urls = append(urls, link)
			}
		}
	}
}
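
// A minimal usage sketch for FindPageLinks; the page body and hostname below
// are made-up example values, not something used elsewhere in the project:
//
//	body := []byte(`<a href="/about">About</a> <a href="docs/index.html">Docs</a>`)
//	for _, link := range FindPageLinks(body, "example.com") {
//		fmt.Println(link) // http://example.com/about, then http://example.com/docs/index.html
//	}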

// IsTextOnPage reports whether the given text occurs in the page body,
// optionally ignoring case
func IsTextOnPage(text string, ignoreCase bool, pageBody []byte) bool {
	scanner := bufio.NewScanner(bytes.NewReader(pageBody))

	// scan the page line by line
	for scanner.Scan() {
		lineBytes := scanner.Bytes()

		if !ignoreCase {
			if bytes.Contains(lineBytes, []byte(text)) {
				return true
			}
		} else {
			if strings.Contains(strings.ToLower(string(lineBytes)), strings.ToLower(text)) {
				return true
			}
		}
	}

	return false
}
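
// An illustrative call to IsTextOnPage; the values are arbitrary examples:
//
//	found := IsTextOnPage("Contact us", true, []byte("<p>contact US today</p>"))
//	// found == true, since the comparison ignores case here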

// FindPageRegexp returns all non-overlapping matches of the given regular
// expression found in the page body
func FindPageRegexp(re *regexp.Regexp, pageBody []byte) []string {
	return re.FindAllString(string(pageBody), -1)
}
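
// An illustrative call to FindPageRegexp; the pattern and page body are
// arbitrary examples:
//
//	re := regexp.MustCompile(`[\w.+-]+@[\w.-]+`)
//	emails := FindPageRegexp(re, []byte("mail admin@example.com or info@example.com"))
//	// emails == []string{"admin@example.com", "info@example.com"}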