Surf the web for data recursively
/*
Wecr - crawl the web for data
Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package web

import (
	"bufio"
	"bytes"
	"fmt"
"regexp"
"strings"
"golang.org/x/net/html"
)
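
// FindPageLinks extracts the href targets of <a> tags in pageBody, prefixing
// links that do not contain hostname with it and adding an "http://" scheme
// where one is missing. The collected URLs are returned as a slice.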
func FindPageLinks(pageBody []byte, hostname string) []string {
	var urls []string

	tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
	for {
		tokenType := tokenizer.Next()

		switch tokenType {
		case html.ErrorToken:
			return urls

		case html.StartTagToken:
			token := tokenizer.Token()

			// only anchor tags are of interest
			if token.Data != "a" {
				continue
			}

			// look for the href attribute
			for _, attribute := range token.Attr {
				if attribute.Key != "href" {
					continue
				}

				var link string = attribute.Val

				if !strings.Contains(link, hostname) {
					// add hostname
					if strings.HasPrefix(link, "/") && strings.HasSuffix(hostname, "/") {
						link = fmt.Sprintf("%s%s", hostname, link[1:])
					} else if !strings.HasPrefix(link, "/") && !strings.HasSuffix(hostname, "/") {
						link = fmt.Sprintf("%s/%s", hostname, link)
					} else {
						link = fmt.Sprintf("%s%s", hostname, link)
					}
				}

				link = strings.TrimPrefix(link, "//")

				if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
					// add scheme
					link = "http://" + link
				}

				urls = append(urls, link)
			}
		}
	}
}

// Tries to find a certain string in the page. Returns true if the string has been found.
func IsTextOnPage(text string, ignoreCase bool, pageBody []byte) bool {
	scanner := bufio.NewScanner(bytes.NewReader(pageBody))
	for scanner.Scan() {
		lineBytes := scanner.Bytes()

		if !ignoreCase {
			if bytes.Contains(lineBytes, []byte(text)) {
				return true
			}
		} else {
			if strings.Contains(strings.ToLower(string(lineBytes)), strings.ToLower(text)) {
				return true
			}
		}
	}

	return false
}

// Tries to find strings matching the given regexp in the page. Returns a slice of all matches found.
func FindPageRegexp(re *regexp.Regexp, pageBody []byte) []string {
	return re.FindAllString(string(pageBody), -1)
}
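
// exampleUsage is an illustrative sketch and not part of the original file: it
// shows how the helpers above could be combined on an already-fetched page body.
// The hostname, search phrase and pattern below are hypothetical placeholders.
func exampleUsage(pageBody []byte) {
	// collect the page's links, resolved against an assumed hostname
	links := FindPageLinks(pageBody, "example.com")

	// case-insensitively check whether a phrase occurs anywhere on the page
	found := IsTextOnPage("download", true, pageBody)

	// pull out every absolute URL mentioned in the raw page text
	absolute := FindPageRegexp(regexp.MustCompile(`https?://[^\s"'<>]+`), pageBody)

	fmt.Println(links, found, absolute)
}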