Surf the web for data recursively
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

110 lines
2.5 KiB

2 years ago
/*
Wecr - crawl the web for data
Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package web
import (
"bufio"
"bytes"
"net/url"
2 years ago
"regexp"
"strings"
"golang.org/x/net/html"
)
// ResolveLink returns the string form of url, completing it first when it has
// no scheme: the scheme becomes "http" and, if the host is empty as well, the
// host becomes fromHost. A URL that already carries a scheme is returned
// untouched.
//
// Note that the completion is written into the URL that was passed in, so the
// caller's value is modified.
func ResolveLink(url *url.URL, fromHost string) string {
	// IsAbs is true exactly when the scheme is non-empty: nothing to repair.
	if url.IsAbs() {
		return url.String()
	}

	url.Scheme = "http"
	if url.Host == "" {
		url.Host = fromHost
	}

	return url.String()
}
// Find all links on page that are specified in <a> tag
func FindPageLinks(pageBody []byte, from *url.URL) []string {
2 years ago
var urls []string
tokenizer := html.NewTokenizer(bytes.NewReader(pageBody))
for {
tokenType := tokenizer.Next()
switch tokenType {
case html.ErrorToken:
return urls
case html.StartTagToken:
token := tokenizer.Token()
if token.Data != "a" {
continue
}
// recheck
for _, attribute := range token.Attr {
if attribute.Key != "href" {
continue
}
link, err := url.Parse(attribute.Val)
if err != nil {
break
}
urls = append(urls, ResolveLink(link, from.Host))
2 years ago
}
}
}
}
// IsTextOnPage reports whether text occurs within a single line of pageBody.
//
// The body is examined line by line, so a text that itself contains a line
// break is never found. When ignoreCase is true both the text and each line
// are lower-cased before comparison; otherwise the comparison is byte-exact.
// An empty pageBody yields false.
func IsTextOnPage(text string, ignoreCase bool, pageBody []byte) bool {
	scanner := bufio.NewScanner(bytes.NewReader(pageBody))
	// By default a Scanner stops with ErrTooLong at the first line longer than
	// bufio.MaxScanTokenSize (64 KiB), which would make the rest of a minified
	// page invisible. No line can be longer than the body itself, so allow
	// exactly that much (the limit must be strictly greater than the token).
	scanner.Buffer(nil, len(pageBody)+1)

	if !ignoreCase {
		needle := []byte(text)
		for scanner.Scan() {
			if bytes.Contains(scanner.Bytes(), needle) {
				return true
			}
		}
		return false
	}

	// Lower-case the needle once instead of once per line.
	needle := strings.ToLower(text)
	for scanner.Scan() {
		if strings.Contains(strings.ToLower(string(scanner.Bytes())), needle) {
			return true
		}
	}
	return false
}
// FindPageRegexp collects every non-overlapping match of re in pageBody and
// returns the matched texts in the order they appear. The result is nil when
// nothing matches.
func FindPageRegexp(re *regexp.Regexp, pageBody []byte) []string {
	hits := re.FindAll(pageBody, -1)
	if hits == nil {
		return nil
	}

	found := make([]string, 0, len(hits))
	for _, hit := range hits {
		found = append(found, string(hit))
	}
	return found
}