/* Wecr - crawl the web for data Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ package web import ( "bufio" "bytes" "net/url" "regexp" "strings" "golang.org/x/net/html" ) // Fix relative link and construct an absolute one. Does nothing if the URL already looks alright func ResolveLink(url *url.URL, fromHost string) string { if !url.IsAbs() { if url.Scheme == "" { // add scheme url.Scheme = "http" } if url.Host == "" { // add host url.Host = fromHost } } return url.String() } // Find all links on page that are specified in tag func FindPageLinks(pageBody []byte, from *url.URL) []string { var urls []string tokenizer := html.NewTokenizer(bytes.NewReader(pageBody)) for { tokenType := tokenizer.Next() switch tokenType { case html.ErrorToken: return urls case html.StartTagToken: token := tokenizer.Token() if token.Data != "a" { continue } // recheck for _, attribute := range token.Attr { if attribute.Key != "href" { continue } link, err := url.Parse(attribute.Val) if err != nil { break } urls = append(urls, ResolveLink(link, from.Host)) } } } } // Tries to find a certain string in page. 
// Returns true if such string has been found.
//
// The page is scanned line by line, so text that itself contains a line break
// is never matched. With ignoreCase set, both the line and text are lowercased
// before the comparison.
func IsTextOnPage(text string, ignoreCase bool, pageBody []byte) bool {
	scanner := bufio.NewScanner(bytes.NewReader(pageBody))
	// The default scanner limit is 64 KiB per line; longer lines (e.g. minified
	// pages) would stop the scan early and hide any text on or after them.
	// Allow a single line to be as long as the whole page.
	scanner.Buffer(make([]byte, 0, 4096), len(pageBody)+1)

	if ignoreCase {
		loweredText := strings.ToLower(text)
		for scanner.Scan() {
			if strings.Contains(strings.ToLower(scanner.Text()), loweredText) {
				return true
			}
		}
		return false
	}

	needle := []byte(text)
	for scanner.Scan() {
		if bytes.Contains(scanner.Bytes(), needle) {
			return true
		}
	}

	return false
}

// FindPageRegexp tries to find strings matching given regexp in page.
// Returns an array of found matches; the result is nil when nothing matches
// or when re is nil.
func FindPageRegexp(re *regexp.Regexp, pageBody []byte) []string {
	if re == nil {
		return nil
	}

	return re.FindAllString(string(pageBody), -1)
}