You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
109 lines
2.5 KiB
109 lines
2.5 KiB
/*
	Wecr - crawl the web for data
	Copyright (C) 2022 Kasyanov Nikolay Alexeyevich (Unbewohnte)

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU Affero General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU Affero General Public License for more details.

	You should have received a copy of the GNU Affero General Public License
	along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
|
|
|
package web |
|
|
|
import ( |
|
"bufio" |
|
"bytes" |
|
"net/url" |
|
"regexp" |
|
"strings" |
|
|
|
"golang.org/x/net/html" |
|
) |
|
|
|
// ResolveLink turns a possibly relative link into an absolute one and returns
// it in string form.
//
// A link that already has a scheme is returned as is. Otherwise the scheme is
// set to "http" and, when the link carries no host of its own, fromHost is
// used as the host. The passed url is modified in place.
func ResolveLink(url *url.URL, fromHost string) string {
	if url.IsAbs() {
		// a scheme is present: nothing to fix
		return url.String()
	}

	// not absolute means the scheme is empty, so supply the default one
	url.Scheme = "http"

	if url.Host == "" {
		// the link is relative to the page it was found on
		url.Host = fromHost
	}

	return url.String()
}
|
|
|
// Find all links on page that are specified in <a> tag |
|
func FindPageLinks(pageBody []byte, from *url.URL) []string { |
|
var urls []string |
|
|
|
tokenizer := html.NewTokenizer(bytes.NewReader(pageBody)) |
|
for { |
|
tokenType := tokenizer.Next() |
|
|
|
switch tokenType { |
|
case html.ErrorToken: |
|
return urls |
|
|
|
case html.StartTagToken: |
|
token := tokenizer.Token() |
|
|
|
if token.Data != "a" { |
|
continue |
|
} |
|
|
|
// recheck |
|
for _, attribute := range token.Attr { |
|
if attribute.Key != "href" { |
|
continue |
|
} |
|
|
|
link, err := url.Parse(attribute.Val) |
|
if err != nil { |
|
break |
|
} |
|
|
|
urls = append(urls, ResolveLink(link, from.Host)) |
|
} |
|
} |
|
} |
|
} |
|
|
|
// IsTextOnPage reports whether text occurs in pageBody.
//
// The page is searched line by line, so text that spans a line break is never
// found. With ignoreCase set, both the page and text are lowercased before
// they are compared.
func IsTextOnPage(text string, ignoreCase bool, pageBody []byte) bool {
	scanner := bufio.NewScanner(bytes.NewReader(pageBody))
	// By default a Scanner gives up on lines longer than 64 KiB, which would
	// silently skip the rest of the page (minified pages are often a single
	// huge line). Allow a line to be as long as the whole page instead.
	scanner.Buffer(make([]byte, 0, 4096), len(pageBody)+1)

	if !ignoreCase {
		needle := []byte(text)
		for scanner.Scan() {
			if bytes.Contains(scanner.Bytes(), needle) {
				return true
			}
		}

		return false
	}

	// lowercase the needle once instead of on every line
	needle := strings.ToLower(text)
	for scanner.Scan() {
		if strings.Contains(strings.ToLower(scanner.Text()), needle) {
			return true
		}
	}

	return false
}
|
|
|
// FindPageRegexp returns every non-overlapping match of re in pageBody, in
// order of appearance. It returns nil when nothing matches.
func FindPageRegexp(re *regexp.Regexp, pageBody []byte) []string {
	found := re.FindAll(pageBody, -1)
	if found == nil {
		return nil
	}

	matches := make([]string, len(found))
	for i, match := range found {
		matches[i] = string(match)
	}

	return matches
}
|
|
|