Kasianov Nikolai Alekseevich
2 years ago
13 changed files with 605 additions and 152 deletions
@ -1,2 +0,0 @@ |
|||||||
golang.org/x/net v0.4.0 h1:Q5QPcMlvfxFTAPV0+07Xz/MpK9NTXu2VDUuy0FeMfaU= |
|
||||||
golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= |
|
@ -0,0 +1,118 @@ |
|||||||
|
/* |
||||||
|
Wecr - crawl the web for data |
||||||
|
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) |
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify |
||||||
|
it under the terms of the GNU Affero General Public License as published by |
||||||
|
the Free Software Foundation, either version 3 of the License, or |
||||||
|
(at your option) any later version. |
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful, |
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||||
|
GNU Affero General Public License for more details. |
||||||
|
|
||||||
|
You should have received a copy of the GNU Affero General Public License |
||||||
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*/ |
||||||
|
|
||||||
|
package web |
||||||
|
|
||||||
|
import ( |
||||||
|
"net/url" |
||||||
|
"strings" |
||||||
|
) |
||||||
|
|
||||||
|
func hasAudioExtention(url string) bool { |
||||||
|
for _, extention := range AudioExtentions { |
||||||
|
if strings.HasSuffix(url, extention) { |
||||||
|
return true |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
return false |
||||||
|
} |
||||||
|
|
||||||
|
// Tries to find audio URLs on the page
|
||||||
|
func FindPageAudio(pageBody []byte, from *url.URL) []string { |
||||||
|
var urls []string |
||||||
|
|
||||||
|
// for every element that has "src" attribute
|
||||||
|
for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { |
||||||
|
var linkStartIndex int |
||||||
|
var linkEndIndex int |
||||||
|
|
||||||
|
linkStartIndex = strings.Index(match, "\"") |
||||||
|
if linkStartIndex == -1 { |
||||||
|
linkStartIndex = strings.Index(match, "'") |
||||||
|
if linkStartIndex == -1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
|
||||||
|
linkEndIndex = strings.LastIndex(match, "'") |
||||||
|
if linkEndIndex == -1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
} else { |
||||||
|
linkEndIndex = strings.LastIndex(match, "\"") |
||||||
|
if linkEndIndex == -1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
if linkEndIndex <= linkStartIndex+1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
|
||||||
|
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) |
||||||
|
if err != nil { |
||||||
|
continue |
||||||
|
} |
||||||
|
|
||||||
|
linkResolved := ResolveLink(link, from.Host) |
||||||
|
if hasAudioExtention(linkResolved) { |
||||||
|
urls = append(urls, linkResolved) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// for every "a" element as well
|
||||||
|
for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { |
||||||
|
var linkStartIndex int |
||||||
|
var linkEndIndex int |
||||||
|
|
||||||
|
linkStartIndex = strings.Index(match, "\"") |
||||||
|
if linkStartIndex == -1 { |
||||||
|
linkStartIndex = strings.Index(match, "'") |
||||||
|
if linkStartIndex == -1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
|
||||||
|
linkEndIndex = strings.LastIndex(match, "'") |
||||||
|
if linkEndIndex == -1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
} else { |
||||||
|
linkEndIndex = strings.LastIndex(match, "\"") |
||||||
|
if linkEndIndex == -1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
if linkEndIndex <= linkStartIndex+1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
|
||||||
|
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) |
||||||
|
if err != nil { |
||||||
|
continue |
||||||
|
} |
||||||
|
|
||||||
|
linkResolved := ResolveLink(link, from.Host) |
||||||
|
if hasAudioExtention(linkResolved) { |
||||||
|
urls = append(urls, linkResolved) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// return discovered mutual video urls
|
||||||
|
return urls |
||||||
|
} |
@ -0,0 +1,88 @@ |
|||||||
|
/* |
||||||
|
Wecr - crawl the web for data |
||||||
|
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) |
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify |
||||||
|
it under the terms of the GNU Affero General Public License as published by |
||||||
|
the Free Software Foundation, either version 3 of the License, or |
||||||
|
(at your option) any later version. |
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful, |
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||||
|
GNU Affero General Public License for more details. |
||||||
|
|
||||||
|
You should have received a copy of the GNU Affero General Public License |
||||||
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*/ |
||||||
|
|
||||||
|
package web |
||||||
|
|
||||||
|
// AudioExtentions lists the file name suffixes (leading dot included)
// that are treated as audio files.
var AudioExtentions = []string{
	".3gp",
	".aa",
	".aac",
	".aax",
	".act",
	".aiff",
	".alac",
	".amr",
	".ape",
	".au",
	".flac",
	".m4a",
	".mp3",
	".mpc",
	".msv",
	".ogg",
	".oga",
	".mogg",
	".opus",
	".tta",
	".wav",
	".cda",
}

// ImageExtentions lists the file name suffixes (leading dot included)
// that are treated as image files.
var ImageExtentions = []string{
	".jpeg",
	".jpg",
	".jpe",
	".jfif",
	".png",
	".ppm",
	".svg",
	".gif",
	".tiff",
	".bmp",
	".webp",
	".ico",
	".kra",
	".bpg",
	".drw",
	".tga",
}

// VideoExtentions lists the file name suffixes (leading dot included)
// that are treated as video files.
var VideoExtentions = []string{
	".webm",
	".mkv",
	".flv",
	".wmv",
	".avi",
	".yuv",
	".mp2",
	".mp4",
	".mpeg",
	".mpg",
	".mpv",
	".m4v",
	".3gp",
	".3g2",
	".nsv",
	".vob",
	".ogv",
}

// DocumentExtentions lists the file name suffixes that are treated as
// documents.
//
// NOTE(review): the list currently holds only an empty string, which looks
// like a placeholder. A suffix check against "" succeeds for every input,
// so fill this in (or empty it) before using it for matching.
var DocumentExtentions = []string{
	"",
}
@ -0,0 +1,118 @@ |
|||||||
|
/* |
||||||
|
Wecr - crawl the web for data |
||||||
|
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) |
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify |
||||||
|
it under the terms of the GNU Affero General Public License as published by |
||||||
|
the Free Software Foundation, either version 3 of the License, or |
||||||
|
(at your option) any later version. |
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful, |
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||||
|
GNU Affero General Public License for more details. |
||||||
|
|
||||||
|
You should have received a copy of the GNU Affero General Public License |
||||||
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*/ |
||||||
|
|
||||||
|
package web |
||||||
|
|
||||||
|
import ( |
||||||
|
"net/url" |
||||||
|
"strings" |
||||||
|
) |
||||||
|
|
||||||
|
func hasVideoExtention(url string) bool { |
||||||
|
for _, extention := range VideoExtentions { |
||||||
|
if strings.HasSuffix(url, extention) { |
||||||
|
return true |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
return false |
||||||
|
} |
||||||
|
|
||||||
|
// Tries to find videos' URLs on the page
|
||||||
|
func FindPageVideos(pageBody []byte, from *url.URL) []string { |
||||||
|
var urls []string |
||||||
|
|
||||||
|
// for every element that has "src" attribute
|
||||||
|
for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { |
||||||
|
var linkStartIndex int |
||||||
|
var linkEndIndex int |
||||||
|
|
||||||
|
linkStartIndex = strings.Index(match, "\"") |
||||||
|
if linkStartIndex == -1 { |
||||||
|
linkStartIndex = strings.Index(match, "'") |
||||||
|
if linkStartIndex == -1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
|
||||||
|
linkEndIndex = strings.LastIndex(match, "'") |
||||||
|
if linkEndIndex == -1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
} else { |
||||||
|
linkEndIndex = strings.LastIndex(match, "\"") |
||||||
|
if linkEndIndex == -1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
if linkEndIndex <= linkStartIndex+1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
|
||||||
|
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) |
||||||
|
if err != nil { |
||||||
|
continue |
||||||
|
} |
||||||
|
|
||||||
|
linkResolved := ResolveLink(link, from.Host) |
||||||
|
if hasVideoExtention(linkResolved) { |
||||||
|
urls = append(urls, linkResolved) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// for every "a" element as well
|
||||||
|
for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { |
||||||
|
var linkStartIndex int |
||||||
|
var linkEndIndex int |
||||||
|
|
||||||
|
linkStartIndex = strings.Index(match, "\"") |
||||||
|
if linkStartIndex == -1 { |
||||||
|
linkStartIndex = strings.Index(match, "'") |
||||||
|
if linkStartIndex == -1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
|
||||||
|
linkEndIndex = strings.LastIndex(match, "'") |
||||||
|
if linkEndIndex == -1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
} else { |
||||||
|
linkEndIndex = strings.LastIndex(match, "\"") |
||||||
|
if linkEndIndex == -1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
if linkEndIndex <= linkStartIndex+1 { |
||||||
|
continue |
||||||
|
} |
||||||
|
|
||||||
|
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) |
||||||
|
if err != nil { |
||||||
|
continue |
||||||
|
} |
||||||
|
|
||||||
|
linkResolved := ResolveLink(link, from.Host) |
||||||
|
if hasVideoExtention(linkResolved) { |
||||||
|
urls = append(urls, linkResolved) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// return discovered mutual video urls
|
||||||
|
return urls |
||||||
|
} |
Loading…
Reference in new issue