Kasianov Nikolai Alekseevich
2 years ago
13 changed files with 605 additions and 152 deletions
@ -1,2 +0,0 @@
|
||||
golang.org/x/net v0.4.0 h1:Q5QPcMlvfxFTAPV0+07Xz/MpK9NTXu2VDUuy0FeMfaU= |
||||
golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= |
@ -0,0 +1,118 @@
|
||||
/* |
||||
Wecr - crawl the web for data |
||||
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) |
||||
|
||||
This program is free software: you can redistribute it and/or modify |
||||
it under the terms of the GNU Affero General Public License as published by |
||||
the Free Software Foundation, either version 3 of the License, or |
||||
(at your option) any later version. |
||||
|
||||
This program is distributed in the hope that it will be useful, |
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
GNU Affero General Public License for more details. |
||||
|
||||
You should have received a copy of the GNU Affero General Public License |
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/ |
||||
|
||||
package web |
||||
|
||||
import ( |
||||
"net/url" |
||||
"strings" |
||||
) |
||||
|
||||
func hasAudioExtention(url string) bool { |
||||
for _, extention := range AudioExtentions { |
||||
if strings.HasSuffix(url, extention) { |
||||
return true |
||||
} |
||||
} |
||||
|
||||
return false |
||||
} |
||||
|
||||
// Tries to find audio URLs on the page
|
||||
func FindPageAudio(pageBody []byte, from *url.URL) []string { |
||||
var urls []string |
||||
|
||||
// for every element that has "src" attribute
|
||||
for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { |
||||
var linkStartIndex int |
||||
var linkEndIndex int |
||||
|
||||
linkStartIndex = strings.Index(match, "\"") |
||||
if linkStartIndex == -1 { |
||||
linkStartIndex = strings.Index(match, "'") |
||||
if linkStartIndex == -1 { |
||||
continue |
||||
} |
||||
|
||||
linkEndIndex = strings.LastIndex(match, "'") |
||||
if linkEndIndex == -1 { |
||||
continue |
||||
} |
||||
} else { |
||||
linkEndIndex = strings.LastIndex(match, "\"") |
||||
if linkEndIndex == -1 { |
||||
continue |
||||
} |
||||
} |
||||
|
||||
if linkEndIndex <= linkStartIndex+1 { |
||||
continue |
||||
} |
||||
|
||||
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) |
||||
if err != nil { |
||||
continue |
||||
} |
||||
|
||||
linkResolved := ResolveLink(link, from.Host) |
||||
if hasAudioExtention(linkResolved) { |
||||
urls = append(urls, linkResolved) |
||||
} |
||||
} |
||||
|
||||
// for every "a" element as well
|
||||
for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { |
||||
var linkStartIndex int |
||||
var linkEndIndex int |
||||
|
||||
linkStartIndex = strings.Index(match, "\"") |
||||
if linkStartIndex == -1 { |
||||
linkStartIndex = strings.Index(match, "'") |
||||
if linkStartIndex == -1 { |
||||
continue |
||||
} |
||||
|
||||
linkEndIndex = strings.LastIndex(match, "'") |
||||
if linkEndIndex == -1 { |
||||
continue |
||||
} |
||||
} else { |
||||
linkEndIndex = strings.LastIndex(match, "\"") |
||||
if linkEndIndex == -1 { |
||||
continue |
||||
} |
||||
} |
||||
|
||||
if linkEndIndex <= linkStartIndex+1 { |
||||
continue |
||||
} |
||||
|
||||
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) |
||||
if err != nil { |
||||
continue |
||||
} |
||||
|
||||
linkResolved := ResolveLink(link, from.Host) |
||||
if hasAudioExtention(linkResolved) { |
||||
urls = append(urls, linkResolved) |
||||
} |
||||
} |
||||
|
||||
// return discovered mutual video urls
|
||||
return urls |
||||
} |
@ -0,0 +1,88 @@
|
||||
/* |
||||
Wecr - crawl the web for data |
||||
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) |
||||
|
||||
This program is free software: you can redistribute it and/or modify |
||||
it under the terms of the GNU Affero General Public License as published by |
||||
the Free Software Foundation, either version 3 of the License, or |
||||
(at your option) any later version. |
||||
|
||||
This program is distributed in the hope that it will be useful, |
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
GNU Affero General Public License for more details. |
||||
|
||||
You should have received a copy of the GNU Affero General Public License |
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/ |
||||
|
||||
package web |
||||
|
||||
// AudioExtentions lists the file name suffixes (leading dot included) that
// are treated as audio files.
var AudioExtentions = []string{
	".3gp", ".aa", ".aac", ".aax", ".act",
	".aiff", ".alac", ".amr", ".ape", ".au",
	".flac", ".m4a", ".mp3", ".mpc", ".msv",
	".ogg", ".oga", ".mogg", ".opus", ".tta",
	".wav", ".cda",
}
||||
|
||||
// ImageExtentions lists the file name suffixes (leading dot included) that
// are treated as image files. Every suffix appears exactly once.
var ImageExtentions = []string{
	".jpeg",
	".jpg",
	".jpe",
	".jfif",
	".png",
	".ppm",
	".svg",
	".gif",
	".tiff",
	".bmp",
	".webp",
	".ico",
	".kra",
	".bpg",
	".drw",
	".tga",
}
||||
|
||||
// VideoExtentions lists the file name suffixes (leading dot included) that
// are treated as video files.
var VideoExtentions = []string{
	".webm", ".mkv", ".flv", ".wmv", ".avi", ".yuv",
	".mp2", ".mp4", ".mpeg", ".mpg", ".mpv", ".m4v",
	".3gp", ".3g2", ".nsv", ".vob", ".ogv",
}
||||
|
||||
// DocumentExtentions lists the file name suffixes (leading dot included)
// that are treated as document files.
//
// The list is intentionally empty for now. It must never contain an empty
// string: "" is a suffix of every string, so a suffix check over this list
// would then classify every URL as a document.
var DocumentExtentions = []string{}
@ -0,0 +1,118 @@
|
||||
/* |
||||
Wecr - crawl the web for data |
||||
Copyright (C) 2023 Kasyanov Nikolay Alexeyevich (Unbewohnte) |
||||
|
||||
This program is free software: you can redistribute it and/or modify |
||||
it under the terms of the GNU Affero General Public License as published by |
||||
the Free Software Foundation, either version 3 of the License, or |
||||
(at your option) any later version. |
||||
|
||||
This program is distributed in the hope that it will be useful, |
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
GNU Affero General Public License for more details. |
||||
|
||||
You should have received a copy of the GNU Affero General Public License |
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/ |
||||
|
||||
package web |
||||
|
||||
import ( |
||||
"net/url" |
||||
"strings" |
||||
) |
||||
|
||||
func hasVideoExtention(url string) bool { |
||||
for _, extention := range VideoExtentions { |
||||
if strings.HasSuffix(url, extention) { |
||||
return true |
||||
} |
||||
} |
||||
|
||||
return false |
||||
} |
||||
|
||||
// Tries to find videos' URLs on the page
|
||||
func FindPageVideos(pageBody []byte, from *url.URL) []string { |
||||
var urls []string |
||||
|
||||
// for every element that has "src" attribute
|
||||
for _, match := range tagSrcRegexp.FindAllString(string(pageBody), -1) { |
||||
var linkStartIndex int |
||||
var linkEndIndex int |
||||
|
||||
linkStartIndex = strings.Index(match, "\"") |
||||
if linkStartIndex == -1 { |
||||
linkStartIndex = strings.Index(match, "'") |
||||
if linkStartIndex == -1 { |
||||
continue |
||||
} |
||||
|
||||
linkEndIndex = strings.LastIndex(match, "'") |
||||
if linkEndIndex == -1 { |
||||
continue |
||||
} |
||||
} else { |
||||
linkEndIndex = strings.LastIndex(match, "\"") |
||||
if linkEndIndex == -1 { |
||||
continue |
||||
} |
||||
} |
||||
|
||||
if linkEndIndex <= linkStartIndex+1 { |
||||
continue |
||||
} |
||||
|
||||
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) |
||||
if err != nil { |
||||
continue |
||||
} |
||||
|
||||
linkResolved := ResolveLink(link, from.Host) |
||||
if hasVideoExtention(linkResolved) { |
||||
urls = append(urls, linkResolved) |
||||
} |
||||
} |
||||
|
||||
// for every "a" element as well
|
||||
for _, match := range tagHrefRegexp.FindAllString(string(pageBody), -1) { |
||||
var linkStartIndex int |
||||
var linkEndIndex int |
||||
|
||||
linkStartIndex = strings.Index(match, "\"") |
||||
if linkStartIndex == -1 { |
||||
linkStartIndex = strings.Index(match, "'") |
||||
if linkStartIndex == -1 { |
||||
continue |
||||
} |
||||
|
||||
linkEndIndex = strings.LastIndex(match, "'") |
||||
if linkEndIndex == -1 { |
||||
continue |
||||
} |
||||
} else { |
||||
linkEndIndex = strings.LastIndex(match, "\"") |
||||
if linkEndIndex == -1 { |
||||
continue |
||||
} |
||||
} |
||||
|
||||
if linkEndIndex <= linkStartIndex+1 { |
||||
continue |
||||
} |
||||
|
||||
link, err := url.Parse(match[linkStartIndex+1 : linkEndIndex]) |
||||
if err != nil { |
||||
continue |
||||
} |
||||
|
||||
linkResolved := ResolveLink(link, from.Host) |
||||
if hasVideoExtention(linkResolved) { |
||||
urls = append(urls, linkResolved) |
||||
} |
||||
} |
||||
|
||||
// return discovered mutual video urls
|
||||
return urls |
||||
} |
Loading…
Reference in new issue