Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

URL fixes #148

Merged
merged 11 commits into from
Sep 27, 2024
14 changes: 13 additions & 1 deletion internal/pkg/crawl/assets.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"net/http"
"net/url"
"regexp"
"strconv"
"strings"
"sync/atomic"

Expand Down Expand Up @@ -169,7 +170,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu

for match := range matches {
if len(matches[match]) > 0 {
rawAssets = append(rawAssets, matches[match][1])
matchFound := matches[match][1]
// Don't extract CSS elements that aren't URLs
if strings.Contains(matchFound, "%") || strings.HasPrefix(matchFound, "0.") || strings.HasPrefix(matchFound, "--font") || strings.HasPrefix(matchFound, "--size") || strings.HasPrefix(matchFound, "--color") || strings.HasPrefix(matchFound, "--shreddit") || strings.HasPrefix(matchFound, "100vh") {
continue
}
rawAssets = append(rawAssets, matchFound)
}
}
}
Expand Down Expand Up @@ -276,6 +282,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
scriptLinks := utils.DedupeStrings(regexOutlinks.FindAllString(outerHTML, -1))
for _, scriptLink := range scriptLinks {
if strings.HasPrefix(scriptLink, "http") {
// Unescape string escape sequences (e.g. \u0026) so the extracted URL contains the literal characters
scriptLink, err := strconv.Unquote(`"` + scriptLink + `"`)
if err != nil {
c.Log.Debug("unable to escape URL from JSON in script tag", "error", err, "url", scriptLink)
continue
}
rawAssets = append(rawAssets, scriptLink)
}
}
Expand Down
6 changes: 6 additions & 0 deletions internal/pkg/crawl/sitespecific/youtube/youtube_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ func TestParse(t *testing.T) {
// Parse the video
streamURLs, metaURLs, rawJSON, _, err := ytdlp.Parse(f)
if err != nil {
_, found := ytdlp.FindPath()
if !found {
// TODO: install yt-dlp when running our tests in CI?
t.Skipf("yt-dlp not installed. skipping test due to missing executable.")
return
}
t.Fatal(err)
}

Expand Down
39 changes: 37 additions & 2 deletions internal/pkg/utils/url.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,14 @@ import (
func URLToString(u *url.URL) string {
var err error

q := u.Query()
u.RawQuery = q.Encode()
switch u.Host {
case "external-preview.redd.it", "styles.redditmedia.com", "preview.redd.it":
// Do nothing. We don't want to encode the URL for signature purposes. :(
break
default:
q := u.Query()
u.RawQuery = encodeQuery(q)
}
u.Host, err = idna.ToASCII(u.Host)
if err != nil {
if strings.Contains(u.Host, ":") {
Expand All @@ -38,6 +44,35 @@ func URLToString(u *url.URL) string {
return u.String()
}

// encodeQuery encodes v into "URL encoded" form ("bar=baz&foo=quux"),
// escaping every key and value with url.QueryEscape. It returns "" when v
// holds no keys.
//
// A previous revision copied url.Values.Encode from the standard library
// (https://cs.opensource.google/go/go/+/refs/tags/go1.23.1:src/net/url/url.go;l=1002)
// and removed the key sort, intending to leave the parameter order alone.
// That cannot work: url.Values is a map, and Go randomizes map iteration
// order, so the unsorted loop emitted multi-key queries in a different order
// from one call to the next. The original order of the query string is
// already lost once it has been parsed into a url.Values, so the only stable
// choice left is the standard library's key-sorted encoding. Values that
// share a key keep their relative order.
//
// Callers that must keep a query string byte-for-byte (e.g. signed URLs)
// should leave RawQuery untouched instead of calling this function.
func encodeQuery(v url.Values) string {
	return v.Encode()
}

// MakeAbsolute turn all URLs in a slice of url.URL into absolute URLs, based
// on a given base *url.URL
func MakeAbsolute(base *url.URL, URLs []*url.URL) []*url.URL {
Expand Down
27 changes: 27 additions & 0 deletions internal/pkg/utils/url_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,30 @@ func TestURLwithIPv6WithPort(t *testing.T) {
t.Fatalf("Expected %s, got %s", expected, actual)
}
}

// TestURLwithSpacesandUnicode checks that URLToString percent-encodes a query
// string containing raw (unescaped) Unicode text and an embedded URL.
func TestURLwithSpacesandUnicode(t *testing.T) {
	// The separator after "のぞみ" is U+FF0F (fullwidth solidus), not an ASCII
	// '/': the expected value below encodes it as %EF%BC%8F, whereas an ASCII
	// slash would become %2F. It is written as an escape so it cannot be
	// confused with, or normalized into, the ASCII character.
	u, err := url.Parse("https://www.youtube.com/watch/0HBwC_wIFF4?t=18363石神視点【Minecraft】平日もど真ん中なんだから早く寝なきゃ【石神のぞみ\uFF0Fにじさんじ所属】https://www.youtube.com/watch/L30uAR9X8Uw?t=10100【倉持エン足中")
	if err != nil {
		t.Fatalf("Error parsing URL: %v", err)
	}

	expected := "https://www.youtube.com/watch/0HBwC_wIFF4?t=18363%E7%9F%B3%E7%A5%9E%E8%A6%96%E7%82%B9%E3%80%90Minecraft%E3%80%91%E5%B9%B3%E6%97%A5%E3%82%82%E3%81%A9%E7%9C%9F%E3%82%93%E4%B8%AD%E3%81%AA%E3%82%93%E3%81%A0%E3%81%8B%E3%82%89%E6%97%A9%E3%81%8F%E5%AF%9D%E3%81%AA%E3%81%8D%E3%82%83%E3%80%90%E7%9F%B3%E7%A5%9E%E3%81%AE%E3%81%9E%E3%81%BF%EF%BC%8F%E3%81%AB%E3%81%98%E3%81%95%E3%82%93%E3%81%98%E6%89%80%E5%B1%9E%E3%80%91https%3A%2F%2Fwww.youtube.com%2Fwatch%2FL30uAR9X8Uw%3Ft%3D10100%E3%80%90%E5%80%89%E6%8C%81%E3%82%A8%E3%83%B3%E8%B6%B3%E4%B8%AD"
	actual := URLToString(u)
	if actual != expected {
		t.Fatalf("Expected %s, got %s", expected, actual)
	}
}

// TestURLwithRedditOverride verifies that URLToString returns a
// styles.redditmedia.com URL exactly as it was given: the query string of
// that host is deliberately not re-encoded.
func TestURLwithRedditOverride(t *testing.T) {
	// The input doubles as the expected output, since nothing may change.
	const rawURL = "https://styles.redditmedia.com/t5_7wkhw/styles/profileIcon_8w6r6fr3rh2d1.jpeg?width=64&height=64&frame=1&auto=webp&crop=64:64,smart&s=6d8ab9b89c9b846c9eb65622db9ced4992dc0905"

	parsed, parseErr := url.Parse(rawURL)
	if parseErr != nil {
		t.Fatalf("Error parsing URL: %v", parseErr)
	}

	if got := URLToString(parsed); got != rawURL {
		t.Fatalf("Expected %s, got %s", rawURL, got)
	}
}