-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcanonicalizer.go
188 lines (162 loc) · 4.65 KB
/
canonicalizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
package urlresolver
import (
"fmt"
"net/url"
"regexp"
"strings"
"github.com/PuerkitoBio/purell"
)
// NormalizationFlags defines the normalization flags the purell package will
// use during canonicalization. Beyond purell's "safe" set, these flags
// resolve dot segments, collapse duplicate slashes, and decode the various
// obfuscated host encodings (DWORD/octal/hex) into a plain hostname.
//
// See https://godoc.org/github.com/PuerkitoBio/purell#NormalizationFlags
var NormalizationFlags = (purell.FlagsSafe |
	purell.FlagRemoveDotSegments |
	purell.FlagRemoveDuplicateSlashes |
	purell.FlagDecodeDWORDHost |
	purell.FlagDecodeOctalHost |
	purell.FlagDecodeHexHost |
	purell.FlagRemoveUnnecessaryHostDots |
	purell.FlagRemoveEmptyPortSeparator)
var (
	// Query parameters matching these patterns will ALWAYS be stripped. The
	// categorized patterns below were largely sourced from this Chrome
	// Extension:
	//
	// https://github.com/newhouse/url-tracking-stripper/blob/dea6c144/README.md#documentation
	excludeParamPattern = listToRegexp(`(?i)^(`, `)$`, []string{
		// Google's Urchin Tracking Module & Google Adwords
		`utm_.+`,
		`gclid`,
		// Adobe Omniture SiteCatalyst
		`icid`,
		// Facebook
		`fbclid`,
		// Hubspot
		`_hsenc`,
		`_hsmi`,
		// Marketo
		`mkt_.+`,
		// MailChimp
		`mc_.+`,
		// Simple Reach
		`sr_.+`,
		// Vero
		`vero_.+`,
		// Unknown
		`nr_email_referer`,
		`ncid`,
		`ref`,
		// Other ad trackers?
		`ad(set)?_(name|id)`,
		`omega_(ad|adset|utm)_.+`,
		`campaign_id`,
		`variant`,
		// Miscellaneous garbage-looking params noticed by @mccutchen while
		// perusing logs
		`_r`,
		`cmpid`,
		`currentPage`,
		`fsrc`,
		`mb?id`,
		`mobile_touch`,
		`ocid`,
		`rss`,
		`s_(sub)?src`,
		`smid`,
		`wpsrc`,
	})

	// Per-domain lists of allowed query parameters. When a domain matches a
	// key pattern, only params matching the corresponding value pattern
	// survive; everything else is stripped.
	domainParamAllowlist = map[*regexp.Regexp]*regexp.Regexp{
		regexp.MustCompile(`(?i)(^|\.)youtube\.com$`): regexp.MustCompile(`^(v|p|t|list)$`),
		// really, this should be restricted to twitter.com/search?q=, but
		// allowing q= on any twitter URL is probably okay
		regexp.MustCompile(`(?i)(^|\.)twitter\.com$`): regexp.MustCompile(`^q$`),
	}

	// All query params will be stripped from these domains, which tend to be
	// content-focused web sites.
	//
	// TODO: this could potentially prevent us from rolling some URLs up
	// together (e.g. in the case of /search?q=foo on a domain), but I think
	// it's worth it for now.
	stripParamDomainPattern = listToRegexp(`(?i)(^|\.)(`, `)$`, []string{
		`bbc\.co\.uk`,
		`buzzfeed\.com`,
		`deadspin\.com`,
		`economist\.com`,
		`grantland\.com`,
		`huffingtonpost\.com`,
		`instagram\.com`,
		`newyorker\.com`,
		`nymag\.com`,
		`nytimes\.com`,
		`slate\.com`,
		`techcrunch\.com`,
		`theguardian\.com`,
		`theonion\.com`,
		`twitter\.com`,
		`vanityfair\.com`,
		`vulture\.com`,
		`washingtonpost\.com`,
		`wsj\.com`,
	})

	// Paths under these domains will be lowercased, as they tend to be
	// usernames that are treated as case-insensitive but may appear in a
	// variety of cases (e.g. twitter.com/McCutchen and twitter.com/mccutchen
	// are equivalent).
	lowercaseDomainPattern = listToRegexp(`(?i)(^|\.)(`, `)$`, []string{
		`instagram\.com`,
		`twitter\.com`,
	})
)
// Canonicalize filters unnecessary query params and then normalizes a URL,
// ensuring consistent case, encoding, sorting of params, etc.
func Canonicalize(u *url.URL) string {
return normalize(clean(u))
}
// normalize renders u in canonical string form via purell, ensuring
// consistent case, encoding, sorting of params, etc. For hosts matching
// lowercaseDomainPattern, the path is lowercased first, since those paths
// tend to be case-insensitive usernames.
func normalize(u *url.URL) string {
	host := u.Host
	if lowercaseDomainPattern.MatchString(host) {
		u.Path = strings.ToLower(u.Path)
	}
	return purell.NormalizeURL(u, NormalizationFlags)
}
// clean removes unnecessary query params and fragment identifiers from a
// URL, mutating u in place and returning it for chaining.
func clean(u *url.URL) *url.URL {
	u.Fragment = ""
	u.RawQuery = filterParams(u).Encode()
	return u
}
// filterParams returns u's query parameters with every param rejected by
// shouldExcludeParam removed. The original URL is not modified.
func filterParams(u *url.URL) url.Values {
	host := u.Hostname()
	kept := url.Values{}
	for name, vals := range u.Query() {
		if shouldExcludeParam(host, name) {
			continue
		}
		kept[name] = append(kept[name], vals...)
	}
	return kept
}
// shouldExcludeParam reports whether the query parameter param should be
// stripped from URLs on the given domain. The decision proceeds in three
// steps: globally-excluded tracking params are always stripped; if the
// domain has a param allowlist, anything not on it is stripped; otherwise
// the param survives unless the domain is one that strips all params.
func shouldExcludeParam(domain string, param string) bool {
	if excludeParamPattern.MatchString(param) {
		return true
	}
	for domainRe, allowedRe := range domainParamAllowlist {
		if !domainRe.MatchString(domain) {
			continue
		}
		// Allowlisted domain: keep the param only if it is on the list.
		return !allowedRe.MatchString(param)
	}
	// No allowlist applies; default to keeping the param unless this domain
	// strips everything.
	return stripParamDomainPattern.MatchString(domain)
}
// listToRegexp compiles the given patterns into a single regexp that
// matches any one of them, joined by "|" and wrapped in prefix and suffix.
// It panics (via MustCompile) if the combined pattern is invalid.
func listToRegexp(prefix string, suffix string, patterns []string) *regexp.Regexp {
	alternation := strings.Join(patterns, "|")
	return regexp.MustCompile(fmt.Sprintf("%s%s%s", prefix, alternation, suffix))
}