Skip to content

Commit

Permalink
Add detection for mathematical and letterlike symbols
Browse files Browse the repository at this point in the history
This commit enhances the multi-language spam detection by adding checks for mathematical alphanumeric symbols and letterlike symbols. It also adds a test case in which such symbols are mixed with Cyrillic text.
  • Loading branch information
umputun committed Jul 26, 2024
1 parent 90a9a21 commit 7dca4f6
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 1 deletion.
9 changes: 8 additions & 1 deletion lib/tgspam/detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -586,7 +586,7 @@ func (d *Detector) isMultiLang(msg string) spamcheck.Response {
switch {
case r == 'i': // skip 'i' because it's used in many languages
continue
- case unicode.Is(unicode.Latin, r):
+ case unicode.Is(unicode.Latin, r) || unicode.In(r, unicode.Number):
scripts["Latin"] = true
case unicode.Is(unicode.Cyrillic, r):
scripts["Cyrillic"] = true
Expand Down Expand Up @@ -614,6 +614,13 @@ func (d *Detector) isMultiLang(msg string) spamcheck.Response {
scripts["Georgian"] = true
case r == 'ї':
scripts["Ukrainian"] = true
default:
// check for mathematical alphanumeric symbols and letterlike symbols
if unicode.In(r, unicode.Other_Math, unicode.Other_Alphabetic) ||
(r >= '\U0001D400' && r <= '\U0001D7FF') || // Mathematical Alphanumeric Symbols
(r >= '\u2100' && r <= '\u214F') { // Letterlike Symbols
scripts["Mathematical"] = true
}
}
if len(scripts) > 1 {
return true
Expand Down
1 change: 1 addition & 0 deletions lib/tgspam/detector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,7 @@ func TestDetector_CheckMultiLang(t *testing.T) {
{"WithCyrillic real example 2", "В поuске паpтнеров, заuнтересованных в пассuвном дoходе с затpатой мuнuмум лuчного временu. Все деталu в лс", 10, true},
{"WithCyrillic real example 3", "Всем привет, есть простая шабашка, подойдет любому. Даю 15 тысяч. Накину на проезд, сигареты, обед. ", 0, false},
{"WithCyrillic and i", "Привет мiр", 0, false},
{"strange with cyrillic", "𝐇айди и𝐇𝐓и𝐦𝐇ы𝐞 ф𝐨𝐓𝐤и лю𝐛𝐨й д𝐞𝐁𝐲ш𝐤и ч𝐞𝐩𝐞𝟑 𝐛𝐨𝐓а", 7, true},
}

for _, tt := range tests {
Expand Down

0 comments on commit 7dca4f6

Please sign in to comment.