From 7dca4f6df187bcb44d4125b07e039a380ccae43b Mon Sep 17 00:00:00 2001 From: Umputun Date: Fri, 26 Jul 2024 12:17:43 -0500 Subject: [PATCH] Add detection for mathematical and letterlike symbols This commit enhances the multi-language spam detection by including checks for mathematical alphanumeric symbols and letterlike symbols. It also adds a test case for those strange characters mixed with Cyrillic. --- lib/tgspam/detector.go | 9 ++++++++- lib/tgspam/detector_test.go | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/tgspam/detector.go b/lib/tgspam/detector.go index e58e5d2..4c8e6d9 100644 --- a/lib/tgspam/detector.go +++ b/lib/tgspam/detector.go @@ -586,7 +586,7 @@ func (d *Detector) isMultiLang(msg string) spamcheck.Response { switch { case r == 'i': // skip 'i' because it's used in many languages continue - case unicode.Is(unicode.Latin, r): + case unicode.Is(unicode.Latin, r) || unicode.In(r, unicode.Number): scripts["Latin"] = true case unicode.Is(unicode.Cyrillic, r): scripts["Cyrillic"] = true @@ -614,6 +614,13 @@ func (d *Detector) isMultiLang(msg string) spamcheck.Response { scripts["Georgian"] = true case r == 'ї': scripts["Ukrainian"] = true + default: + // check for mathematical alphanumeric symbols and letterlike symbols + if unicode.In(r, unicode.Other_Math, unicode.Other_Alphabetic) || + (r >= '\U0001D400' && r <= '\U0001D7FF') || // Mathematical Alphanumeric Symbols + (r >= '\u2100' && r <= '\u214F') { // Letterlike Symbols + scripts["Mathematical"] = true + } } if len(scripts) > 1 { return true diff --git a/lib/tgspam/detector_test.go b/lib/tgspam/detector_test.go index ad576ce..e4b8a64 100644 --- a/lib/tgspam/detector_test.go +++ b/lib/tgspam/detector_test.go @@ -632,6 +632,7 @@ func TestDetector_CheckMultiLang(t *testing.T) { {"WithCyrillic real example 2", "В поuске паpтнеров, заuнтересованных в пассuвном дoходе с затpатой мuнuмум лuчного временu. Все деталu в лс", 10, true}, {"WithCyrillic real example 3", "Всем привет, есть простая шабашка, подойдет любому. Даю 15 тысяч. Накину на проезд, сигареты, обед. ", 0, false}, {"WithCyrillic and i", "Привет мiр", 0, false}, + {"strange with cyrillic", "𝐇айди и𝐇𝐓и𝐦𝐇ы𝐞 ф𝐨𝐓𝐤и лю𝐛𝐨й д𝐞𝐁𝐲ш𝐤и ч𝐞𝐩𝐞𝟑 𝐛𝐨𝐓а", 7, true}, } for _, tt := range tests {