forked from shigabeev/russian_tts_normalization
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrussian.py
343 lines (288 loc) · 15.6 KB
/
russian.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
import re
# Latin-to-Cyrillic transliteration table.
# Only lowercase keys are listed: cyrrilize() lowercases its input before any lookup.
cyrrilization_mapping_extended = {
    # Single letters a..z, paired positionally with their Cyrillic stand-ins.
    **dict(zip(
        'abcdefghijklmnopqrstuvwxyz',
        (
            'а', 'б', 'к', 'д', 'е', 'ф', 'г', 'х', 'и', 'й', 'к', 'л', 'м',
            'н', 'о', 'п', 'к', 'р', 'с', 'т', 'у', 'в', 'в', 'кс', 'ы', 'з',
        ),
    )),
    # Two-letter combinations.
    'sh': 'ш',
    'ch': 'ч',
    'th': 'з',
    'ph': 'ф',
    'oo': 'у',
    'ee': 'и',
    'kh': 'х',
    # Three-letter combinations.
    'sch': 'ск',
}
# Spoken name of every uppercase Russian letter, used to spell abbreviations aloud.
pronunciation_map = {
    'А': 'а', 'Б': 'бэ', 'В': 'вэ', 'Г': 'гэ', 'Д': 'дэ',
    'Е': 'е', 'Ё': 'ё', 'Ж': 'жэ', 'З': 'зэ', 'И': 'и',
    'Й': 'ий', 'К': 'ка', 'Л': 'эл', 'М': 'эм', 'Н': 'эн',
    'О': 'о', 'П': 'пэ', 'Р': 'эр', 'С': 'эс', 'Т': 'тэ',
    'У': 'у', 'Ф': 'эф', 'Х': 'ха', 'Ц': 'цэ', 'Ч': 'чэ',
    'Ш': 'ша', 'Щ': 'ща', 'Ъ': 'твёрдый знак', 'Ы': 'ы', 'Ь': 'мягкий знак',
    'Э': 'э', 'Ю': 'ю', 'Я': 'я'
}


def expand_abbreviations(text):
    """Spell out every all-caps Cyrillic abbreviation letter by letter.

    An abbreviation is a whole word (delimited by word boundaries) made of
    two or more uppercase Cyrillic letters. Each one is replaced by the
    space-separated names of its letters taken from ``pronunciation_map``.

    Args:
        text: Input string.

    Returns:
        The string with every abbreviation replaced; all other text is
        left untouched.
    """
    def spell_out(match):
        return ' '.join(
            pronunciation_map[letter]
            for letter in match.group()
            if letter in pronunciation_map
        )

    # Substituting match by match keeps each replacement confined to the word
    # that was actually matched. A global str.replace() would also rewrite the
    # same letters inside a longer abbreviation (e.g. "СШ" inside "США").
    return re.sub(r'\b[А-ЯЁ]{2,}\b', spell_out, text)
def cyrrilize(text):
    """Transliterate Latin letters in *text* to an approximate lowercase Cyrillic form.

    The text is lowercased, then scanned left to right. At every position
    the longest key of ``cyrrilization_mapping_extended`` that matches is
    used, so trigraphs win over digraphs and digraphs over single letters.
    Characters with no entry in the table are copied through unchanged.

    Args:
        text: Input string.

    Returns:
        The lowercased, transliterated string.
    """
    text = text.lower()
    # Probe from the longest key downwards; checking only two-character keys
    # would leave longer entries such as 'sch' unreachable.
    longest_key = max(map(len, cyrrilization_mapping_extended), default=1)
    pieces = []
    i = 0
    while i < len(text):
        for size in range(longest_key, 0, -1):
            chunk = text[i:i + size]
            # Near the end of the text the slice may be shorter than `size`.
            if len(chunk) == size and chunk in cyrrilization_mapping_extended:
                pieces.append(cyrrilization_mapping_extended[chunk])
                i += size
                break
        else:
            # No table entry starts here: keep the character as is.
            pieces.append(text[i])
            i += 1
    return ''.join(pieces)
def number_to_words(n):
    """
    Convert an integer into its Russian cardinal spelling.

    Thousands are treated as a feminine noun ("одна тысяча", "двадцать две
    тысячи"); millions and billions are masculine. Negative numbers are
    prefixed with "минус".

    Args:
        n: Integer to spell out.

    Returns:
        The space-separated Russian words for ``n``.

    Raises:
        IndexError: If ``abs(n)`` is 10**12 or larger (no scale word beyond
            billions is defined).
    """
    if n == 0:
        return 'ноль'
    if n < 0:
        return 'минус ' + number_to_words(-n)

    units = ['', 'один', 'два', 'три', 'четыре', 'пять', 'шесть', 'семь', 'восемь', 'девять']
    teens = ['десять', 'одиннадцать', 'двенадцать', 'тринадцать', 'четырнадцать', 'пятнадцать',
             'шестнадцать', 'семнадцать', 'восемнадцать', 'девятнадцать']
    tens = ['', 'десять', 'двадцать', 'тридцать', 'сорок', 'пятьдесят', 'шестьдесят', 'семьдесят',
            'восемьдесят', 'девяносто']
    hundreds = ['', 'сто', 'двести', 'триста', 'четыреста', 'пятьсот', 'шестьсот', 'семьсот',
                'восемьсот', 'девятьсот']
    # (magnitude, [form for 1, form for 2-4, form for 5 and up], scale noun is feminine)
    scales = [
        (1_000_000_000, ['миллиард', 'миллиарда', 'миллиардов'], False),
        (1_000_000, ['миллион', 'миллиона', 'миллионов'], False),
        (1_000, ['тысяча', 'тысячи', 'тысяч'], True),
    ]
    feminine_forms = {'один': 'одна', 'два': 'две'}

    def russian_plural(number, forms):
        """Pick the noun form that agrees with *number*."""
        if number % 10 == 1 and number % 100 != 11:
            return forms[0]
        if 2 <= number % 10 <= 4 and (number % 100 < 10 or number % 100 >= 20):
            return forms[1]
        return forms[2]

    def under_thousand(number):
        """Return the words for 0..999 as a list with no empty entries."""
        parts = [hundreds[number // 100]]
        rest = number % 100
        if 10 <= rest < 20:
            parts.append(teens[rest - 10])
        else:
            parts += [tens[rest // 10], units[rest % 10]]
        return [part for part in parts if part]

    words = []
    remainder = n
    for magnitude, forms, feminine in scales:
        count, remainder = divmod(remainder, magnitude)
        if not count:
            continue
        group = under_thousand(count)
        if feminine:
            # Only the final word changes gender, so "двадцать один" becomes
            # "двадцать одна" without losing the leading words.
            group[-1] = feminine_forms.get(group[-1], group[-1])
        words += group
        words.append(russian_plural(count, forms))
    words += under_thousand(remainder)
    return ' '.join(words)
def detect_numbers(text):
    """Locate every standalone run of digits in *text*.

    A run counts only when it is delimited by word boundaries on both sides.

    Returns:
        A list of dicts, in order of appearance, each holding the matched
        digits under 'number' and the match span under 'start' and 'end'.
    """
    found = []
    for hit in re.finditer(r'\b\d+\b', text):
        begin, finish = hit.span()
        found.append({'number': hit.group(), 'start': begin, 'end': finish})
    return found
def number_to_words_digit_by_digit(n):
    """
    Spell out ``str(n)`` one digit at a time using the Russian digit names.
    """
    digit_names = ('ноль', 'один', 'два', 'три', 'четыре', 'пять', 'шесть', 'семь', 'восемь', 'девять')
    spoken = []
    for character in str(n):
        spoken.append(digit_names[int(character)])
    return ' '.join(spoken)
def normalize_text_with_numbers(text):
    """Replace every standalone number in *text* with Russian words.

    Numbers below 10**12 are read as a cardinal via number_to_words();
    anything larger is read digit by digit.
    """
    pieces = []
    cursor = 0
    # Walk the matches left to right, copying the untouched text between them.
    for found in sorted(detect_numbers(text), key=lambda item: item['start']):
        value = int(found['number'])
        if value < 1_000_000_000_000:
            spoken = number_to_words(value)
        else:
            spoken = number_to_words_digit_by_digit(value)
        pieces.append(text[cursor:found['start']])
        pieces.append(spoken)
        cursor = found['end']
    pieces.append(text[cursor:])
    return ''.join(pieces)
def normalize_phone_number(phone_number):
    """Turn a Russian phone number into the words used to read it aloud.

    All non-digit characters are dropped. The first digit is the country
    code ("8" -> "восемь", "7" -> "плюс семь", anything else is kept as the
    digit itself). The remaining digits are read in blocks of 3, 3, 2 and 2.
    Leading zeros inside a block are spoken individually as "ноль", so
    "05" is read "ноль пять" rather than "пять".

    Args:
        phone_number: The phone number in any punctuation/spacing style.

    Returns:
        The space-separated spoken form. Blocks that are missing because
        the input has fewer than 11 digits are skipped; input with no
        digits yields an empty string.
    """
    digits = re.sub(r'\D', '', phone_number)
    country_codes = {'8': 'восемь', '7': 'плюс семь'}

    spoken = []
    country_code = digits[:1]
    if country_code:
        spoken.append(country_codes.get(country_code, country_code))

    # Area code, then the 3-2-2 subscriber blocks.
    for start, end in ((1, 4), (4, 7), (7, 9), (9, 11)):
        block = digits[start:end]
        if not block:
            continue
        significant = block.lstrip('0')
        # int() would silently discard leading zeros, so voice them first.
        spoken.extend(['ноль'] * (len(block) - len(significant)))
        if significant:
            spoken.append(number_to_words(int(significant)))
    return ' '.join(spoken)
def normalize_text_with_phone_numbers(text):
    """Replace every Russian phone number in *text* with its spoken form.

    A phone number starts with "+7" or "8", followed by a three-digit area
    code (optionally in parentheses) and seven more digits grouped 3-2-2.
    Groups may be separated by spaces or hyphens, or written with no
    separators at all. A candidate is ignored when it is glued to further
    digits on either side, so fragments of longer digit runs are not read
    as phone numbers.

    Args:
        text: Input string.

    Returns:
        The string with each phone number replaced by the result of
        normalize_phone_number().
    """
    phone_pattern = re.compile(
        r"(?<!\d)(?:\+7|8)[\s-]*\(?\d{3}\)?[\s-]*\d{3}[\s-]?\d{2}[\s-]?\d{2}(?!\d)"
    )
    # A callback substitution replaces each match in place, so no index
    # bookkeeping is needed.
    return phone_pattern.sub(lambda match: normalize_phone_number(match.group()), text)
def currency_normalization(text):
    """
    Replace currency amounts in *text* with their Russian spoken form.

    Recognised currencies are roubles, US dollars, euros, pounds and
    hryvnias, written as a number followed by the currency word, symbol or
    ISO code ("100 рублей", "5 ₽", "10 USD"), or as a "$", "€" or "£" sign
    directly followed by the number. The number may carry a one- or
    two-digit fraction after "." or ",", which is read as the subunit.
    Currency words must end at a word boundary, so "5 рубашек" is not
    treated as roubles.

    Args:
        text: Input string.

    Returns:
        The string with every recognised amount replaced by words, e.g.
        main-unit count and noun, followed by the subunit count and noun
        when the fraction is non-zero.
    """
    # Numerals whose last word must change when the counted noun is feminine.
    feminine_forms = {'один': 'одна', 'два': 'две'}

    # code -> (main forms, main noun is feminine, subunit forms, subunit noun is feminine).
    # Each forms list is [form for 1, form for 2-4, form for 5 and up].
    currencies = {
        'rub': (['рубль', 'рубля', 'рублей'], False, ['копейка', 'копейки', 'копеек'], True),
        'usd': (['доллар', 'доллара', 'долларов'], False, ['цент', 'цента', 'центов'], False),
        'eur': (['евро', 'евро', 'евро'], False, ['евроцент', 'евроцента', 'евроцентов'], False),
        'gbp': (['фунт', 'фунта', 'фунтов'], False, ['пенс', 'пенса', 'пенсов'], False),
        'uah': (['гривна', 'гривны', 'гривен'], True, ['копейка', 'копейки', 'копеек'], True),
    }

    # Group 1: whole part. Group 2: optional fraction, rejected when more
    # digits follow it (that is not a two-decimal money amount).
    amount = r'(\d+)(?:[.,](\d{1,2})(?!\d))?'
    # Sign-before-number forms come first so that "$5 $10" is not read as "5 $".
    currency_patterns = {
        'rub': [amount + r'\s*(?:руб(?:л(?:ей|я|ь))?\b|₽|RUB\b)'],
        'usd': [r'\$' + amount, amount + r'\s*(?:доллар(?:ов|а|ы)?\b|\$|USD\b)'],
        'eur': [r'€' + amount, amount + r'\s*(?:евро\b|€|EUR\b)'],
        'gbp': [r'£' + amount, amount + r'\s*(?:фунт(?:ов|а|ы)?\b|£|GBP\b)'],
        'uah': [amount + r'\s*(?:грив(?:ен|ны|на)\b|₴|UAH\b)'],
    }

    def russian_plural(number, units):
        """Pick the noun form that agrees with *number*."""
        if number % 10 == 1 and number % 100 != 11:
            return units[0]
        if 2 <= number % 10 <= 4 and (number % 100 < 10 or number % 100 >= 20):
            return units[1]
        return units[2]

    def count_with_unit(number, forms, feminine):
        """Spell *number* followed by the matching noun form."""
        words = number_to_words(number).split(' ')
        if feminine:
            words[-1] = feminine_forms.get(words[-1], words[-1])
        words.append(russian_plural(number, forms))
        return ' '.join(words)

    def currency_to_words(main_amount, sub_amount, currency='rub'):
        """Spell an amount given as whole units plus hundredths."""
        main_forms, main_feminine, sub_forms, sub_feminine = currencies.get(currency, currencies['rub'])
        parts = [count_with_unit(main_amount, main_forms, main_feminine)]
        if sub_amount > 0:
            parts.append(count_with_unit(sub_amount, sub_forms, sub_feminine))
        return ' '.join(parts)

    def make_replacer(currency_code):
        """Build the substitution callback for one currency."""
        def replace(match):
            whole, fraction = match.group(1), match.group(2)
            # "1.5" means 50 hundredths; parsing the digits directly avoids
            # float rounding on large amounts.
            sub_amount = int(fraction.ljust(2, '0')) if fraction else 0
            return currency_to_words(int(whole), sub_amount, currency_code)
        return replace

    for currency_code, patterns in currency_patterns.items():
        replacer = make_replacer(currency_code)
        for pattern in patterns:
            text = re.sub(pattern, replacer, text)
    return text
def normalize_dates(text):
    """Replace DD.MM.YYYY dates in *text* with their Russian spoken form.

    The day is read as a neuter ordinal ("седьмое"), the month as its
    genitive name ("января") and the year as a genitive ordinal followed
    by "года" ("две тысячи двадцать первого года"). Strings that look like
    a date but have a day outside 1..31 or a month outside 01..12 are left
    untouched.

    Args:
        text: Input string.

    Returns:
        The string with every valid date replaced by words.
    """
    # Month names in the genitive case, keyed by the two-digit month.
    month_names = {
        '01': 'января', '02': 'февраля', '03': 'марта',
        '04': 'апреля', '05': 'мая', '06': 'июня',
        '07': 'июля', '08': 'августа', '09': 'сентября',
        '10': 'октября', '11': 'ноября', '12': 'декабря'
    }
    # Neuter nominative ordinals for the days of the month.
    ordinal_days = {
        1: 'первое', 2: 'второе', 3: 'третье', 4: 'четвёртое', 5: 'пятое',
        6: 'шестое', 7: 'седьмое', 8: 'восьмое', 9: 'девятое', 10: 'десятое',
        11: 'одиннадцатое', 12: 'двенадцатое', 13: 'тринадцатое', 14: 'четырнадцатое', 15: 'пятнадцатое',
        16: 'шестнадцатое', 17: 'семнадцатое', 18: 'восемнадцатое', 19: 'девятнадцатое', 20: 'двадцатое',
        21: 'двадцать первое', 22: 'двадцать второе', 23: 'двадцать третье', 24: 'двадцать четвёртое',
        25: 'двадцать пятое', 26: 'двадцать шестое', 27: 'двадцать седьмое', 28: 'двадцать восьмое',
        29: 'двадцать девятое', 30: 'тридцатое', 31: 'тридцать первое'
    }
    # Cardinal word -> genitive ordinal; applied to the last word of the year.
    ordinal_genitive = {
        'ноль': 'нулевого',
        'один': 'первого', 'два': 'второго', 'три': 'третьего', 'четыре': 'четвёртого',
        'пять': 'пятого', 'шесть': 'шестого', 'семь': 'седьмого', 'восемь': 'восьмого',
        'девять': 'девятого', 'десять': 'десятого',
        'одиннадцать': 'одиннадцатого', 'двенадцать': 'двенадцатого', 'тринадцать': 'тринадцатого',
        'четырнадцать': 'четырнадцатого', 'пятнадцать': 'пятнадцатого', 'шестнадцать': 'шестнадцатого',
        'семнадцать': 'семнадцатого', 'восемнадцать': 'восемнадцатого', 'девятнадцать': 'девятнадцатого',
        'двадцать': 'двадцатого', 'тридцать': 'тридцатого', 'сорок': 'сорокового',
        'пятьдесят': 'пятидесятого', 'шестьдесят': 'шестидесятого', 'семьдесят': 'семидесятого',
        'восемьдесят': 'восьмидесятого', 'девяносто': 'девяностого',
        'сто': 'сотого', 'двести': 'двухсотого', 'триста': 'трёхсотого', 'четыреста': 'четырёхсотого',
        'пятьсот': 'пятисотого', 'шестьсот': 'шестисотого', 'семьсот': 'семисотого',
        'восемьсот': 'восьмисотого', 'девятьсот': 'девятисотого',
    }
    # Years that are an exact multiple of 1000 use a single compound ordinal.
    round_thousand_years = {
        1: 'тысячного', 2: 'двухтысячного', 3: 'трёхтысячного', 4: 'четырёхтысячного',
        5: 'пятитысячного', 6: 'шеститысячного', 7: 'семитысячного', 8: 'восьмитысячного',
        9: 'девятитысячного',
    }

    date_pattern = re.compile(r'\b(\d{2})\.(\d{2})\.(\d{4})\b')

    def year_to_words(year):
        """Spell a four-digit year as a genitive ordinal."""
        if year and year % 1000 == 0:
            return round_thousand_years[year // 1000]
        words = number_to_words(year).split(' ')
        # Only the final word of a compound ordinal is declined.
        words[-1] = ordinal_genitive.get(words[-1], words[-1])
        return ' '.join(words)

    def normalize_date(match):
        day, month, year = match.groups()
        day_word = ordinal_days.get(int(day))
        month_name = month_names.get(month)
        if day_word is None or month_name is None:
            # Not a real calendar date (e.g. a version number): keep the text.
            return match.group(0)
        return f'{day_word} {month_name} {year_to_words(int(year))} года'

    return date_pattern.sub(normalize_date, text)
def normalize_russian(text):
    """Run the full normalization pipeline over *text* and return the result.

    Stages run in a fixed order: abbreviations, dates, currency amounts,
    phone numbers, remaining numbers, and finally Latin-to-Cyrillic
    transliteration.
    """
    stages = (
        expand_abbreviations,
        normalize_dates,
        currency_normalization,
        normalize_text_with_phone_numbers,
        normalize_text_with_numbers,
        cyrrilize,
    )
    for stage in stages:
        text = stage(text)
    return text