-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathcreate_phonetic_dict.py
117 lines (103 loc) · 5.31 KB
/
create_phonetic_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from argparse import ArgumentParser
import codecs
import os
import pymorphy2
from russian_tagsets import converters
from russian_g2p.Accentor import Accentor
from russian_g2p.Grapheme2Phoneme import Grapheme2Phoneme
def transcribe_words(source_words_list):
n_words = len(source_words_list)
n_parts = 100
part_size = n_words // n_parts
while (part_size * n_parts) < n_words:
part_size += 1
transcriptions = []
bad_words = []
to_ud2 = converters.converter('opencorpora-int', 'ud20')
morph = pymorphy2.MorphAnalyzer()
accentor = Accentor(exception_for_unknown=True, use_wiki=False)
g2p = Grapheme2Phoneme(exception_for_nonaccented=True)
russian_letters = set('АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя')
russian_consonants = set('БбВвГгДдЖжЗзЙйКкЛлМмНнПпРрСсТтФфХхЦцЧчШшЩщЪъЬь')
part_counter = 0
for word_idx in range(len(source_words_list)):
cur_word = source_words_list[word_idx].strip().lower()
err_msg = 'Word {0} is wrong!'.format(word_idx)
assert len(cur_word) > 0, err_msg + ' It is empty!'
assert set(cur_word) <= (russian_letters | {'-'}), \
err_msg + ' "{0}" contains an inadmissible characters.'.format(cur_word)
assert set(cur_word) != {'-'}, err_msg + ' It is empty!'
if (len(cur_word) > 1) and (set(cur_word) <= russian_consonants):
bad_words.append(cur_word)
else:
morpho_variants = set([to_ud2(str(it.tag)) for it in morph.parse(cur_word)])
try:
accentuation_variants = []
for it in morpho_variants:
accentuation_variants += accentor.do_accents([[cur_word, it]])[0]
variants_of_transcriptions = list(set(
filter(
lambda it2: len(it2) > 0,
map(
lambda it: tuple(g2p.word_to_phonemes(it)),
accentuation_variants
)
)
))
if len(variants_of_transcriptions) > 0:
transcriptions.append((cur_word, ' '.join(variants_of_transcriptions[0])))
if len(variants_of_transcriptions) > 1:
for variant_idx in range(1, len(variants_of_transcriptions)):
transcriptions.append(('{0}({1})'.format(cur_word, variant_idx + 1),
' '.join(variants_of_transcriptions[variant_idx])))
else:
bad_words.append(cur_word)
except:
bad_words.append(cur_word)
if ((word_idx + 1) % part_size) == 0:
part_counter += 1
print('{0:.2%} of words have been processed...'.format(part_counter / float(n_parts)))
if part_counter < n_parts:
print('100.00% of words have been processed...')
return transcriptions, bad_words
def main():
parser = ArgumentParser()
parser.add_argument('-s', '--src', dest='source_word_list', type=str, required=True,
help='Source file with words for which phonetical transcirptions will be calculated.')
parser.add_argument('-d', '--dst', dest='destination_dictionary', type=str, required=True,
help='Destination file into which all words with their calculated transcriptions will be '
'written.')
parser.add_argument('-b', '--bad', dest='bad_word_list', type=str, required=True,
help='Special file into which bad words will be written '
'(transcriptions for bad words cannot be calculated).')
args = parser.parse_args()
src_name = os.path.normpath(args.source_word_list)
assert os.path.isfile(src_name), 'File "{0}" does not exist!'.format(src_name)
dst_name = os.path.normpath(args.destination_dictionary)
dst_dir = os.path.dirname(dst_name)
if len(dst_dir) > 0:
assert os.path.isdir(dst_dir), 'Directory "{0}" does not exist!'.format(dst_dir)
bad_name = os.path.normpath(args.bad_word_list)
bad_dir = os.path.dirname(bad_name)
if len(bad_dir) > 0:
assert os.path.isdir(bad_dir), 'Directory "{0}" does not exist!'.format(bad_dir)
source_words = list()
with codecs.open(src_name, mode='r', encoding='utf-8', errors='ignore') as fp:
cur_line = fp.readline()
while len(cur_line) > 0:
prep_line = cur_line.strip()
if len(prep_line) > 0:
source_words.append(prep_line)
cur_line = fp.readline()
assert len(source_words) > 0, 'The source word list "{0}" is empty!'.format(src_name)
print('Source words have been successfully loaded...')
transcriptions, bad_words = transcribe_words(sorted(source_words))
with codecs.open(dst_name, mode='w', encoding='utf-8', errors='ignore') as fp:
for cur in transcriptions:
fp.write('{0} {1}\n'.format(cur[0], cur[1]))
if len(bad_words) > 0:
with codecs.open(bad_name, mode='w', encoding='utf-8', errors='ignore') as fp:
for cur in sorted(bad_words):
fp.write('{0}\n'.format(cur))
if __name__ == '__main__':
main()