-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_reader.py
127 lines (97 loc) · 3.98 KB
/
data_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import xml.etree.ElementTree as etree
import operator
class OpenCorporaReader:
    """Reader for an OpenCorpora XML dump.

    Parses the dump into ``self.sentences``: a list of sentences, each a
    list of ``(word, speech_part)`` tuples, while accumulating per-character
    and per-speech-part frequency counts. Sentences containing Latin
    letters are discarded wholesale.
    """

    # Grammeme codes treated as non-words (punctuation, numbers, symbols,
    # Latin and Roman-numeral tokens); such tokens are skipped.
    SKIPPED_SPEECH_PARTS = {'PNCT', 'NUMB', 'SYMB', 'LATN', 'ROMN'}
    # Code points of lowercase ASCII letters 'a'..'z' inclusive. A word
    # containing any of these invalidates its whole sentence.
    # (`+ 1` matters: `range` excludes its stop value, so without it the
    # letter 'z' would slip through the filter.)
    STOP_CHARS_CODES = range(ord('a'), ord('z') + 1)

    def __init__(self, xml_filename):
        """Remember the dump path; no parsing happens until ``load()``."""
        self._xml_filename = xml_filename
        self.sentences = []           # list[list[tuple[str, str]]]
        self._uniq_chars = {}         # char -> occurrence count
        self._uniq_speech_parts = {}  # speech part -> occurrence count
        self._loaded = False

    def _get_tokens(self, xml_filename):
        """Parse the XML file and return all ``<tokens>`` elements."""
        tree = etree.parse(xml_filename)
        return tree.findall('.//tokens')

    def _word_has_stop_chars(self, word):
        """Return True if *word* contains any lowercase ASCII letter."""
        return any(ord(c) in self.STOP_CHARS_CODES for c in word)

    def _add_uniq_chars(self, word):
        """Count every character of *word* into the char-frequency map."""
        for c in word:
            self._uniq_chars[c] = self._uniq_chars.get(c, 0) + 1

    def _add_uniq_speech_parts(self, speech_part):
        """Count one occurrence of *speech_part*."""
        count = self._uniq_speech_parts.get(speech_part, 0)
        self._uniq_speech_parts[speech_part] = count + 1

    def _get_sentence(self, tokens_entry):
        """Build one sentence from a ``<tokens>`` element.

        Returns a list of ``(word, speech_part)`` tuples, lower-casing each
        word and updating the frequency maps as a side effect.

        Raises:
            ValueError: if any kept word contains a stop (Latin) character;
                the caller drops the entire sentence in that case.
        """
        sentence = []
        for token in tokens_entry:
            # First <g> grammeme of the token is taken as its speech part.
            speech_part = token.find('.//g').attrib['v']
            if speech_part in self.SKIPPED_SPEECH_PARTS:
                continue
            word = token.attrib['text'].lower()
            if self._word_has_stop_chars(word):
                raise ValueError('sentence has invalid chars')
            self._add_uniq_chars(word)
            self._add_uniq_speech_parts(speech_part)
            sentence.append((word, speech_part))
        return sentence

    def _get_sentences(self, tokens):
        """Convert all ``<tokens>`` elements, silently dropping invalid ones."""
        sentences = []
        for tokens_entry in tokens:
            try:
                sentences.append(self._get_sentence(tokens_entry))
            except ValueError:
                # Deliberate best-effort: a sentence with Latin chars is
                # simply excluded from the corpus.
                pass
        return sentences

    def load(self):
        """Parse the XML dump and populate sentences and frequency maps."""
        tokens = self._get_tokens(self._xml_filename)
        self.sentences = self._get_sentences(tokens)
        self._loaded = True

    def _check_loaded(self):
        """Raise if accessor methods are used before ``load()``."""
        # RuntimeError (not BaseException) so ordinary `except Exception`
        # handlers can catch it; still caught by any pre-existing
        # `except BaseException` callers.
        if not self._loaded:
            raise RuntimeError('data is not loaded')

    def get_uniq_chars(self):
        """Return a view of all distinct characters seen."""
        self._check_loaded()
        return self._uniq_chars.keys()

    def get_uniq_speech_parts(self):
        """Return a view of all distinct speech parts seen."""
        self._check_loaded()
        return self._uniq_speech_parts.keys()

    def get_longest_sentence(self):
        """Return the sentence with the most tokens."""
        self._check_loaded()
        return max(self.sentences, key=len)

    def get_chars_freq(self):
        """Return (char, count) pairs sorted by ascending count."""
        self._check_loaded()
        return sorted(self._uniq_chars.items(), key=operator.itemgetter(1))

    def get_speech_parts_freq(self):
        """Return (speech_part, count) pairs sorted by ascending count."""
        self._check_loaded()
        return sorted(self._uniq_speech_parts.items(), key=operator.itemgetter(1))

    def get_longest_word(self):
        """Return the longest word across all sentences ('' if none)."""
        self._check_loaded()
        max_length = 0
        longest_word = ''
        for sentence in self.sentences:
            for word, _speech_part in sentence:
                if len(word) > max_length:
                    max_length = len(word)
                    longest_word = word
        return longest_word
if __name__ == '__main__':
    from download_data import OPEN_CORPORA_DEST_FILE

    # Parse the downloaded corpus and print summary statistics.
    reader = OpenCorporaReader(OPEN_CORPORA_DEST_FILE)
    reader.load()

    print('unique chars', reader.get_chars_freq())
    print('unique speech_parts', reader.get_speech_parts_freq())
    print('sentences count: ', len(reader.sentences))

    biggest_sentence = reader.get_longest_sentence()
    print('longest sentence: ', biggest_sentence)
    print('max sentence length: ', len(biggest_sentence))

    biggest_word = reader.get_longest_word()
    print('longest word: ', biggest_word)
    print('longest word chars: ', len(biggest_word))