-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathirma_reader.py
71 lines (56 loc) · 1.44 KB
/
irma_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import re
def get_codes_dict(path):
    """Parse a code table file into a ``{code: description}`` dict.

    A line whose first space-separated word starts with a digit begins a
    new entry: that word is the code and the rest of the line is its
    description. Any other non-blank line is treated as a continuation and
    is appended (separated by one space) to the description of the most
    recently seen code.

    Blank and whitespace-only lines are skipped. Continuation-style lines
    that appear before the first code have nothing to attach to and are
    ignored.

    Args:
        path: Path of the text file to read.

    Returns:
        dict mapping each code string to its description string. If a code
        occurs more than once, the later entry replaces the earlier one.
    """
    codes = {}
    current = None  # code that continuation lines are attached to
    # The context manager guarantees the handle is closed, even on error.
    with open(path, 'r') as handle:
        for line in handle:
            words = line.strip().split(' ')
            first = words[0]
            if not first:
                # Empty or whitespace-only line (also covers a bare "\r\n").
                continue
            if re.match(r'\d', first):
                codes[first] = ' '.join(words[1:])
                current = first
            elif current is not None:
                text = ' '.join(words)
                previous = codes[current]
                # Keep a space between the pieces so that the last word of
                # one line is not fused with the first word of the next.
                codes[current] = previous + ' ' + text if previous else text
    return codes
def get_words_from_code(irma, codes):
    """Collect the descriptions of every prefix of a code found in ``codes``.

    The code is converted to a string once, then each prefix (from the
    empty prefix up to the full code) is looked up in ``codes``. The
    descriptions of the prefixes that are present are gathered in order of
    increasing prefix length.

    Args:
        irma: The code to look up; any value whose ``str()`` form is the
            code text (a string or a number).
        codes: Mapping from code string to description.

    Returns:
        list of the matching descriptions, shortest prefix first. The list
        is also printed to standard output.
    """
    # Normalise once so the membership test and the lookup always use the
    # same key, and so non-string codes work.
    code = str(irma)
    prefixes = (code[:end] for end in range(len(code) + 1))
    words = [codes[prefix] for prefix in prefixes if prefix in codes]
    print(words)
    return words
def get_vocab(codes, codes2=None):
    """Build a word-to-index vocabulary from code descriptions.

    Every description is tokenised by replacing each run of characters
    other than ASCII letters and digits with a space and splitting on
    whitespace. Each token not seen before is assigned the next free
    index, i.e. the vocabulary size at the moment it is added.

    Args:
        codes: Mapping whose values are description strings.
        codes2: Optional second mapping of the same form; its descriptions
            are processed after those of ``codes``.

    Returns:
        dict mapping each distinct token to its integer index. Empty
        tokens are never included.
    """
    tables = [codes]
    if codes2 is not None:
        tables.append(codes2)

    vocab = {}
    for table in tables:
        for description in table.values():
            cleaned = re.sub('[^0-9a-zA-Z]+', ' ', description)
            # split() with no argument drops empty tokens, so a blank or
            # punctuation-only description contributes nothing.
            for token in cleaned.split():
                if token not in vocab:
                    vocab[token] = len(vocab)
    return vocab
if __name__ == '__main__':
    # Script entry point: dump both code tables and the vocabulary built
    # from them. Each print call takes one pre-formatted string, so the
    # output is the same whether this runs under Python 2 or Python 3.
    codes_d = get_codes_dict('irma_code.txt')
    for key in sorted(codes_d):
        print('%s   %s' % (key, codes_d[key]))
    print('***************')
    codes_c = get_codes_dict('IRMA_C.txt')
    for key in sorted(codes_c):
        print('%s   %s' % (key, codes_c[key]))
    vocab = get_vocab(codes_d, codes_c)
    for key in sorted(vocab):
        print('%s   %s' % (key, vocab[key]))
    print(len(vocab))