-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpart4.py
179 lines (143 loc) · 5.3 KB
/
part4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# -*- coding: utf-8 -*-
from pathlib import Path
from math import log
from sharedFunctions import estEmissions, estTransitions2, getDictionary
def predictViterbiFile(emissions, transitions, dictionary, inputFile, outputFile):
"""
Predicts sentiments using the Viterbi algorithm
If not outputFile given, saves labelled file as dev.p3.out
@param emissions: output from estEmissions function
@param transitions: output from estTransitions function
@param dictionary: output from getDictionary function
@param inputFile: name of file with unlabelled text
@param outputFile: name of file to save output of unlabelled text to
"""
with open(inputFile, encoding="utf-8") as f, open(outputFile, "w", encoding="utf-8") as out:
sentence = []
for line in f:
# form sentence
if line != "\n":
word = line.strip()
sentence.append(word)
# predict tag sequence
else:
sequence = predictViterbiList(emissions, transitions, dictionary, sentence)
for i in range(len(sequence)):
out.write("{} {}\n".format(sentence[i], sequence[i]))
out.write("\n")
sentence = []
def isMissing(child, parent, d):
"""
Returns whether child is not related to parent in dictionary d
@return: True if child not found under parent in d
"""
return (parent not in d) or (child not in d[parent]) or (d[parent][child] is None)
def predictViterbiList(emissions, transitions, dictionary, textList):
"""
Predicts sentiments for a list of words using the
Viterbi algorithm
@param emissions: output from estEmissions function
@param transitions: output from estTransitions2 function
@param dictionary: output from getDictionary function
@param textList: list of words
@return: most probable y sequence for given textList as a list
"""
# base case
tags = list(emissions.keys())
tags.append('_START')
tags.append('_STOP')
d = {}
c = {}
# b[i] = {X: {parent: b_i(parent, X)}}
d[0] = {"_START": {"_None": 0.0}}
d[1] = {"_START": {"_START": 0.0}}
# forward iterations
# Calculate log pie to combat underflow problem
for i in range(2, len(textList) + 2):
word = textList[i - 2].lower()
# Replace word with #UNK# if not in train
if word not in dictionary:
word = "#UNK#"
for n in tags:
# Skip if emission is 0
if isMissing(word, n, emissions):
continue
b = emissions[n][word]
for m in tags:
bestPie = None
grandparent = None
for l in tags:
# Skip if probability of child given parent and grandparent is 0
if isMissing(l, m, d[i - 1]):
continue
# Skip if transition pair doesnt exist
if isMissing(n, (l, m), transitions):
continue
# Calculate pie
a = transitions[(l, m)][n]
tempPie = d[i - 1][m][l] + log(a) + log(b)
if bestPie is None or tempPie > bestPie:
bestPie = tempPie
grandparent = l
# Update d
if i in d and n in d[i]:
d[i][n][m] = bestPie
elif i in d:
d[i][n] = {m: bestPie}
else:
d[i] = {n: {m: bestPie}}
# Update c
if i in c:
c[i][(m, n)] = grandparent
else:
c[i] = {(m, n): grandparent}
# stop case
bestPie = None
grandparent = None
parent = None
for m in tags:
for l in tags:
if isMissing(l, m, d[len(textList) + 1]):
continue
if isMissing("_STOP", (l, m), transitions):
continue
# Calculate pie
a = transitions[(l, m)]["_STOP"]
tempPie = d[len(textList) + 1][m][l] + log(a)
if bestPie is None or tempPie > bestPie:
bestPie = tempPie
grandparent = l
parent = m
# backtracking to get sequence
i = len(textList) + 1
if parent is None:
parent = list(d[i].keys())[0]
if grandparent is None:
grandparent = list(d[i][parent].keys())[0]
sequence = [parent, grandparent]
while True:
if isMissing((grandparent, parent), i, c):
l = list(d[i - 2].keys())[0]
else:
l = c[i][(grandparent, parent)]
if l == "_START":
break
sequence.append(l)
parent = grandparent
grandparent = l
i -= 1
sequence.reverse()
return sequence
# main
datasets = ["EN", "FR", "CN", "SG"]
for ds in datasets:
datafolder = Path(ds)
trainFile = datafolder / "train"
testFile = datafolder / "dev.in"
outputFile = datafolder / "dev.p4.out"
emissions = estEmissions(trainFile)
transitions = estTransitions2(trainFile)
dictionary = getDictionary(trainFile)
predictViterbiFile(emissions, transitions, dictionary, testFile, outputFile)
print("Output:", outputFile)
print("Done!")