-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpart3.py
156 lines (119 loc) · 4.51 KB
/
part3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from pathlib import Path
from math import log
from sharedFunctions import estEmissions, estTransitions, getDictionary
def predictViterbiFile(emissions, transitions, dictionary, inputFile, outputFile):
    """
    Predicts sentiments using the Viterbi algorithm and saves them to a file

    The input file is read one token per line; a line holding only a
    newline ends the current sentence. Each token is written back as
    "<token> <tag>" and each sentence is followed by an empty line.
    A final sentence that is not followed by an empty line is still
    labelled and written.

    @param emissions: output from estEmissions function
    @param transitions: output from estTransitions function
    @param dictionary: output from getDictionary function
    @param inputFile: name of file with unlabelled text
    @param outputFile: name of file to save the labelled text to (required)
    """

    def writeSentence(out, sentence):
        # Tag one complete sentence and write it in "<token> <tag>" form
        sequence = predictViterbiList(emissions, transitions, dictionary, sentence)
        for word, tag in zip(sentence, sequence):
            out.write("{} {}\n".format(word, tag))
        out.write("\n")

    with open(inputFile) as f, open(outputFile, "w") as out:
        sentence = []
        for line in f:
            if line != "\n":
                # still inside a sentence: collect the token
                sentence.append(line.strip())
                continue
            # sentence separator: predict the tag sequence and start afresh
            writeSentence(out, sentence)
            sentence = []

        # The file may end without a trailing empty line; without this the
        # last sentence would never be labelled.
        if sentence:
            writeSentence(out, sentence)
def isMissing(child, parent, d):
    """
    Returns whether child is not related to parent in dictionary d

    A parent that has no entry in d at all is treated the same as a
    parent that does not contain child, instead of raising KeyError.

    @param child: inner key to look for
    @param parent: outer key of d
    @param d: nested dictionary indexed as d[parent][child]
    @return: True if parent is not in d, child is not found under parent
             in d, or the stored value equals 0
    """
    if parent not in d:
        return True
    children = d[parent]
    return child not in children or children[child] == 0
def predictViterbiList(emissions, transitions, dictionary, textList):
    """
    Predicts sentiments for a list of words using the
    Viterbi algorithm

    Scores are sums of log probabilities to combat the underflow problem.
    Words are lower-cased and replaced by "#UNK#" when they are not in
    dictionary. A probability that is absent or equal to 0 is treated as
    "not allowed". When no scored path reaches a position, the first
    candidate tag of that position is used so that one tag is still
    returned for every word.

    @param emissions: output from estEmissions function, indexed here as
                      emissions[tag][word]
    @param transitions: output from estTransitions function, indexed here
                        as transitions[prevTag][nextTag] with the special
                        tags "_START" and "_STOP"
    @param dictionary: output from getDictionary function (only used for
                       membership tests)
    @param textList: list of words
    @return: most probable y sequence for given textList as a list
    @raise ValueError: if textList is not empty but emissions has no tags
    """
    tags = list(emissions)
    if textList and not tags:
        raise ValueError("emissions contains no tags to label the words with")

    # columns[i] maps every candidate tag at position i to a pair
    # (log score, best previous tag). The score is None when no path with
    # non-zero probability reaches that tag.
    columns = [{"_START": (0.0, None)}]

    # forward iterations
    for rawWord in textList:
        word = rawWord.lower()
        # Replace word with #UNK# if not in train
        if word not in dictionary:
            word = "#UNK#"

        previous = columns[-1]
        column = {}
        for currTag in tags:
            # Check that word can be emitted from currTag
            b = emissions[currTag].get(word, 0)
            if b == 0:
                continue
            logB = log(b)  # invariant over the previous tags

            bestScore = None
            bestPrev = None
            for prevTag, (prevScore, _) in previous.items():
                if prevScore is None:
                    continue
                # A previous tag without a transitions row cannot lead anywhere
                a = transitions.get(prevTag, {}).get(currTag, 0)
                if a == 0:
                    continue
                score = prevScore + log(a) + logB
                if bestScore is None or score > bestScore:
                    bestScore = score
                    bestPrev = prevTag
            column[currTag] = (bestScore, bestPrev)

        if not column:
            # No tag can emit this word: keep every tag as an unscored
            # candidate so later positions and backtracking still work.
            column = {tag: (None, None) for tag in tags}
        columns.append(column)

    # stop case
    bestScore = None
    bestPrev = None
    for prevTag, (prevScore, _) in columns[-1].items():
        # Check prev can lead to a stop
        a = transitions.get(prevTag, {}).get("_STOP", 0)
        if a == 0 or prevScore is None:
            continue
        score = prevScore + log(a)
        if bestScore is None or score > bestScore:
            bestScore = score
            bestPrev = prevTag
    columns.append({"_STOP": (bestScore, bestPrev)})

    # backtracking to get sequence
    sequence = []
    curr = "_STOP"
    for i in range(len(textList), -1, -1):
        parent = columns[i + 1][curr][1]
        if parent is None:
            # Dead end: fall back to the first candidate of this position
            parent = next(iter(columns[i]))
        if parent == "_START":
            break
        sequence.append(parent)
        curr = parent
    sequence.reverse()
    return sequence
# main
# For every dataset folder: build the model from its "train" file, label
# its "dev.in" file and save the result next to it as "dev.p3.out".
datasets = ["EN", "FR", "CN", "SG"]
for ds in datasets:
    datafolder = Path(ds)
    trainFile = datafolder.joinpath("train")
    testFile = datafolder.joinpath("dev.in")
    outputFile = datafolder.joinpath("dev.p3.out")

    # Model inputs, all derived from the same training file
    emissions = estEmissions(trainFile)
    transitions = estTransitions(trainFile)
    dictionary = getDictionary(trainFile)

    predictViterbiFile(
        emissions=emissions,
        transitions=transitions,
        dictionary=dictionary,
        inputFile=testFile,
        outputFile=outputFile,
    )
    print("Output:", outputFile)

print("Done!")