splitDataSet.py
#!/usr/bin/env python
import argparse # optparse is deprecated
import math
from itertools import islice # slicing for iterators

# count how many words of hypothesis h also appear in the reference
def word_matches(h, ref):
    return sum(1 for w in h if w in ref)
# output files for the two halves of the split and their corresponding labels
train = open("dev.train", "w")
test = open("dev.test", "w")
trainlabels = open("data/dev.train.answers", "w")
testlabels = open("data/dev.test.answers", "w")
def main():
    alpha = 0.9
    parser = argparse.ArgumentParser(description='Evaluate translation hypotheses.')
    parser.add_argument('-i', '--input', default='data/hyp1-hyp2-ref',
                        help='input file (default data/hyp1-hyp2-ref)')
    parser.add_argument('-n', '--num_sentences', default=None, type=int,
                        help='Number of hypothesis pairs to evaluate')
    opts = parser.parse_args()

    # keep the raw input and label lines so they can be copied verbatim
    # into the split files
    lines = open(opts.input).readlines()
    classlines = open("data/dev.answers").readlines()

    # generator over [h1, h2, ref, label]; avoids loading all tokenised
    # sentences into one big list
    def sentences():
        with open(opts.input) as refHypo, open("data/dev.answers") as labels:
            for x, y in zip(refHypo, labels):
                x = x.strip()
                y = y.strip()
                fields = [sentence.strip().split() for sentence in x.split(' ||| ')]
                fields.append(y)
                yield fields

    def write_group(starting, ending):
        # split one group of lines [starting, ending): roughly the first 60%
        # go to the training files and the rest to the test files
        middle = starting + int(math.ceil((ending - starting) * .60))
        for i in range(starting, middle):
            train.write(lines[i])
            trainlabels.write(classlines[i])
        for i in range(middle, ending):
            test.write(lines[i])
            testlabels.write(classlines[i])

    # group consecutive hypothesis pairs that share the same reference sentence
    j = 0          # number of pairs seen so far
    prev = ""
    starting = 0   # index of the first line of the current group
    for h1, h2, ref, label in islice(sentences(), opts.num_sentences):
        if j == 0:
            prev = ref
        if ref != prev:
            # the reference changed, so lines[starting:j] form one complete group
            write_group(starting, j)
            prev = ref
            starting = j
        j += 1
    # the loop only flushes a group when the next one begins, so the final
    # group still has to be written out here
    if j > starting:
        write_group(starting, j)

    for f in (train, test, trainlabels, testlabels):
        f.close()

# convention to allow import of this file as a module
if __name__ == '__main__':
    main()
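
A quick note on the split itself: within each block of consecutive pairs that share one reference sentence, the boundary is ceil(group_size * 0.6), so the first ~60% of the block goes to dev.train and the rest to dev.test. The standalone snippet below is not part of the script; it just prints that boundary for a few group sizes:

import math

# how many lines of a group of `size` pairs end up in each split file
for size in (1, 2, 5, 10):
    middle = int(math.ceil(size * .60))
    print(size, "->", middle, "train /", size - middle, "test")
# 1 -> 1 train / 0 test
# 2 -> 2 train / 0 test
# 5 -> 3 train / 2 test
# 10 -> 6 train / 4 test

With the default paths (data/hyp1-hyp2-ref and data/dev.answers) the script is meant to be run from the repository root, e.g. python splitDataSet.py -n 100 to restrict it to the first 100 hypothesis pairs.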