# preprocessing_data.py

import re
import collections
from tensorflow.python.platform import gfile

num_movie_scripts = 10
vocabulary_size = 10000
fraction_dev = 50  # 1/fraction_dev of the sentences is held out as the dev set
path_for_x_train = 'X_train.txt'
path_for_y_train = 'y_train.txt'
path_for_x_dev = 'X_dev.txt'
path_for_y_dev = 'y_dev.txt'
_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3
_WORD_SPLIT = re.compile(b"([.,!?\":;)(])")
_DIGIT_RE = re.compile(br"\d")

# FROM DATA UTILS
# Build the dictionary mapping words to ids; rare words map to the _UNK token.
def build_dataset(words, vocabulary_size):
    count = [['_UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['_UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary
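
# Minimal usage sketch for build_dataset (hypothetical tokens, not part of
# the real pipeline). With vocabulary_size = 3 only the two most frequent
# words keep their own ids; everything else collapses to _UNK (id 0):
#   tokens = ['the', 'the', 'the', 'cat', 'cat', 'sat']
#   data, count, dictionary, _ = build_dataset(tokens, 3)
#   # dictionary == {'_UNK': 0, 'the': 1, 'cat': 2}; data == [1, 1, 1, 2, 2, 0]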

def create_vocabulary(dictionary, vocabulary_path):
    # `dictionary` here is the reverse dictionary {id: word}; write one word
    # per line, sorted by id, so that the line number equals the token id
    # (plain dict iteration order would not guarantee this).
    f = open(vocabulary_path, 'w')
    for key in sorted(dictionary):
        f.write(dictionary[key] + '\n')
    f.close()

def initialize_vocabulary(vocabulary_path):
    # Load the vocabulary file back into a word -> id mapping
    if gfile.Exists(vocabulary_path):
        rev_vocab = []
        with gfile.GFile(vocabulary_path, mode="rb") as f:
            rev_vocab.extend(f.readlines())
        rev_vocab = [line.strip() for line in rev_vocab]
        vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
        return vocab, rev_vocab
    else:
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)
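
# Round-trip sketch: writing the vocabulary and reading it back recovers the
# word -> id mapping (the file name is the one make_files uses below):
#   create_vocabulary(reverse_dictionary, 'vocabulary_for_movies.txt')
#   vocab, rev_vocab = initialize_vocabulary('vocabulary_for_movies.txt')
#   # vocab[rev_vocab[i]] == i for every id i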

def generate_encoded_files2(x_train_file, y_train_file, x_dev_file, y_dev_file,
                            tokenized_sentences, dictionary):
    """Pair consecutive sentences: sentence A goes to an X file, the sentence
    B that follows it to the matching y file."""
    encoded_holder = []
    unk_id = dictionary['_UNK']
    for sentence in tokenized_sentences:
        encoded_holder.append(encode_sentence(sentence, dictionary, unk_id))
    f1 = open(x_train_file, 'w')
    f2 = open(y_train_file, 'w')
    fraction = int(len(encoded_holder) / fraction_dev)
    if fraction % 2 != 0:
        fraction -= 1  # keep the split on a pair boundary so train and dev do not overlap
    if len(encoded_holder) % 2 == 0:
        end = len(encoded_holder)
    else:
        end = len(encoded_holder) - 1  # drop a trailing unpaired sentence
    for i in xrange(0, fraction, 2):
        f1.write(encoded_holder[i] + '\n')
        f2.write(encoded_holder[i + 1] + '\n')
    f1.close()
    f2.close()
    d1 = open(x_dev_file, 'w')
    d2 = open(y_dev_file, 'w')
    for i in xrange(fraction, end, 2):
        d1.write(encoded_holder[i] + '\n')
        d2.write(encoded_holder[i + 1] + '\n')
    d1.close()
    d2.close()
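
# Pairing sketch for generate_encoded_files2 (hypothetical file names and
# token lists; this function is an alternative to generate_encoded_files):
#   sents = [['Hi'], ['Hello'], ['How', 'are', 'you'], ['Fine']]
#   generate_encoded_files2('X.txt', 'y.txt', 'Xd.txt', 'yd.txt', sents, dictionary)
# Even-indexed sentences become the inputs, odd-indexed ones their replies.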

def generate_encoded_files(x_train_file, y_train_file, x_dev_file, y_dev_file,
                           tokenized_sentences, dictionary, fraction_dev=fraction_dev):
    """Write shifted sentence pairs: each sentence is an input in X and the
    sentence that follows it is the target in y, so line i of a y file holds
    the sentence after line i of the matching X file. The first chunk of
    pairs goes to the train files, the last 1/fraction_dev to the dev files.
    """
    encoded_holder = []
    f1 = open(x_train_file, 'w')
    last_line = tokenized_sentences.pop()
    first_line = tokenized_sentences.pop(0)
    dev_counter = int(len(tokenized_sentences) - len(tokenized_sentences) / fraction_dev)
    unk_id = dictionary['_UNK']
    first_line_encoded = encode_sentence(first_line, dictionary, unk_id)
    f1.write(first_line_encoded + '\n')
    # Create the data for X_train
    for x in xrange(dev_counter):
        encoded_sentence = encode_sentence(tokenized_sentences[x], dictionary, unk_id)
        encoded_holder.append(encoded_sentence)
        f1.write(encoded_sentence + '\n')
    f1.close()
    d1 = open(x_dev_file, 'w')
    # Create the data for X_dev
    for x in xrange(dev_counter, len(tokenized_sentences)):
        encoded_sentence = encode_sentence(tokenized_sentences[x], dictionary, unk_id)
        encoded_holder.append(encoded_sentence)
        d1.write(encoded_sentence + '\n')
    d1.close()
    # Create the data for y_train: the same sentences shifted one line ahead
    f2 = open(y_train_file, 'w')
    for x in xrange(dev_counter + 1):
        f2.write(encoded_holder[x] + '\n')
    f2.close()
    # Create the data for y_dev
    d2 = open(y_dev_file, 'w')
    for x in xrange(dev_counter + 1, len(tokenized_sentences)):
        d2.write(encoded_holder[x] + '\n')
    last_line_encoded = encode_sentence(last_line, dictionary, unk_id)
    d2.write(last_line_encoded + '\n')
    d2.close()
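
# Shift sketch for generate_encoded_files: with sentences [a, b, c, d, e, f]
# and, purely for illustration, dev_counter == 2 (with the default
# fraction_dev it is ~98% of the sentences), the files line up as
#   X_train: a b c    y_train: b c d
#   X_dev:   d e      y_dev:   e f
# so every y line is the sentence that follows the corresponding X line.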

def basic_tokenizer(sentence):
    """Very basic tokenizer: split the sentence into a list of tokens."""
    words = []
    for space_separated_fragment in sentence.strip().split():
        words.extend(re.split(_WORD_SPLIT, space_separated_fragment))
    return [w for w in words if w]
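
# Tokenizer sketch: punctuation listed in _WORD_SPLIT becomes its own token:
#   basic_tokenizer("Hello, world!") -> ['Hello', ',', 'world', '!']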

def encode_sentence(sentence, dictionary, unk_id):
    """Map each word in the sentence (a list of tokens) to its id, producing
    a space-separated string of ids; unknown words become unk_id."""
    if not sentence:
        return ""
    # dict.get looks up the id and falls back to unk_id, without mutating
    # the caller's list
    return " ".join(str(dictionary.get(word, unk_id)) for word in sentence)
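
# Encoding sketch (hypothetical ids):
#   encode_sentence(['I', 'have', 'a', 'dog'], {'I': 1, 'have': 2, 'a': 4}, 3)
#   # -> '1 2 4 3' ('dog' is missing from the dictionary, so it becomes unk_id)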

def sentence_to_token_ids(sentence, vocabulary):
    """Convert a string to a list of integers representing token ids.
    For example, the sentence "I have a dog" is tokenized into
    ["I", "have", "a", "dog"], and with vocabulary {"I": 1, "have": 2,
    "a": 4, "dog": 7} this function returns [1, 2, 4, 7].
    Returns:
      a list of integers, the token ids for the sentence.
    """
    words = basic_tokenizer(sentence)
    return [vocabulary.get(w, UNK_ID) for w in words]

def read_data(num_movie_scripts):
    # Reads every script and returns ONE flat list of tokens
    data_tokens = []
    for i in range(0, num_movie_scripts):
        path = 'data/' + str(i) + 'raw.txt'
        print 'Reading ', path, '...'
        lines = [line.rstrip('\n') for line in open(path)]
        data_tokens_temp = []
        for line in lines:
            # Split each line on whitespace and flatten into the token list
            data_tokens_temp.extend(re.findall(r'\S+', line))
        data_tokens.extend(data_tokens_temp)
    return data_tokens

# Reads the data and puts every sentence in a TWO-DIMENSIONAL array of tokens:
# data_tokens[0] = ['This', 'is', 'a', 'sentence']
def read_sentences(num_movie_scripts):
    data_tokens = []
    for i in range(0, num_movie_scripts):
        path = 'data/' + str(i) + 'raw.txt'
        print 'Reading ', path, '...'
        lines = [line.rstrip('\n') for line in open(path)]
        data_tokens_temp = []
        for line in lines:
            # Tokenize each line into its own list
            data_tokens_temp.append(re.findall(r'\S+', line))
        data_tokens.extend(data_tokens_temp)
    return data_tokens
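
# Contrast sketch: for a script file whose only line is "Hi there",
#   read_data(...)      returns ['Hi', 'there']     (one flat token list)
#   read_sentences(...) returns [['Hi', 'there']]   (one token list per line)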

def make_files(num_movie_scripts, vocabulary_size, fraction_dev=50,
               path_for_x_train='X_train.txt', path_for_y_train='y_train.txt',
               path_for_x_dev='X_dev.txt', path_for_y_dev='y_dev.txt'):
    # Generate the dictionary for the dataset
    print '------------------------------------------------'
    print ' Generating dictionary based on ', str(num_movie_scripts), ' scripts'
    print '------------------------------------------------'
    tokenized_data = read_data(num_movie_scripts)
    data, count, dictionary, reverse_dictionary = build_dataset(tokenized_data, vocabulary_size)
    create_vocabulary(reverse_dictionary, 'vocabulary_for_movies.txt')
    # Generate the encoded files using the created dictionary
    print '------------------------------------------------'
    print ' Creating encoded files using created dictionary'
    print ' (Saved in ', path_for_x_train, ')'
    print '------------------------------------------------'
    tokenized_sentences = read_sentences(num_movie_scripts)
    generate_encoded_files(path_for_x_train, path_for_y_train, path_for_x_dev,
                           path_for_y_dev, tokenized_sentences, dictionary, fraction_dev)

# ----------------------- Printing methods ----------------------------

def print_dic(dic, counter):
    # Print at most `counter` entries of the dictionary
    c = 0
    for x in dic:
        print x
        c += 1
        if c == counter:
            break

def print_info(data, count, dictionary, reverse_dictionary):
    print '-------- data'
    print data
    print '-------- count'
    print count
    print '-------- dictionary'
    print_dic(dictionary, 5)
    print dictionary
    print '-------- reverse_dictionary'
    print_dic(reverse_dictionary, 5)
    print reverse_dictionary
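
if __name__ == '__main__':
    # Hypothetical driver, assuming data/0raw.txt ... data/9raw.txt exist:
    # builds the vocabulary and writes the encoded train/dev files.
    make_files(num_movie_scripts, vocabulary_size)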