'''
Author: Lemay.ai
License: GPLv3
'''
# This is a dirty hack based on the CWS generation script, one I didn't want to write, and I'm
# sorry.
# The gc causes a segfault when it goes to clean up old models, but the program runs *fine* if
# it reaches the end, because instead of doing GC it just tells the system it's done. The
# solution is therefore to avoid the GC entirely, via the dirty, dirty hack of running every
# model in a new subprocess.
import os, sys
import glob
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Turns off some *annoying* warnings; must be set before TensorFlow loads
import keras
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.special import comb
from sklearn.model_selection import train_test_split
from multiprocessing import Process, Queue
###################
# Functions
###################
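# A minimal, illustrative sketch of the subprocess-isolation pattern described at the
# top of the file, pulled out as a standalone helper (the code below inlines the same
# pattern rather than calling this). Assumes a fork-based start method, since the
# nested worker isn't picklable under spawn.
def run_in_subprocess(fn, *args):
    q = Queue()
    def worker(q):
        q.put(fn(*args))
    p = Process(target=worker, args=(q,))
    p.start()
    result = q.get() # Read before join(), so a large payload can't deadlock the child
    p.join()
    return result
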
# Modified CWS index calculation that spits out the numerator and denominator separately.
# Should be functionally identical to CWS_index below.
def CWS_index_modified(results):
    ntests, ncategories = results.shape
    possible_matches = comb(ncategories*ntests, 2)
    possible_in_col_matches = ncategories*comb(ntests, 2)
    possible_across_col_matches = possible_matches - possible_in_col_matches
    # Count all matches total.
    categories, category_counts = np.unique(results, return_counts=True)
    all_matches = 0
    for ix, cat in enumerate(categories):
        all_matches += comb(category_counts[ix], 2)
    # Count within-column matches.
    in_col_matches = 0
    for i in range(ncategories):
        col = results[:,i]
        cats, cat_counts = np.unique(col, return_counts=True)
        for ix, cat in enumerate(cats):
            in_col_matches += comb(cat_counts[ix], 2)
    in_col_mismatches = possible_in_col_matches - in_col_matches
    # Cross-column matches are then easy.
    cross_col_matches = all_matches - in_col_matches
    cross_col_mismatches = possible_across_col_matches - cross_col_matches
    # And so are the numerator and denominator.
    numerator = cross_col_mismatches / possible_across_col_matches
    denominator = in_col_mismatches / possible_in_col_matches
    return numerator, denominator

# Original CWS calculation, for reference/double-checking.
# NB: the denominator hardcodes 10 categories (languages), so this only agrees with
# CWS_index_modified when results has exactly 10 columns.
def CWS_index(results):
    ncategories = len(results[0])
    ntests = len(results)
    possible_matches = comb(ncategories*ntests, 2)
    per_col_matches = comb(ntests, 2)
    possible_across_col_matches = possible_matches - ncategories*per_col_matches
    lang_counts_by_col = []
    for i in range(ncategories):
        column = results[:,i]
        lang_counts_by_col.append(np.unique(column, return_counts=True))
    lang_counts = np.unique(results, return_counts=True)
    all_col_matches = 0
    for lang_count in lang_counts[1]:
        all_col_matches += comb(lang_count, 2)
    within_col_matches = 0
    for i in lang_counts_by_col:
        for j in range(len(i[0])):
            within_col_matches += comb(i[1][j], 2)
    actual_across_col_matches = all_col_matches - within_col_matches
    numerator = (possible_across_col_matches - actual_across_col_matches)/possible_across_col_matches
    denominator = (10*per_col_matches - within_col_matches)/(10*per_col_matches)
    CWS = numerator/denominator
    return CWS

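# Optional sanity check (not part of the original pipeline): with the default 10
# categories (which CWS_index hardcodes), the numerator/denominator pair from
# CWS_index_modified should reproduce CWS_index, since the modified version only
# factors the same ratio apart.
def check_CWS_equivalence(ntests=30, ncategories=10, seed=0):
    rng = np.random.RandomState(seed)
    results = rng.randint(0, ncategories, size=(ntests, ncategories))
    numerator, denominator = CWS_index_modified(results)
    assert np.isclose(numerator/denominator, CWS_index(results))
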
# Constructs X and Y vectors for a given df containing body, tags, and vector.
def construct_X_Y(df, vecSize=100, verbose=False):
    Y = pd.get_dummies(df['tags']).values
    vectors = df['vector'].values
    X_rows = vectors.shape[0]
    X = np.empty([X_rows,vecSize,300])
    it = range(X_rows)
    if verbose:
        it = tqdm(it, desc='generating X/Y')
    for i in it:
        if len(vectors[i][0]) == 0:
            xs = np.zeros([1,vecSize,300])
        else:
            # (tokens, 300) -> (300, tokens), so pad_sequences pads the token axis,
            # then swap back to (vecSize, 300).
            xd = np.stack(vectors[i][0], axis=0)
            xd = xd.swapaxes(0,1)
            xs = pad_sequences(xd, maxlen=vecSize, dtype='float32')
            xs = np.swapaxes(xs,0,1)
        X[i] = xs
    return X, Y

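# Toy illustration of the expected pickle layout, inferred from the code above (an
# assumption, not documentation): each row's 'vector' cell is a one-element list
# wrapping the list of 300-d token vectors for that document.
def toy_construct_X_Y_demo():
    df = pd.DataFrame({
        'body': ['hello world', 'bonjour'],
        'tags': ['en', 'fr'],
        'vector': [[[np.random.rand(300), np.random.rand(300)]],
                   [[np.random.rand(300)]]],
    })
    X, Y = construct_X_Y(df, vecSize=100)
    assert X.shape == (2, 100, 300) and Y.shape == (2, 2)
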
# Constructs a modified X-vector by concatenating the custom vector generated by the sidecar.
def attach_sidecar(X, df, vecSize=100, verbose=False):
    custom_vectors = df['vector_custom'].values
    assert X.shape[0] == custom_vectors.shape[0]
    rows = X.shape[0]
    X_cust = np.empty([rows,vecSize,vecSize])
    it = range(rows)
    if verbose:
        it = tqdm(it, desc='attaching sidecar')
    for i in it:
        if len(custom_vectors[i][0]) == 0:
            xs = np.zeros([1,vecSize,vecSize])
        else:
            xd = np.stack(custom_vectors[i][0], axis=0)
            xd = np.swapaxes(xd,0,1)
            xs = pad_sequences(xd, maxlen=vecSize, dtype='float32')
            xs = np.swapaxes(xs,0,1)
        X_cust[i] = xs
    # Concatenation magic: stack along the feature axis, taking X from
    # (rows, vecSize, 300) to (rows, vecSize, 300 + vecSize).
    X = np.swapaxes(X, 1, 2)
    X_cust = np.swapaxes(X_cust, 1, 2)
    X = np.hstack((X, X_cust))
    return np.swapaxes(X, 1, 2)

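# Quick shape check with hypothetical data (an illustration, not part of the
# pipeline): the sidecar widens the feature axis from 300 to 300 + vecSize.
def toy_attach_sidecar_demo(vecSize=100):
    df = pd.DataFrame({'vector_custom': [[[np.random.rand(vecSize)]]]})
    X = np.zeros([1, vecSize, 300])
    X_wide = attach_sidecar(X, df, vecSize=vecSize)
    assert X_wide.shape == (1, vecSize, 300 + vecSize)
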
# Does the test logic on a model.
def test_model(model, X, Y, seed=42, verbose=False):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=seed)
    # Sorts test record indexes into per-language buckets.
    # It's important for us to order by language, so the CWS calculation is easier later.
    test_by_lang = {0:[], 1:[], 2:[], 3:[], 4:[], 5:[], 6:[], 7:[], 8:[], 9:[]}
    for i in range(len(y_test)):
        for j in range(len(y_test[i])):
            if y_test[i][j] == 1:
                test_by_lang[j].append(i)
    # We *really* need every language to appear at least 30 times, since we take
    # the first 30 below.
    for i in range(10):
        assert len(test_by_lang[i]) >= 30
    # The loop below was convenient to write but it's inefficient. In theory we should
    # really just be calling model.predict on the entire test set at once.
    results = []
    it = range(10)
    if verbose:
        it = tqdm(it, desc='languages')
    for lang in it:
        lang_col = []
        it2 = test_by_lang[lang][:30]
        if verbose:
            it2 = tqdm(it2, desc='inputs')
        for lang_record in it2:
            pred = model.predict(np.array([x_test[lang_record]]))
            lang_col.append(list(pred[0]).index(max(pred[0])))
        results.append(lang_col)
    # Transpose so rows are test records (30) and columns are true languages (10),
    # with each entry the predicted language index.
    results = np.transpose(np.array(results))
    return results

# Iterates over the models in a directory, and returns their CWS scores.
def generate_CWS(model_dir, X, Y, verbose=False):
    CWS_scores = np.zeros((10, 2))
    it = range(10)
    if verbose:
        it = tqdm(it, desc='variants')
    q = Queue()
    for seed in it:
        # Each model is loaded and tested in its own subprocess (see the note at the
        # top of the file); the score comes back over the queue.
        def proc_func(q):
            model_path = os.path.join(model_dir, 'model_{}.h5'.format(seed))
            model = keras.models.load_model(model_path)
            results = test_model(model, X, Y, verbose=verbose, seed=seed)
            q.put(CWS_index_modified(results))
        p = Process(target=proc_func, args=(q,))
        p.start()
        # Read the result before join(), so a large payload can't deadlock the child.
        CWS_scores[seed] = q.get()
        p.join()
    return CWS_scores

###################
# Main
###################
def main():
    # Tuples of (data file name, model file directory, whether or not to concatenate
    # the custom vectors).
    test_data = [
        ('word2vec_vectors_window_6.pkl', 'word2vec_width_90', False), # Word2Vec on its own
        ('word2vec_vectors_window_6.pkl', 'word2vec_with_custom_window_6_width_80', True), # Word2Vec plus sidecar
        ('glove_vectors_window_4.p', 'glove_width_90', False), # GloVe on its own
        ('glove_vectors_window_4.p', 'glove_with_custom_window_4_width_80', True), # GloVe plus sidecar
        ('fastText_vectors_window_5.p', 'fasttext_width_80', False), # fastText on its own
        ('fastText_vectors_window_5.p', 'fasttext_with_custom_window_5_width_70', True) # fastText plus sidecar
    ]
    os.makedirs('CWS_csvs', exist_ok=True) # Make sure the output directory exists
    for data_file, model_dir, has_sidecar in tqdm(test_data, desc='models'):
        # The whole pipeline for each model family also runs in its own subprocess,
        # for the same GC-dodging reason as above.
        def proc_func(data_file, model_dir, has_sidecar):
            print('Reading vector file')
            df = pd.read_pickle(data_file)
            print('Constructing X and Y vectors')
            X, Y = construct_X_Y(df, verbose=True)
            if has_sidecar:
                print('Attaching sidecar')
                X = attach_sidecar(X, df, verbose=True)
            CWS_scores = generate_CWS(model_dir, X, Y, verbose=True)
            np.savetxt(os.path.join('CWS_csvs', model_dir + '_CWS.csv'), CWS_scores,
                       delimiter=',')
        p = Process(target=proc_func, args=(data_file, model_dir, has_sidecar))
        p.start()
        p.join()

if __name__ == '__main__':
    main()