# run_eval.py
import os
import json
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pickle
import numba
import scipy.stats
from scipy.spatial import distance_matrix  # distance_matrix lives in scipy.spatial; scipy.spatial.kdtree is deprecated/removed
from run_script import data_prep
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import StratifiedKFold, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import make_pipeline
from collections import Counter
from numpy.random import default_rng
def knn_eval(X, y, n_neighbors=5, n_splits=10):
'''
This is a function that is used to evaluate the lower dimension embedding.
Accuracy is calculated by a k-nearest neighbor classifier.
Input:
X: A numpy array with the shape [N, k]. The lower dimension embedding
of some dataset. Expected to have some clusters.
y: A numpy array with the shape [N, 1]. The labels of the original
dataset.
n_neighbors: Number of neighbors considered by the classifier.
n_splits: Number of splits used in the cross validation.
Output:
acc: The avg accuracy generated by the clf, using cross val.
'''
skf = StratifiedKFold(n_splits=n_splits)
sum_acc = 0
max_acc = n_splits
for train_index, test_index in skf.split(X, y):
clf = KNeighborsClassifier(n_neighbors=n_neighbors)
clf.fit(X[train_index], y[train_index])
acc = clf.score(X[test_index], y[test_index])
sum_acc += acc
avg_acc = sum_acc/max_acc
return avg_acc
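# Example usage (illustrative sketch, not part of the original script; the names
# X_low and y below are hypothetical):
#     acc = knn_eval(X_low, y, n_neighbors=5, n_splits=10)
# acc is the mean 10-fold stratified cross-validation accuracy of a 5-NN
# classifier trained on the low-dimensional embedding.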
def knn_eval_large(X, y, n_neighbors=5, n_splits=10, seed=0, sample_size=10000):
'''
This is a function that is used to evaluate the lower dimension embedding.
Accuracy is calculated by a k-nearest neighbor classifier, evaluated on a
random subsample of each test fold to keep the cost manageable on large datasets.
Input:
X: A numpy array with the shape [N, k]. The lower dimension embedding
of some dataset. Expected to have some clusters.
y: A numpy array with the shape [N, 1]. The labels of the original
dataset.
n_neighbors: Number of neighbors considered by the classifier.
n_splits: Number of splits used in the cross validation.
seed: Seed for the random number generator used to subsample test points.
sample_size: Number of test points sampled and scored in each fold.
Output:
acc: The avg accuracy generated by the clf, using cross val.
'''
rng = np.random.default_rng(seed=seed)
skf = StratifiedKFold(n_splits=n_splits)
correct_cnt = 0 # Counter for intersection
cnt = 0
for train_index, test_index in skf.split(X, y):
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]
sample_size = min(X_test.shape[0], sample_size) # prevent overflow
indices = rng.choice(np.arange(X_test.shape[0]), size=sample_size, replace=False)
for i in indices:
index_list_neighbors = calculate_neighbors(X_train, X_test[i], n_neighbors - 1) # no self in train set
y_neighbor = y_train[index_list_neighbors]
# find the predicted value
# if there's a tie, pick one of the pred randomly
y_cnt = np.bincount(y_neighbor)
mode = np.amax(y_cnt)
y_pred = np.arange(y_cnt.shape[0])[y_cnt == mode]
y_pred = rng.choice(y_pred)
# compare the predicted with the label
if y_pred == y_test[i]:
correct_cnt += 1
cnt += 1
return correct_cnt / cnt
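# Example usage (sketch, hypothetical names): on a large dataset only a random
# subsample of each test fold is scored, so the cost grows with sample_size
# rather than with the dataset size:
#     acc = knn_eval_large(X_low, y, n_neighbors=5, n_splits=10, sample_size=1000)
# Note that labels in y are assumed to be non-negative integers (np.bincount).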
def knn_eval_series(X, y, n_neighbors_list=[1, 3, 5, 10, 15, 20, 25, 30]):
'''
This is a function that is used to evaluate the lower dimension embedding.
An accuracy is calculated by a k-nearest neighbor classifier.
A series of accuracies is calculated, one for each value in n_neighbors_list.
Input:
X: A numpy array with the shape [N, k]. The lower dimension embedding
of some dataset. Expected to have some clusters.
y: A numpy array with the shape [N, 1]. The labels of the original
dataset.
n_neighbors_list: A list of int, the neighborhood sizes to evaluate.
Output:
avg_accs: A list of average accuracies, one per neighborhood size, each
computed with stratified k-fold cross validation.
'''
avg_accs = []
for n_neighbors in n_neighbors_list:
avg_acc = knn_eval(X, y, n_neighbors)
avg_accs.append(avg_acc)
return avg_accs
def svm_eval(X, y, n_splits=10, **kwargs):
'''
This is a function that is used to evaluate the lower dimension embedding.
An accuracy is calculated by an SVM with rbf kernel.
Input:
X: A numpy array with the shape [N, k]. The lower dimension embedding
of some dataset. Expected to have some clusters.
y: A numpy array with the shape [N, 1]. The labels of the original
dataset.
kwargs: Any keyword argument that is sent into the SVM.
Output:
acc: The (avg) accuracy generated by an SVM with rbf kernel.
'''
X = scale(X)
skf = StratifiedKFold(n_splits=n_splits)
sum_acc = 0
max_acc = n_splits
for train_index, test_index in skf.split(X, y):
clf = SVC(**kwargs)
clf.fit(X[train_index], y[train_index])
acc = clf.score(X[test_index], y[test_index])
sum_acc += acc
avg_acc = sum_acc/max_acc
return avg_acc
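# Example usage (sketch, hypothetical names): the embedding is standardized with
# scale() inside the function and extra keyword arguments are forwarded to
# sklearn's SVC (rbf kernel by default), e.g.:
#     acc = svm_eval(X_low, y, n_splits=10, C=1.0)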
def svm_eval_large(X, y, n_splits=10, sample_size=100000,
seed=20200202, **kwargs):
'''
This is an accelerated version of the svm_eval function.
Training a kernel SVM is infeasible on a huge dataset, so only a random
subset of the data is used for training and testing, and the rbf kernel is
approximated with a Nystroem feature map followed by a linear SVM.
'''
X = X.astype(float)  # np.float was removed from NumPy; use the builtin float
X = scale(X)
# Subsampling X and y
rng = np.random.default_rng(seed=seed)
sample_size = min(X.shape[0], sample_size) # prevent overflow
indices = rng.choice(np.arange(X.shape[0]), size=sample_size, replace=False)
X, y = X[indices], y[indices]
# Perform standard evaluation
skf = StratifiedKFold(n_splits=n_splits)
sum_acc = 0
max_acc = n_splits
for train_index, test_index in skf.split(X, y):
feature_map_nystroem = Nystroem(gamma=1/(X.var()*X.shape[1]), n_components=300)
data_transformed = feature_map_nystroem.fit_transform(X[train_index])
clf = LinearSVC(tol=1e-5, **kwargs)
clf.fit(data_transformed, y[train_index])
test_transformed = feature_map_nystroem.transform(X[test_index])
acc = clf.score(test_transformed, y[test_index])
sum_acc += acc
avg_acc = sum_acc/max_acc
return avg_acc
def faster_svm_eval(X, y, n_splits=10, **kwargs):
'''
This is an accelerated version of the svm_eval function.
Accuracy is calculated by a linear SVM trained on a Nystroem approximation
of the rbf kernel (instead of an exact kernel SVM).
Input:
X: A numpy array with the shape [N, k]. The lower dimension embedding
of some dataset. Expected to have some clusters.
y: A numpy array with the shape [N, 1]. The labels of the original
dataset.
kwargs: Any keyword argument that is sent into the SVM.
Output:
acc: The (avg) accuracy generated by an SVM with rbf kernel.
'''
X = X.astype(float)  # np.float was removed from NumPy; use the builtin float
X = scale(X)
skf = StratifiedKFold(n_splits=n_splits)
sum_acc = 0
max_acc = n_splits
for train_index, test_index in skf.split(X, y):
feature_map_nystroem = Nystroem(gamma=1/(X.var()*X.shape[1]), n_components=300)
data_transformed = feature_map_nystroem.fit_transform(X[train_index])
clf = LinearSVC(tol=1e-5, **kwargs)
clf.fit(data_transformed, y[train_index])
test_transformed = feature_map_nystroem.transform(X[test_index])
acc = clf.score(test_transformed, y[test_index])
sum_acc += acc
avg_acc = sum_acc/max_acc
return avg_acc
def random_triplet_eval(X, X_new, num_triplets=5):
'''
This is a function that is used to evaluate the lower dimension embedding.
A triplet satisfaction score is calculated by checking how many randomly
selected triplets keep their relative distance ordering after the embedding.
Each point generates num_triplets triplets (5 by default).
Input:
X: A numpy array with the shape [N, p]. The higher dimension embedding
of some dataset. Expected to have some clusters.
X_new: A numpy array with the shape [N, k]. The lower dimension embedding
of some dataset. Expected to have some clusters as well.
num_triplets: Number of triplets sampled per point.
Output:
acc: The fraction of sampled triplets whose distance ordering is preserved.
'''
# Sampling Triplets
# Five triplet per point
anchors = np.arange(X.shape[0])
rng = default_rng()
triplets = rng.choice(anchors, (X.shape[0], num_triplets, 2))
triplet_labels = np.zeros((X.shape[0], num_triplets))
anchors = anchors.reshape((-1, 1, 1))
# Calculate the distances and generate labels
b = np.broadcast(anchors, triplets)
distances = np.empty(b.shape)
distances.flat = [np.linalg.norm(X[u] - X[v]) for (u,v) in b]
labels = distances[:, :, 0] < distances[: , :, 1]
# Calculate distances for LD
b = np.broadcast(anchors, triplets)
distances_l = np.empty(b.shape)
distances_l.flat = [np.linalg.norm(X_new[u] - X_new[v]) for (u,v) in b]
pred_vals = distances_l[:, :, 0] < distances_l[:, :, 1]
# Compare the labels and return the accuracy
correct = np.sum(pred_vals == labels)
acc = correct/X.shape[0]/num_triplets
return acc
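# Example usage (sketch, hypothetical names): compare the original data X with an
# embedding X_low; a score near 1.0 means most sampled triplets keep their
# distance ordering, while roughly 0.5 corresponds to random guessing:
#     rte = random_triplet_eval(X, X_low, num_triplets=5)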
def neighbor_kept_ratio_eval(X, X_new, n_neighbors=30):
'''
This is a function that evaluates the local structure preservation.
A nearest neighbor set is constructed in both the high dimensional space and
the low dimensional space, and the overlap between the two sets is measured.
Input:
X: A numpy array with the shape [N, p]. The higher dimension embedding
of some dataset. Expected to have some clusters.
X_new: A numpy array with the shape [N, k]. The lower dimension embedding
of some dataset. Expected to have some clusters as well.
n_neighbors: Number of neighbors considered by the algorithm.
Output:
neighbor_kept_ratio: The fraction of high-dimensional neighbors that are
also neighbors in the low-dimensional embedding.
'''
nn_hd = NearestNeighbors(n_neighbors=n_neighbors+1)
nn_ld = NearestNeighbors(n_neighbors=n_neighbors+1)
nn_hd.fit(X)
nn_ld.fit(X_new)
# Construct a k-neighbors graph, where 1 indicates a neighbor relationship
# and 0 means otherwise, resulting in a graph of the shape n * n
graph_hd = nn_hd.kneighbors_graph(X).toarray()
graph_hd -= np.eye(X.shape[0]) # Removing diagonal
graph_ld = nn_ld.kneighbors_graph(X_new).toarray()
graph_ld -= np.eye(X.shape[0]) # Removing diagonal
neighbor_kept = np.sum(graph_hd * graph_ld).astype(float)
neighbor_kept_ratio = neighbor_kept / n_neighbors / X.shape[0]
return neighbor_kept_ratio
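# Example usage (sketch, hypothetical names): fraction of each point's 30 nearest
# neighbors in X that remain among its 30 nearest neighbors in X_low. This builds
# two dense N x N neighbor graphs, so it is only practical for moderate N:
#     nk = neighbor_kept_ratio_eval(X, X_low, n_neighbors=30)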
def neighbor_kept_ratio_eval_large(X, X_new, n_neighbors=30, sample_size=10000, seed=0):
'''
This is a function that evaluates the local structure preservation.
In a large dataset, keeping a neighbor graph is infeasible as it will lead
to OOM error. Therefore, we evaluate the neighborhood using a small portion
of points as samples.
Input:
X: A numpy array with the shape [N, p]. The higher dimension embedding
of some dataset. Expected to have some clusters.
X_new: A numpy array with the shape [N, k]. The lower dimension embedding
of some dataset. Expected to have some clusters as well.
n_neighbors: Number of neighbors considered by the algorithm
sample_size: Number of sampled points considered by the algorithm.
seed: The random seed used by the random number generator.
Output:
acc: The score generated by the algorithm.
'''
rng = np.random.default_rng(seed=seed)
sample_size = min(X.shape[0], sample_size) # prevent overflow
indices = rng.choice(np.arange(X.shape[0]), size=sample_size, replace=False)
correct_cnt = 0 # Counter for intersection
for i in indices:
# Calculate the neighbors
index_list_high = calculate_neighbors(X, i, n_neighbors)
index_list_low = calculate_neighbors(X_new, i, n_neighbors)
# Calculate the intersection
correct_cnt += intersection(index_list_high, index_list_low)
correct_cnt -= sample_size # Remove self
neighbor_kept_ratio = correct_cnt / n_neighbors / sample_size
return neighbor_kept_ratio
def calculate_neighbors(X, i, n_neighbors):
'''A helper function that calculates the neighbor of a sample in a dataset.
'''
if isinstance(i, (int, np.integer)):  # indices drawn from numpy arrays are np.integer, not Python int
diff_mat = X - X[i]
else:
diff_mat = X - i # In this case, i is an instance of sample
# print(f"Shape of the diff matrix is {diff_mat.shape}")
diff_mat = np.linalg.norm(diff_mat, axis=1)
diff_mat = diff_mat.reshape(-1)
# Keep the smallest n_neighbors + 2 distances (the query point itself may be included)
index_list = np.argpartition(diff_mat, n_neighbors + 1)[:n_neighbors+2]
return index_list
def intersection(index_list1, index_list2):
'''A helper function that calculates the intersection between two different
list of indices, with O(n) complexity.'''
index_dict = {}
for i in range(len(index_list1)):
index_dict[index_list1[i]] = 1
cnt = 0
for i in range(len(index_list2)):
if index_list2[i] in index_dict:
cnt += 1
return cnt
def neighbor_kept_ratio_series_eval(X, X_news, n_neighbors=30):
nn_hd = NearestNeighbors(n_neighbors=n_neighbors+1)
nn_hd.fit(X)
graph_hd = nn_hd.kneighbors_graph(X).toarray()
graph_hd -= np.eye(X.shape[0]) # Removing diagonal
nk_ratios = []
for X_new in X_news:
nn_ld = NearestNeighbors(n_neighbors=n_neighbors+1)
nn_ld.fit(X_new)
graph_ld = nn_ld.kneighbors_graph(X_new).toarray()
graph_ld -= np.eye(X.shape[0]) # Removing diagonal
neighbor_kept = np.sum(graph_hd * graph_ld).astype(float)
neighbor_kept_ratio = neighbor_kept / n_neighbors / X.shape[0]
nk_ratios.append(neighbor_kept_ratio)
return nk_ratios
def neighbor_kept_ratio_series_eval_fast(X, X_news, n_neighbors=30):
nn_hd = NearestNeighbors(n_neighbors=n_neighbors+1)
nn_hd.fit(X)
graph_hd = nn_hd.kneighbors(X, return_distance=False)
graph_hd = graph_hd[:, 1:] # Remove itself
nk_ratios = []
for X_new in X_news:
nn_ld = NearestNeighbors(n_neighbors=n_neighbors+1)
nn_ld.fit(X_new)
graph_ld = nn_ld.kneighbors(X_new, return_distance=False)
graph_ld = graph_ld[:, 1:] # Remove itself
neighbor_kept = 0
for i in range(graph_hd.shape[0]):
neighbor_kept += len(np.intersect1d(graph_hd[i], graph_ld[i]))
neighbor_kept_ratio = neighbor_kept / n_neighbors / X.shape[0]
nk_ratios.append(neighbor_kept_ratio)
return nk_ratios
def eval_reduction(dataset_name, methods):
print(f'Evaluating {dataset_name}')
X, y = data_prep(dataset_name)
X = X.astype(float)
for method in methods:
# Check if the file exists
print(method)
if not os.path.exists(f'./output/{dataset_name}_{method}.npy'):
continue
# Unsupervised eval
# NK Ratio
print('Nearest Neighbor Kept')
X_lows = np.load(
f'./output/{dataset_name}_{method}.npy', allow_pickle=True)
nk_ratios = neighbor_kept_ratio_series_eval_fast(X, X_lows)
nk_ratios = np.array(nk_ratios)
np.save(f'./results/{dataset_name}_{method}_nkratios.npy', nk_ratios)
# RT Ratio
print('Random Triplet Accuracy')
rte_ratios = []
for X_low in X_lows:
rte_ratio = random_triplet_eval(X, X_low)
rte_ratios.append(rte_ratio)
rte_ratios = np.array(rte_ratios)
np.save(f'./results/{dataset_name}_{method}_rteratios.npy', rte_ratios)
# Supervised eval
# KNN Acc
print('KNN Accuracy')
knn_accs = []
for X_low in X_lows:
knn_acc = knn_eval(X_low, y)
knn_accs.append(knn_acc)
np.save(f'./results/{dataset_name}_{method}_knnaccs.npy', knn_accs)
print('SVM Accuracy')
svm_accs = []
for X_low in X_lows:
svm_acc = svm_eval(X_low, y)
svm_accs.append(svm_acc)
np.save(f'./results/{dataset_name}_{method}_svmaccs.npy', svm_accs)
print('---------')
print('Finished Successfully')
def eval_reduction_unsupervised(dataset_name, methods):
print(f'Evaluating {dataset_name}')
X, y = data_prep(dataset_name)
for method in methods:
# Check if the file exists
print(method)
if not os.path.exists(f'./output/{dataset_name}_{method}.npy'):
continue
# Unsupervised eval
# NK Ratio
print('Nearest Neighbor Kept')
X_lows = np.load(
f'./output/{dataset_name}_{method}.npy', allow_pickle=True)
nk_ratios = neighbor_kept_ratio_series_eval_fast(X, X_lows)
nk_ratios = np.array(nk_ratios)
np.save(f'./results/{dataset_name}_{method}_nkratios.npy', nk_ratios)
# Random Triplet Accuracy
print('Random Triplet Accuracy')
rte_ratios = []
for X_low in X_lows:
rte_ratio = random_triplet_eval(X, X_low)
rte_ratios.append(rte_ratio)
rte_ratios = np.array(rte_ratios)
np.save(f'./results/{dataset_name}_{method}_rteratios.npy', rte_ratios)
print('---------')
print('Finished Successfully')
def eval_reduction_onepercent(dataset_name, methods):
print(f'Evaluating {dataset_name}')
X, y = data_prep(dataset_name)
for method in methods:
# Check if the file exists
print(method)
if not os.path.exists(f'./output/{dataset_name}_{method}.npy'):
continue
# Unsupervised eval
# NK Ratio
print('Nearest Neighbor Kept')
X_lows = np.load(
f'./output/{dataset_name}_{method}.npy', allow_pickle=True)
n_neighbors = int(X.shape[0] * 0.01) # 1% of the data
nk_ratios = neighbor_kept_ratio_series_eval_fast(X, X_lows, n_neighbors=n_neighbors)
nk_ratios = np.array(nk_ratios)
np.save(f'./results/{dataset_name}_{method}_nkratios_onepercent.npy', nk_ratios)
print('Finished Successfully')
def spearman_correlation_eval(X, X_new, n_points=1000, random_seed=100):
'''Evaluate the global structure of an embedding via the Spearman correlation
between pairwise distance matrices, following https://www.nature.com/articles/s41467-019-13056-x
'''
# Fix the random seed to ensure reproducibility
rng = np.random.default_rng(seed=random_seed)
dataset_size = X.shape[0]
# Sample n_points points from the dataset randomly
sample_index = rng.choice(np.arange(dataset_size), size=n_points, replace=False)
# Generate the distance matrix in high dim and low dim
dist_high = distance_matrix(X[sample_index], X[sample_index])
dist_low = distance_matrix(X_new[sample_index], X_new[sample_index])
dist_high = dist_high.reshape([-1])
dist_low = dist_low.reshape([-1])
# Calculate the correlation
corr, pval = scipy.stats.spearmanr(dist_high, dist_low)
return dist_high, dist_low, corr, pval
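# Example usage (sketch, hypothetical names): Spearman correlation between the
# pairwise distances of 1000 randomly sampled points in the original space and in
# the embedding (a global-structure measure; requires at least n_points samples):
#     _, _, corr, pval = spearman_correlation_eval(X, X_low, n_points=1000)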
def spearman_correlation_series_eval(X, X_news, n_points=1000, random_seed=100):
corrs = []
pvals = []
dist_highs = []
dist_lows = []
for i in range(len(X_news)):
X_new = X_news[i]
dist_high, dist_low, corr, pval = spearman_correlation_eval(X, X_new, n_points, random_seed)
corrs.append(corr)
pvals.append(pval)
dist_highs.append(dist_high)
dist_lows.append(dist_low)
corrs = np.array(corrs)
pvals = np.array(pvals)
dist_highs = np.array(dist_highs)
dist_lows = np.array(dist_lows)
return corrs, pvals, dist_highs, dist_lows
def centroid_knn_eval(X, X_new, y, k):
'''Evaluate the global structure of an embedding via the KNC metric:
neighborhood preservation for cluster centroids, following
https://www.nature.com/articles/s41467-019-13056-x
'''
# Calculating the cluster centers
cluster_mean_ori, cluster_mean_new = [], []
categories = np.unique(y)
num_cat = len(categories)
cluster_mean_ori = np.zeros((num_cat, X.shape[1]))
cluster_mean_new = np.zeros((num_cat, X_new.shape[1]))
cnt_ori = np.zeros(num_cat) # number of instances for each class
# Only loop through the whole dataset once
for i in range(X.shape[0]):
ylabel = int(y[i])
cluster_mean_ori[ylabel] += X[i]
cluster_mean_new[ylabel] += X_new[i]
cnt_ori[ylabel] += 1
cluster_mean_ori = ((cluster_mean_ori.T)/cnt_ori).T
cluster_mean_new = ((cluster_mean_new.T)/cnt_ori).T
# Generate the nearest neighbor list in the high dimension
nbrs = NearestNeighbors(n_neighbors=k+1).fit(cluster_mean_ori)
_, indices = nbrs.kneighbors(cluster_mean_ori)
indices = indices[:,1:] # Remove the center itself
# Now for the low dimension
nbr_low = NearestNeighbors(n_neighbors=k+1).fit(cluster_mean_new)
_, indices_low = nbr_low.kneighbors(cluster_mean_new)
indices_low = indices_low[:,1:] # Remove the center itself
# Calculate the intersection of both lists
len_neighbor_list = k * num_cat
both_nbrs = 0
# for each category, check each of its indices
for i in range(num_cat):
for j in range(k):
if indices[i, j] in indices_low[i, :]:
both_nbrs += 1
# Compare both lists and generate the accuracy
return both_nbrs/len_neighbor_list
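# Example usage (sketch, hypothetical names): with integer class labels y in
# {0, ..., C-1}, measure how many of each class centroid's k nearest centroids in
# the original space are preserved among its k nearest centroids in the embedding:
#     knc = centroid_knn_eval(X, X_low, y, k=3)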
def centroid_knn_series_eval(X, X_news, y, k):
accs = []
for i in range(len(X_news)):
X_new = X_news[i]
acc = centroid_knn_eval(X, X_new, y, k)
accs.append(acc)
accs = np.array(accs)
return accs
def centroid_corr_eval(X, X_new, y, k):
'''Evaluate the global structure of an embedding via the Spearman correlation
of the pairwise distances between cluster centroids, following
https://www.nature.com/articles/s41467-019-13056-x
'''
# Calculating the cluster centers
cluster_mean_ori, cluster_mean_new = [], []
categories = np.unique(y)
num_cat = len(categories)
cluster_mean_ori = np.zeros((num_cat, X.shape[1]))
cluster_mean_new = np.zeros((num_cat, X_new.shape[1]))
cnt_ori = np.zeros(num_cat) # number of instances for each class
# Only loop through the whole dataset once
for i in range(X.shape[0]):
ylabel = int(y[i])
cluster_mean_ori[ylabel] += X[i]
cluster_mean_new[ylabel] += X_new[i]
cnt_ori[ylabel] += 1
cluster_mean_ori = ((cluster_mean_ori.T)/cnt_ori).T
cluster_mean_new = ((cluster_mean_new.T)/cnt_ori).T
# Generate the distance matrix in high dim and low dim
dist_high = distance_matrix(cluster_mean_ori, cluster_mean_ori)
dist_low = distance_matrix(cluster_mean_new, cluster_mean_new)
dist_high = dist_high.reshape([-1])
dist_low = dist_low.reshape([-1])
# Calculate the correlation
corr, pval = scipy.stats.spearmanr(dist_high, dist_low)
return dist_high, dist_low, corr, pval
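# Example usage (sketch, hypothetical names): Spearman correlation between the
# centroid distance matrices of the original space and the embedding; note that
# the k argument is accepted for interface consistency but is not used here:
#     _, _, corr, _ = centroid_corr_eval(X, X_low, y, k=3)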
def eval_reduction_additional_ii(dataset_name, methods):
print(f'Evaluating {dataset_name}')
X, y = data_prep(dataset_name)
if y.shape[0] == 10:
supervised = False # Not supervised
num_categories = 0
else:
supervised = True
num_categories = len(np.unique(y))
k = min((num_categories + 2) // 4, 10) # maximum of 10
for method in methods:
# Check if the file exists
print(method)
# Skip the evaluation if the method fails to generate a result
if not os.path.exists(f'./output/{dataset_name}_{method}.npy'):
print('Result does not exist')
continue
X_lows = np.load(
f'./output/{dataset_name}_{method}.npy', allow_pickle=True)
# Spearman correlation of the centroid distances -- supervised
if supervised:
print('Centroid Spearman')
corrs = []
for i in range(5):
_, _, corr, _ = centroid_corr_eval(X, X_lows[i], y, k) # k is derived from the number of categories
corrs.append(corr)
corrs = np.array(corrs)
np.save(f'./results/{dataset_name}_{method}_centroidcorr.npy', corrs)
print('Finished Successfully')
def eval_reduction_additional(dataset_name, methods):
print(f'Evaluating {dataset_name}')
X, y = data_prep(dataset_name)
if y.shape[0] == 10:
supervised = False # Not supervised
num_categories = 0
else:
supervised = True
num_categories = len(np.unique(y))
k = min((num_categories + 2) // 4, 10) # maximum of 10
for method in methods:
# Check if the file exists
print(method)
# Skip the evaluation if the method fails to generate a result
if not os.path.exists(f'./output/{dataset_name}_{method}.npy'):
print('Result does not exist')
continue
# Spearman Correlation of the distance matrix -- unsupervised
print('Spearman Correlation')
X_lows = np.load(
f'./output/{dataset_name}_{method}.npy', allow_pickle=True)
corrs, pvals, dist_highs, dist_lows = spearman_correlation_series_eval(X, X_lows) # default to 1000 points
corrs = np.array(corrs)
np.save(f'./results/{dataset_name}_{method}_spearmancorr.npy', corrs)
# Nearest neighbor preservation of the centroids -- supervised
if supervised:
print('Centroid KNN')
accs = centroid_knn_series_eval(X, X_lows, y, k) # k is derived from the number of categories
np.save(f'./results/{dataset_name}_{method}_centroidknn.npy', accs)
print('Finished Successfully')
def eval_reduction_large(dataset_name, methods, metric=0):
print(f'Evaluating {dataset_name}')
X, y = data_prep(dataset_name)
for method in methods:
# Check if the file exists
print(method)
if not os.path.exists(f'./output/{dataset_name}_{method}.npy'):
continue
# Unsupervised eval
# NK Ratio
X_lows = np.load(
f'./output/{dataset_name}_{method}.npy', allow_pickle=True)
if metric == 0:
print('Nearest Neighbor Kept')
nk_ratios = []
for i in range(5):
nk_ratio = neighbor_kept_ratio_eval_large(X, X_lows[i], sample_size=1000) # 10000 is way too slow
nk_ratios.append(nk_ratio)
nk_ratios = np.array(nk_ratios)
np.save(f'./results/{dataset_name}_{method}_nkratios.npy', nk_ratios)
# RT Ratio
elif metric == 1:
print('Random Triplet Accuracy')
rte_ratios = []
for X_low in X_lows:
rte_ratio = random_triplet_eval(X, X_low)
rte_ratios.append(rte_ratio)
rte_ratios = np.array(rte_ratios)
np.save(f'./results/{dataset_name}_{method}_rteratios.npy', rte_ratios)
# Supervised eval
# KNN Acc
elif metric == 2:
print('KNN Accuracy')
knn_accs = []
for X_low in X_lows:
knn_acc = knn_eval_large(X_low, y, sample_size=200) # 10-fold, each evaluated with 200 samples
knn_accs.append(knn_acc)
np.save(f'./results/{dataset_name}_{method}_knnaccs.npy', knn_accs)
# SVM Acc
elif metric == 3:
print('SVM Accuracy')
svm_accs = []
for X_low in X_lows:
svm_acc = svm_eval_large(X_low, y) # 10-fold evaluated on a subset of 100000 samples
svm_accs.append(svm_acc)
np.save(f'./results/{dataset_name}_{method}_svmaccs.npy', svm_accs)
print('---------')
print('Finished Successfully')
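# Minimal smoke test (illustrative sketch, not part of the original script):
# build a synthetic dataset and a PCA "embedding", then run a few of the metrics
# above. Real experiments instead load saved embeddings from ./output/ via the
# eval_reduction* drivers.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    y_demo = np.repeat(np.arange(5), 100)                    # 5 classes, 100 points each
    X_demo = rng.normal(size=(500, 20)) + y_demo[:, None]    # class-dependent shift
    X_low_demo = PCA(n_components=2).fit_transform(X_demo)   # stand-in embedding
    print('knn acc       :', knn_eval(X_low_demo, y_demo))
    print('svm acc       :', svm_eval(X_low_demo, y_demo))
    print('triplet score :', random_triplet_eval(X_demo, X_low_demo))
    print('neighbor kept :', neighbor_kept_ratio_eval(X_demo, X_low_demo))
    print('centroid knn  :', centroid_knn_eval(X_demo, X_low_demo, y_demo, 3))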