# clusteringCrossValidation.py
import torch
import pickle
import clusteringTest as ct
import distanceLearningNet as dln
import prepareDataForTraining as pdft
import netClasses as nc
from importlib import reload
import numpy as np
import datetime
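# reload the project modules so that edits to them take effect when this script
# is re-run inside an interactive session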
reload(ct)
reload(dln)
reload(pdft)
reload(nc)
fname = 'models/run {}.txt'.format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
pickle_name = 'parsed_patterns.pik'
num_validation_sets = 5 # number of experiments to run
val_ratio = 0.1 # use this much of each training set for validation
feature_subset = 'all' # key indicating features to use (see prepareDataForTraining)
dim_size = 5 # dimensionality of subspace
stagnation_time = 1000 # stop training when val set doesn't improve in N iterations
batch_size = 256
percentiles = [75, 80, 85, 90, 95]  # percentiles used to estimate epsilon candidates for DBSCAN
reduce_with_pca = -1  # disabled; pre-reducing the features with PCA didn't improve results
pairs_unsimilar_factor = 1  # ratio of pairs of significant occurrences drawn from different patterns
pairs_trivial_factor = 1  # ratio of significant/trivial occurrence pairs from different patterns
pairs_intra_trivial_factor = 0  # ratio of pairs of trivial occurrences from different patterns
pairs_max_similar = 0  # limit on the size of the pair sets (0 = no limit)
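# Note: the eps_pctiles mechanism lives in clusteringTest; a minimal sketch of the
# presumed idea, assuming pairwise distances between the embedded occurrences:
#     for p in percentiles:
#         eps = np.percentile(pairwise_dists, p)   # candidate DBSCAN radius
#         DBSCAN(eps=eps).fit(embeddings)          # one clustering per candidate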
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu") # GPU doesn't really help here
# load from pickle
print("loading data from file...")
with open(pickle_name, "rb") as f:
    dat = pickle.load(f)
songs = dat[0]
pClasses = dat[1]
pOccs = dat[2]
annPClassNames = dat[3]
annPOccNames = dat[4]
genPClassNames = dat[5]
genPOccNames = dat[6]
song_to_tunefam = dat[7]
sorted_fkeys = sorted(list(pOccs.values())[0].occFeatures.keys())
tune_fams = list(set(song_to_tunefam.values()))
# split the tune families into cross-validation folds
fams_shuffle = np.array(tune_fams)
np.random.shuffle(fams_shuffle)
fams_sets = np.array_split(fams_shuffle, num_validation_sets)
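# Folds are made of whole tune families, so every occurrence of a family lands in
# exactly one fold and no test family is ever seen during training.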
all_results = []
pca_results = []
for run_num in range(1):  # change to range(num_validation_sets) to run every fold
    print("starting run {}...".format(run_num))
    test_fams = fams_sets[0]
    train_fams = np.concatenate(fams_sets[1:])
    fams_sets = np.roll(fams_sets, 1)  # rotate the folds so the next run tests on a different set
    # build pairwise similar/dissimilar training data from the training families
    train_class_names = [x for x in annPClassNames if (pClasses[x].tuneFamily in train_fams)]
    test_class_names = [x for x in annPClassNames if (pClasses[x].tuneFamily in test_fams)]
    train_gen_class_names = [x for x in genPClassNames if (pClasses[x].tuneFamily in train_fams)]
    test_gen_class_names = [x for x in genPClassNames if (pClasses[x].tuneFamily in test_fams)]

    # carve a small validation set out of the training set
    val_split_idx = int(len(train_class_names) * val_ratio)
    np.random.shuffle(train_class_names)
    val_class_names = train_class_names[:val_split_idx]
    train_class_names = train_class_names[val_split_idx:]
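    # assemble_clustering_feats (see prepareDataForTraining) presumably expands these
    # class-name lists into pairs of occurrence feature vectors with one similar/
    # dissimilar label per pair; the pairs_* factors above control the mix of pair types.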
    train_data, train_labels = pdft.assemble_clustering_feats(
        dat,
        train_class_names,
        train_gen_class_names,
        unsimilar_factor=pairs_unsimilar_factor,
        gen_factor=pairs_trivial_factor,
        intra_gen_factor=pairs_intra_trivial_factor,
        max_similar=pairs_max_similar,
        subset=feature_subset,
        reduce_with_pca=reduce_with_pca)
    val_data, val_labels = pdft.assemble_clustering_feats(
        dat,
        val_class_names,
        train_gen_class_names,
        unsimilar_factor=pairs_unsimilar_factor,
        gen_factor=pairs_trivial_factor,
        intra_gen_factor=pairs_intra_trivial_factor,
        max_similar=pairs_max_similar,
        subset=feature_subset,
        reduce_with_pca=reduce_with_pca)
    # make the model
    model = nc.FFNetDistance(num_feats=train_data.shape[-1], dim_size=dim_size)
    model.to(device)
    x_train = torch.tensor(train_data).float()
    y_train = torch.tensor(train_labels).long()
    x_val = torch.tensor(val_data).float()
    y_val = torch.tensor(val_labels).long()
    model, accs = dln.train_model((x_train, y_train), model, device,
                                  batch_size=batch_size,
                                  num_epochs=50000,
                                  stagnation_time=stagnation_time,
                                  poll_every=500,
                                  val_every=50,
                                  lr=1e-4,
                                  val_data=(x_val, y_val))
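    # train_model checks the validation pairs every val_every iterations and stops
    # early once stagnation_time iterations pass without improvement; accs presumably
    # records the accuracy history alongside the trained model.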
    # TESTING
    # save the trained model, then assemble the test occurrences
    torch.save(model.state_dict(), 'models/model{}.pt'.format(run_num))
    model.eval()  # set model to evaluation mode

    # create test set of cluster-labeled occurrences
    test_occs = []
    labels_true = []
    for i, pn in enumerate(test_class_names):
        occNames = pClasses[pn].occNames
        for on in occNames:
            test_occs.append(on)
            labels_true.append(i)
    # add noisy occurrences from the same tune families
    for pn in test_gen_class_names:
        occNames = pClasses[pn].occNames
        for on in occNames:
            test_occs.append(on)
            labels_true.append(-1)
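    # the -1 labels mark the trivial occurrences as noise, matching the convention
    # DBSCAN uses for points assigned to no cluster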
    # cluster the test occurrences in the learned embedding space and score the result
    res, emb_labellings = ct.evaluate_clustering(test_occs, labels_true, model, pOccs,
                                                 feature_subset, eps_pctiles=percentiles,
                                                 reduce_with_pca=reduce_with_pca)
    # print(res)
    all_results.append(res)

    # baseline: the same evaluation in a plain PCA-reduced feature space
    pca_res, pca_labellings = ct.evaluate_clustering_pca(test_occs, labels_true, pOccs,
                                                         n_components=dim_size, subset=feature_subset,
                                                         eps_pctiles=percentiles)
    # print(pca_res)
    pca_results.append(pca_res)
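# after all runs, all_results and pca_results hold one metrics dict per fold with the
# same key structure, so the learned embedding and the PCA baseline compare directly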
# write the results of the cross-validation to a file
with open(fname, 'a') as the_file:
    the_file.write(
        f"""
num_validation_sets:{num_validation_sets}
val_ratio:{val_ratio}
feature_subset:{feature_subset}
dim_size:{dim_size}
stagnation_time:{stagnation_time}
percentiles:{percentiles}
pairs_unsimilar_factor:{pairs_unsimilar_factor}
pairs_trivial_factor:{pairs_trivial_factor}
pairs_intra_trivial_factor:{pairs_intra_trivial_factor}
pairs_max_similar:{pairs_max_similar}"""
    )
    the_file.write('\nEMBEDDING RESULTS:\n')
    for run_key in all_results[0].keys():
        the_file.write('\n --- {} ---\n'.format(run_key))
        for key in all_results[0][run_key].keys():
            category = [x[run_key][key] for x in all_results]
            mean = np.round(np.mean(category), 3)
            stderr = np.round(np.std(category) / np.sqrt(len(all_results)), 3)  # standard error, not stdev
            the_file.write('{}: {} , {}\n'.format(key, mean, stderr))

    the_file.write('\nPCA RESULTS:\n')
    for run_key in pca_results[0].keys():
        the_file.write('\n --- {} ---\n'.format(run_key))
        for key in pca_results[0][run_key].keys():
            category = [x[run_key][key] for x in pca_results]
            mean = np.round(np.mean(category), 3)
            stderr = np.round(np.std(category) / np.sqrt(len(pca_results)), 3)  # standard error, not stdev
            the_file.write('{}: {} , {}\n'.format(key, mean, stderr))
print('done')
# plt.clf()
# #plt.plot(labels_true)
# plt.plot(emb_labellings[0])
# plt.show()