-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscan.py
166 lines (143 loc) · 6.05 KB
/
scan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import numpy as np
import multiprocessing as mp
import csv
import glob
import os
import re
# sklearn items
from sklearn.metrics import roc_curve, auc, average_precision_score, f1_score
#
from imblearn.metrics import geometric_mean_score
from keras import backend as K
import tensorflow as tf
# TODO: clear_session or del model
# TODO: separate _output_setup method as a class
# TODO: clever way to create results header
class Scan:
    """Run a parameter search for one model and log the metrics of every run to a CSV file.

    Constructing an instance performs the whole search: it creates a fresh
    round directory under ``result_dir``, writes the CSV header, then trains
    and evaluates the model once per entry of ``params_search.params_grid``.

    Attributes set by the constructor:
        result_dir: directory that holds one sub-directory per search round.
        round_dir:  directory of the current round (``<dataset_name>_<n>``).
        round_fp:   path of the CSV file the results are appended to.
    """

    # Metric names in the order ``model_predict`` reports them. The CSV header
    # is derived from this tuple, so the train and validation columns always
    # line up with the values written by ``worker``.
    METRIC_NAMES = (
        'Loss',
        'PR_AUC',
        'ROC_AUC',
        'F_score',
        'G_mean',
        'Expeted_#_D60',
        'Actual_#_D60',
        'Diff_D60',
        'Ratio_D60',
    )

    def __init__(self, X_train, y_train, X_val, y_val, params_search, dataset_name, model,
                 *, parallel=True):
        """
        :param X_train: training features, passed through to ``model`` and ``model_predict``
        :param y_train: training labels
        :param X_val: validation features
        :param y_val: validation labels
        :param params_search: object exposing ``params_grid`` (iterable of parameter
            mappings) and ``params_name`` (column names for those parameters)
        :param dataset_name: name used for the round directory and the CSV file
        :param model: callable invoked as
            ``model(X_train, y_train, X_val, y_val, params, idx, round_dir)`` and
            returning ``(trained_model, stopped_epoch)``
        :param parallel: when True (the default, matching the previous behaviour)
            evaluate the grid with a process pool; when False run it sequentially
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.params_grid = params_search.params_grid
        self.params_name = params_search.params_name
        self.dataset_name = dataset_name
        self.model = model
        # os.path.join yields '.\\Results' on Windows (as before) and './Results' elsewhere.
        self.result_dir = os.path.join('.', 'Results')
        self.round_dir = ''
        self.round_fp = ''
        # Sets round_dir / round_fp and creates the round directory under result_dir.
        self._output_setup()
        self._write_results_header()
        if parallel:
            self.mp_handler()
        else:
            self._run_search()

    def _append_results(self, results):
        """Append every row produced by ``results`` to the round CSV file.

        Each row is flushed immediately so that completed runs survive a crash
        of a later run.

        :param results: iterable of rows (lists) as returned by ``worker``
        """
        with open(self.round_fp, 'a', newline='') as f:
            res_writer = csv.writer(f, dialect='excel', delimiter=',')
            for result in results:
                print(f'Saving results to {self.round_fp}')
                res_writer.writerow(result)
                f.flush()

    def _run_search(self):
        """
        For-loop to iterate over different combs of parameters (single process)
        """
        self._append_results(self.worker(job) for job in enumerate(self.params_grid))

    def worker(self, args):
        """
        Train the model and predict on the training set and the validation set

        :param args: a tuple ``(idx, params)``; ``params`` must be a mapping because
            its values are written as the trailing CSV columns
        :return: a list, containing the run index, the metrics for the training set
            and validation set, the stopped epoch, and the corresponding combo of
            parameters
        """
        idx, params = args
        trained_model, stopped_epoch = self.model(self.X_train, self.y_train,
                                                  self.X_val, self.y_val,
                                                  params, idx, self.round_dir)
        metrics_tra = self.model_predict(trained_model, self.X_train, self.y_train)
        metrics_val = self.model_predict(trained_model, self.X_val, self.y_val)
        return [
            str(idx),
            *self._collect_results(metrics_tra),
            *self._collect_results(metrics_val),
            str(stopped_epoch),
            *self._collect_results(params),
        ]

    def mp_handler(self):
        """
        Multiprocess version to iterate over different combs of parameters

        Uses one worker process per CPU core. ``imap`` yields results in grid
        order, so the CSV rows keep the order of ``params_grid``.
        """
        cores = mp.cpu_count()
        with mp.Pool(cores) as p:
            self._append_results(p.imap(self.worker, enumerate(self.params_grid)))

    @staticmethod
    def model_predict(model, X, y, just_d60=False, threshold=0.05):
        """
        perform the model prediction and calculate various metrics

        :param model: trained model exposing ``predict`` and ``evaluate``
        :param X: a numpy array, data feed to the model (the whole array is used
            as a single batch)
        :param y: a numpy array, ground true label
        :param just_d60: when True, skip the metrics and only return the counts
        :param threshold: predicted values strictly greater than this are labelled 1
            for the F-score and G-mean (default 0.05, the previously hard-coded value)
        :return: if ``just_d60`` is True, a tuple ``(sum of y, sum of predictions,
            predictions)``; otherwise a dict keyed by the names in ``METRIC_NAMES``
        """
        n_samples = X.shape[0]
        if just_d60:
            prob_pos = model.predict(X, batch_size=n_samples, verbose=0)
            return np.sum(y), np.sum(prob_pos), prob_pos

        # evaluate
        loss = model.evaluate(X, y, batch_size=n_samples, verbose=0)
        # predict
        prob_pos = model.predict(X, batch_size=n_samples, verbose=0)  # TODO: batch_size
        # Vectorized thresholding; unlike np.vectorize this also works on empty input.
        y_pred = (np.asarray(prob_pos) > threshold).astype(int).ravel()

        # calculate performance metrics
        fpr, tpr, _ = roc_curve(y, prob_pos)
        roc_auc = auc(fpr, tpr)
        pr_auc = average_precision_score(y, prob_pos)
        f_score = f1_score(y, y_pred)
        g_mean = geometric_mean_score(y, y_pred)
        exp_pos = np.sum(prob_pos)
        actual_pos = np.sum(y)

        # Key order must match METRIC_NAMES: the values are written positionally.
        return {
            'Loss': loss,
            'PR_AUC': pr_auc,
            'ROC_AUC': roc_auc,
            'F_score': f_score,
            'G_mean': g_mean,
            'Expeted_#_D60': exp_pos,
            'Actual_#_D60': actual_pos,
            'Diff_D60': abs(actual_pos - exp_pos),
            'Ratio_D60': actual_pos / exp_pos,
        }

    def _write_results_header(self):
        """
        Write the header of the output file (overwrites any existing file at round_fp)

        Columns: ``Index``, every metric suffixed ``_train``, every metric suffixed
        ``_val``, ``Stopped_Epochs``, then the parameter names.
        """
        header = ['Index']
        header += [f'{name}_train' for name in self.METRIC_NAMES]
        header += [f'{name}_val' for name in self.METRIC_NAMES]
        header.append('Stopped_Epochs')
        header += list(self.params_name)
        with open(self.round_fp, 'w', newline='') as f:
            res_writer = csv.writer(f, dialect='excel', delimiter=',')
            res_writer.writerow(header)

    @staticmethod
    def _collect_results(results):
        """Return the values of the mapping ``results`` as a list, in insertion order."""
        return list(results.values())

    def _output_setup(self):
        """
        Create the directory for this search round and set round_dir / round_fp

        Rounds are directories named ``<dataset_name>_<n>`` under ``result_dir``.
        The new round gets the highest existing ``n`` plus one (1 if none exist).
        Entries that do not match that exact pattern (other datasets sharing the
        prefix, names without a numeric suffix) are ignored.

        :raises FileExistsError: if the new round directory already exists, e.g.
            because another process created it concurrently
        """
        # The results root may not exist on a first run.
        os.makedirs(self.result_dir, exist_ok=True)

        round_pattern = re.compile(rf'^{re.escape(self.dataset_name)}_(\d+)$')
        matches = (round_pattern.match(entry) for entry in os.listdir(self.result_dir))
        existing_rounds = [int(m.group(1)) for m in matches if m]
        round_no = max(existing_rounds, default=0) + 1

        round_name = f'{self.dataset_name}_{round_no}'
        self.round_dir = os.path.join(self.result_dir, round_name)
        os.mkdir(self.round_dir)
        self.round_fp = os.path.join(self.round_dir, f'{round_name}.csv')