-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmodel_eval.py
295 lines (266 loc) · 9.07 KB
/
model_eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 18 15:05:45 2017
@author: Hank Kuang
@title: 模型评价(classifer model)
"""
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn.metrics import auc
import pandas as pd
from sklearn.utils.multiclass import type_of_target
import numpy as np
import itertools
from sklearn.metrics import confusion_matrix
# from sklearn.metrics import precision_recall_curve
# from sklearn.metrics import average_precision_score
"""
ROC曲线绘制脚本
"""
def plot_roc_curve(prob_y, y):
"""
plot roc curve
----------------------------------
Params
prob_y: prediction of model
y: real data(testing sets)
----------------------------------
plt object
"""
fpr, tpr, _ = roc_curve(y, prob_y)
c_stats = auc(fpr, tpr)
plt.plot([0, 1], [0, 1], 'r--')
plt.plot(fpr, tpr, label="ROC curve")
s = "AUC = %.4f" % c_stats
plt.text(0.6, 0.2, s, bbox=dict(facecolor='red', alpha=0.5))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve') # ROC 曲线
plt.legend(loc='best')
plt.show()
"""
KS曲线以及KS统计表
"""
def ks_stats(prob_y, y, k=20):
"""
plot K-S curve and output ks table
----------------------------------
Params
prob_y: prediction of model
y: real data(testing sets)
k: Section number
----------------------------------
ks_results: pandas dataframe
ks_ax: plt object, k-s curcve
"""
# 检查y是否是二元变量
y_type = type_of_target(y)
if y_type not in ['binary']:
raise ValueError('y必须是二元变量')
# 合并y与y_hat,并按prob_y对数据进行降序排列
datasets = pd.concat([y, pd.Series(prob_y, name='prob_y', index=y.index)], axis=1)
datasets.columns = ["y", "prob_y"]
datasets = datasets.sort_values(by="prob_y", axis=0, ascending=True)
# 计算正负案例数和行数,以及等分子集的行数n
P = sum(y)
Nrows = datasets.shape[0]
N = Nrows - P
n = float(Nrows) / k
# 重建索引,并将数据划分为子集,并计算每个子集的正例数和负例数
datasets.index = np.arange(Nrows)
ks_df = pd.DataFrame()
rlt = {
"tile": str(0),
"Ptot": 0,
"Ntot": 0}
ks_df = ks_df.append(pd.Series(rlt), ignore_index=True)
for i in range(k):
lo = i * n
up = (i + 1) * n
tile = datasets.ix[lo:(up - 1), :]
Ptot = sum(tile['y'])
Ntot = n - Ptot
rlt = {
"tile": str(i + 1),
"Ptot": Ptot,
"Ntot": Ntot}
ks_df = ks_df.append(pd.Series(rlt), ignore_index=True)
# 计算各子集中的正负例比例,以及累积比例
ks_df['PerP'] = ks_df['Ptot'] / P
ks_df['PerN'] = ks_df['Ntot'] / N
ks_df['PerP_cum'] = ks_df['PerP'].cumsum()
ks_df['PerN_cum'] = ks_df['PerN'].cumsum()
# 计算ks曲线以及ks值
ks_df['ks'] = ks_df['PerN_cum'] - ks_df['PerP_cum']
ks_value = ks_df['ks'].max()
s = "KS value is %.4f" % ks_value
# 整理得出ks统计表
ks_results = ks_df.ix[1:, :]
ks_results = ks_results[['tile', 'Ntot', 'Ptot', 'PerN', 'PerP', 'PerN_cum', 'PerP_cum', 'ks']]
ks_results.columns = ['子集', '负例数', '正例数', '负例比例', '正例比例', '累积负例比例', '累积正例比例', 'ks']
# 获取ks值所在的数据点
ks_point = ks_results.ix[:, ['子集', 'ks']]
ks_point = ks_point.ix[ks_point['ks'] == ks_point['ks'].max(), :]
# 绘制KS曲线
ks_ax = _ks_plot(ks_df=ks_df, ks_label='ks', good_label='PerN_cum', bad_label='PerP_cum',
k=k, point=ks_point, s=s)
return ks_results, ks_ax
def _ks_plot(ks_df, ks_label, good_label, bad_label, k, point, s):
"""
middle function for ks_stats, plot k-s curve
"""
plt.plot(ks_df['tile'], ks_df[ks_label], "r-.", label="ks_curve", lw=1.2)
plt.plot(ks_df['tile'], ks_df[good_label], "g-.", label="good", lw=1.2)
plt.plot(ks_df['tile'], ks_df[bad_label], "m-.", label="bad", lw=1.2)
# plt.plot(point[0], point[1], 'o', markerfacecolor="red",
# markeredgecolor='k', markersize=6)
plt.legend(loc=0)
plt.plot([0, k], [0, 1], linestyle='--', lw=0.8, color='k', label='Luck')
plt.xlabel("decilis") # 等份子集
plt.title(s) # KS曲线图
plt.show()
"""
混淆矩阵
"""
def plot_confusion_matrix(y,
pred,
labels,
normalize=False,
cmap=plt.cm.Blues):
"""
plot confusion_matrix with colorbar
------------------------------------------
Params
y:real data labels
pred: predict results
labels: labels
normalize: bool, True means trans results to percent
cmap: color index
------------------------------------------
Return
plt object
"""
cm = confusion_matrix(y, pred, labels=labels)
plt.imshow(cm, interpolation='nearest', cmap=cmap) # 在指定的轴上展示图像
plt.colorbar() # 增加色柱
tick_marks = np.arange(len(labels))
plt.xticks(tick_marks, labels, rotation=45) # 设置坐标轴标签
plt.yticks(tick_marks, labels)
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# print("标准化混淆矩阵")
else:
# print('混淆矩阵')
pass
# print(cm)
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, cm[i, j], fontsize=12,
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True')
plt.xlabel('Predicted')
plt.title("confusion matrix")
plt.show()
"""
提升图和洛伦茨曲线
"""
def lift_lorenz(prob_y, y, k=10):
"""
plot lift_lorenz curve
----------------------------------
Params
prob_y: prediction of model
y: real data(testing sets)
k: Section number
----------------------------------
lift_ax: lift chart
lorenz_ax: lorenz curve
"""
# 检查y是否是二元变量
y_type = type_of_target(y)
if y_type not in ['binary']:
raise ValueError('y必须是二元变量')
# 合并y与y_hat,并按prob_y对数据进行降序排列
datasets = pd.concat([y, pd.Series(prob_y, name='prob_y', index=y.index)], axis=1)
datasets.columns = ["y", "prob_y"]
datasets = datasets.sort_values(by="prob_y", axis=0, ascending=False)
# 计算正案例数和行数,以及等分子集的行数n
P = sum(y)
Nrows = datasets.shape[0]
n = float(Nrows) / k
# 重建索引,并将数据划分为子集,并计算每个子集的正例数和负例数
datasets.index = np.arange(Nrows)
lift_df = pd.DataFrame()
rlt = {
"tile": str(0),
"Ptot": 0,
}
lift_df = lift_df.append(pd.Series(rlt), ignore_index=True)
for i in range(k):
lo = i * n
up = (i + 1) * n
tile = datasets.ix[lo:(up - 1), :]
Ptot = sum(tile['y'])
rlt = {
"tile": str(i + 1),
"Ptot": Ptot,
}
lift_df = lift_df.append(pd.Series(rlt), ignore_index=True)
# 计算正例比例&累积正例比例
lift_df['PerP'] = lift_df['Ptot'] / P
lift_df['PerP_cum'] = lift_df['PerP'].cumsum()
# 计算随机正例数、正例率以及累积随机正例率
lift_df['randP'] = float(P) / k
lift_df['PerRandP'] = lift_df['randP'] / P
lift_df.ix[0, :] = 0
lift_df['PerRandP_cum'] = lift_df['PerRandP'].cumsum()
lift_ax = lift_Chart(lift_df, k)
lorenz_ax = lorenz_cruve(lift_df)
return lift_ax, lorenz_ax
def lift_Chart(df, k):
"""
middle function for lift_lorenz, plot lift Chart
"""
# 绘图变量
PerP = df['PerP'][1:]
PerRandP = df['PerRandP'][1:]
# 绘图参数
fig, ax = plt.subplots()
index = np.arange(k + 1)[1:]
bar_width = 0.35
opacity = 0.4
error_config = {'ecolor': '0.3'}
rects1 = plt.bar(index, PerP, bar_width,
alpha=opacity,
color='b',
error_kw=error_config,
label='Per_p') # 正例比例
rects2 = plt.bar(index + bar_width, PerRandP, bar_width,
alpha=opacity,
color='r',
error_kw=error_config,
label='random_P') # 随机比例
plt.xlabel('Group')
plt.ylabel('Percent')
plt.title('lift_Chart')
plt.xticks(index + bar_width / 2, tuple(index))
plt.legend()
plt.tight_layout()
plt.show()
def lorenz_cruve(df):
"""
middle function for lift_lorenz, plot lorenz cruve
"""
# 准备绘图所需变量
PerP_cum = df['PerP_cum']
PerRandP_cum = df['PerRandP_cum']
decilies = df['tile']
# 绘制洛伦茨曲线
plt.plot(decilies, PerP_cum, 'm-^', label='lorenz_cruve') # lorenz曲线
plt.plot(decilies, PerRandP_cum, 'k-.', label='random') # 随机
plt.legend()
plt.xlabel("decilis") # 等份子集
plt.title("lorenz_cruve", fontsize=10) # 洛伦茨曲线
plt.show()