gbdt_model.py
forked from RRdmlearning/Machine-Learning-From-Scratch
from __future__ import division, print_function
import numpy as np
import progressbar
# Import helper functions
from utils import train_test_split, standardize, to_categorical
from utils import mean_squared_error, accuracy_score
from utils.loss_functions import SquareLoss, CrossEntropy, SotfMaxLoss
from decision_tree.decision_tree_model import RegressionTree
from utils.misc import bar_widgets
class GBDT(object):
    """Superclass of GradientBoostingClassifier and GradientBoostingRegressor.
    Uses a collection of regression trees that are trained to predict the
    gradient of the loss function with respect to the current predictions
    (a reference sketch of the loss interface follows this class).

    Parameters:
    -----------
    n_estimators: int
        The number of trees that are used.
    learning_rate: float
        The step length taken when following the negative gradient during
        training.
    min_samples_split: int
        The minimum number of samples needed to make a split when building
        a tree (nodes with fewer samples are not split further).
    min_impurity: float
        The minimum impurity required to split a tree node further.
    max_depth: int
        The maximum depth of a tree (deeper nodes are not split).
    regression: boolean
        True if we are doing regression, False for classification.
    """
def __init__(self, n_estimators, learning_rate, min_samples_split,
min_impurity, max_depth, regression):
self.n_estimators = n_estimators
self.learning_rate = learning_rate
self.min_samples_split = min_samples_split
self.min_impurity = min_impurity
self.max_depth = max_depth
self.regression = regression
        # Progress bar
self.bar = progressbar.ProgressBar(widgets=bar_widgets)
        # Square loss for regression; a softmax-based loss for classification
        self.loss = SquareLoss()
        if not self.regression:
            self.loss = SotfMaxLoss()
            # Classification also uses regression trees: the trees fit the
            # residuals, which are used to learn class probabilities
self.trees = []
for i in range(self.n_estimators):
self.trees.append(RegressionTree(min_samples_split=self.min_samples_split,
min_impurity=self.min_impurity,
max_depth=self.max_depth))
    def fit(self, X, y):
        # Fit the first tree directly to the targets
        self.trees[0].fit(X, y)
        y_pred = self.trees[0].predict(X)
        for i in self.bar(range(1, self.n_estimators)):
            # Each subsequent tree is fit to the gradient of the loss
            # with respect to the current predictions
            gradient = self.loss.gradient(y, y_pred)
            self.trees[i].fit(X, gradient)
            y_pred -= np.multiply(self.learning_rate, self.trees[i].predict(X))
def predict(self, X):
y_pred = self.trees[0].predict(X)
for i in range(1, self.n_estimators):
y_pred -= np.multiply(self.learning_rate, self.trees[i].predict(X))
if not self.regression:
# Turn into probability distribution
y_pred = np.exp(y_pred) / np.expand_dims(np.sum(np.exp(y_pred), axis=1), axis=1)
# Set label to the value that maximizes probability
y_pred = np.argmax(y_pred, axis=1)
return y_pred
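
# A minimal reference sketch of the loss interface that GBDT relies on
# (only .gradient() is called in fit). This mirrors what
# utils.loss_functions is assumed to provide; the repository's actual
# SquareLoss may differ in details.
class _ReferenceSquareLoss(object):
    def gradient(self, y, y_pred):
        # Derivative of 0.5 * (y - y_pred)**2 with respect to y_pred.
        # Fitting a tree to this gradient and then updating
        # y_pred -= learning_rate * tree.predict(X) moves the predictions
        # toward the residuals y - y_pred.
        return -(y - y_pred)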
class GBDTRegressor(GBDT):
def __init__(self, n_estimators=200, learning_rate=0.5, min_samples_split=2,
min_var_red=1e-7, max_depth=4, debug=False):
super(GBDTRegressor, self).__init__(n_estimators=n_estimators,
learning_rate=learning_rate,
min_samples_split=min_samples_split,
min_impurity=min_var_red,
max_depth=max_depth,
regression=True)
class GBDTClassifier(GBDT):
def __init__(self, n_estimators=200, learning_rate=.5, min_samples_split=2,
min_info_gain=1e-7, max_depth=2, debug=False):
super(GBDTClassifier, self).__init__(n_estimators=n_estimators,
learning_rate=learning_rate,
min_samples_split=min_samples_split,
min_impurity=min_info_gain,
max_depth=max_depth,
regression=False)
def fit(self, X, y):
y = to_categorical(y)
super(GBDTClassifier, self).fit(X, y)
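
# A minimal usage sketch, not part of the original file. It assumes the
# repository's utils.train_test_split follows the sklearn-style signature
# train_test_split(X, y, test_size=...) and that mean_squared_error /
# accuracy_score take (y_true, y_pred). Data and hyperparameters are
# illustrative choices only.
if __name__ == "__main__":
    np.random.seed(0)

    # Regression on a noisy sine wave
    X = np.random.uniform(-3, 3, (200, 1))
    y = np.sin(X[:, 0]) + 0.1 * np.random.randn(200)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    reg = GBDTRegressor(n_estimators=50, learning_rate=0.5)
    reg.fit(X_train, y_train)
    print("Regression MSE:", mean_squared_error(y_test, reg.predict(X_test)))

    # Binary classification on two Gaussian blobs
    X0 = np.random.randn(100, 2) + 2.0
    X1 = np.random.randn(100, 2) - 2.0
    Xc = np.vstack([X0, X1])
    yc = np.array([0] * 100 + [1] * 100)
    Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc, yc, test_size=0.3)
    clf = GBDTClassifier(n_estimators=50, learning_rate=0.5)
    clf.fit(Xc_train, yc_train)
    print("Classification accuracy:", accuracy_score(yc_test, clf.predict(Xc_test)))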