-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 920c9f5
Showing
8 changed files
with
588 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
|
||
|
||
|
||
|
||
import numpy as np | ||
#Regression Tree | ||
#区域划分问题 还是要递归的放法 | ||
|
||
|
||
data=[[1,4.5],[2,4.75],[3,4.91],[4,5.34],[5,5.8],[6,7.05],[7,7.90],[8,8.23],[9,8.70],[10,9.0]] #INDEX -1 为Y值 | ||
#Y为连续变量 | ||
|
||
|
||
|
||
def Split_find(dataSet):
    """Find the best (split point, dimension) for a regression-tree split.

    Scans every feature dimension and every sample value as a candidate
    threshold, partitions the data into left (<=) / right (>) regions, and
    scores each candidate by the total squared error around the region means
    (the CART least-squares criterion).

    Parameters
    ----------
    dataSet : list[list[float]]
        Rows of [x1, ..., xk, y]; the last element of each row is the target.

    Returns
    -------
    (point, j, loss) : tuple
        point -- row index whose value in dimension ``j`` is the best threshold,
        j     -- index of the best splitting dimension,
        loss  -- the minimal total squared error over both regions.
    """
    min_outside = {}
    for dim in range(len(dataSet[0]) - 1):  # candidate splitting dimension
        min_inside = {}
        for split in range(len(dataSet)):  # threshold = dataSet[split][dim]
            thresh = dataSet[split][dim]
            right_y = [row[-1] for row in dataSet if row[dim] > thresh]
            left_y = [row[-1] for row in dataSet if row[dim] <= thresh]
            loss = 0.0
            # An empty region contributes zero error.  Guarding here avoids
            # the nan that np.mean([]) produced in the original when the
            # threshold was the dimension's maximum (empty right region),
            # which corrupted the min() comparisons below.
            for region in (right_y, left_y):
                if region:
                    c = np.mean(region)
                    loss += sum((y - c) ** 2 for y in region)
            min_inside[split] = loss
        best_split = min(min_inside, key=min_inside.get)
        min_outside[(best_split, dim)] = min_inside[best_split]
    point, j = min(min_outside, key=min_outside.get)
    return point, j, min_outside[(point, j)]
|
||
|
||
print(Split_find(data)) | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
class Node:
    """A regression-tree node: a split point plus its two child regions."""

    def __init__(self, split_point, right_region, left_region):
        # Threshold/sample at which this node splits, and the samples that
        # fall on the greater-than (right) and less-or-equal (left) sides.
        self.split_point, self.right_region, self.left_region = (
            split_point,
            right_region,
            left_region,
        )
|
||
class regression_tree:
    # NOTE(review): unfinished class -- create_tree computes a single split
    # and then stops; no Node objects are built and nothing is returned yet.
    def __init__(self,root):
        # Root node of the tree (expected to be a Node once built).
        self.root=root

    def create_tree(self,dataset): # stopping condition: check whether a further split still reduces the error
        # NOTE(review): calls Split_find on the module-level `data` instead of
        # the `dataset` parameter -- presumably a bug to fix when this method
        # is completed.
        split_point,split_dim,loss_f=Split_find(dataSet=data)
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
#Decision Tree | ||
|
||
#entropy越大 不确定性越高。 | ||
import math
from collections import Counter

import numpy as np
import pandas as pd
data_f=pd.read_excel('E:/Project_YOLO/Stattitiscs_learning_method/Decision Tree/data.xlsx') | ||
|
||
|
||
#信息增益: 特征A对训练数据集D的信息增益G(D,A)=H(D)-H(D|A) | ||
|
||
#信息熵 | ||
|
||
print(data_f) | ||
|
||
def entropy(X, data_):
    """Shannon entropy H(X) of column `X` in `data_`, in bits.

    Parameters
    ----------
    X : hashable
        Column label to read from `data_`.
    data_ : mapping of label -> iterable (e.g. a pandas DataFrame)
        Container whose ``data_[X]`` yields the observed values.

    Returns
    -------
    float
        H(X) = -sum_i p_i * log2(p_i) over the empirical distribution.
    """
    values = list(data_[X])
    total = len(values)
    # Counter replaces the original O(n^2) list.index() tallying.  Only
    # observed values appear in it, so every probability is > 0 and the
    # original's special case for p == 0 was dead code.
    counts = Counter(values)
    return -sum((n / total) * math.log2(n / total) for n in counts.values())
|
||
|
||
def condi_entropy(condition, X, data):
    """Conditional entropy H(X | condition) over the rows of `data`, in bits.

    Weights the entropy of `X` inside each subgroup (one subgroup per
    distinct value of the `condition` column) by that subgroup's empirical
    probability:  H(X|A) = sum_v P(A = v) * H(X | A = v).

    Parameters
    ----------
    condition : hashable -- column whose values define the partition.
    X : hashable -- column whose entropy is measured within each part.
    data : pandas.DataFrame -- rows to evaluate.

    Returns
    -------
    float -- the conditional entropy H(X | condition).
    """
    column = data[condition]
    total = len(column)
    # value -> occurrence count; replaces the original O(n^2) list.index()
    # tallying and its always-true `if i == l[l.index(i)]` guard.
    weights = Counter(column)
    return sum(
        (count / total) * entropy(X, data[column == value])
        for value, count in weights.items()
    )
|
||
|
||
|
||
|
||
# Information gain of each feature w.r.t. the loan-decision label column:
# G(D, A) = H(D) - H(D | A).
g1=entropy('是否发放贷款',data_f)-condi_entropy('年龄','是否发放贷款',data_f)
print(g1)
g2=entropy('是否发放贷款',data_f)-condi_entropy('有工作','是否发放贷款',data_f)
print(g2)
# (original note: "edited successfully!!")

# Candidate feature columns: everything between the first (id) column and
# the final label column.
features=data_f.columns[1:-1]
labels=[]
for label in features:
    labels.append(label)
#ID3算法 (C4.5算法则采用信息增益比来选择特征) | ||
def feature_get(dataset, labels, target='是否发放贷款'):
    """Pick the feature with the highest information gain (ID3 criterion).

    Parameters
    ----------
    dataset : pandas.DataFrame -- training rows.
    labels : sequence of column labels -- candidate features to score.
    target : column label -- the class column.  Defaults to the loan-decision
        column so existing two-argument callers keep working, but the
        function is no longer hard-wired to that dataset.

    Returns
    -------
    (optimum_label, index, gain) : the best feature, its position in
        `labels`, and its information gain G(D, A) = H(D) - H(D | A).
    """
    base = entropy(target, dataset)  # H(D) is loop-invariant; compute once
    g_d_a = [base - condi_entropy(label, target, dataset) for label in labels]
    index = g_d_a.index(max(g_d_a))
    return labels[index], index, g_d_a[index]
|
||
|
||
def splitdata(dataSet, bestfeature):
    """Partition `dataSet` by the distinct values of column `bestfeature`.

    Returns a dict mapping each distinct value (in first-appearance order)
    to the sub-DataFrame of rows holding that value -- the {Di} subsets.
    """
    # dict.fromkeys keeps first-seen order, matching the original's manual
    # "append if unseen" scan.
    distinct = dict.fromkeys(dataSet[bestfeature])
    return {
        value: dataSet[dataSet[bestfeature] == value]
        for value in distinct
    }
|
||
class Node:
    """A decision-tree node: a feature (or class value at a leaf) and children."""

    def __init__(self, feature, subtree):
        # Feature chosen at this node, or the class value when this is a leaf.
        # Subtree is the list of child Nodes, or None for a leaf.
        self.feature, self.subtree = feature, subtree
|
||
|
||
|
||
|
||
class DS_Tree:
    """ID3 decision tree over a DataFrame whose class column is '是否发放贷款'."""

    def __init__(self, data):
        # Root Node of the tree; filled in by createtree().
        # (The original also computed an unused local `k = len(data.columns)`.)
        self.root = None

    def createtree(self, dataSet, labels, thresh=0):
        """Recursively build the ID3 tree and return its root Node.

        Parameters
        ----------
        dataSet : pandas.DataFrame -- rows to classify at this subtree.
        labels : list of column labels -- features still available.
        thresh : float -- minimum information gain required to keep splitting.

        Returns
        -------
        Node -- an internal node holding the chosen feature, or a leaf
            (subtree=None) holding a class value.
        """
        classes = list(dataSet['是否发放贷款'])
        # Stop 1: every row already belongs to one class -> leaf with it.
        if len(set(classes)) == 1:
            return Node(classes[0], subtree=None)
        # Stop 2: no features left -> leaf labelled with the majority class.
        if len(labels) == 0:
            return Node(max(classes, key=classes.count), subtree=None)
        bestfeature, i, score = feature_get(dataSet, labels)
        # Stop 3: best achievable gain is below the threshold -> majority leaf.
        if score <= thresh:
            return Node(max(classes, key=classes.count), subtree=None)
        partitions = splitdata(dataSet, bestfeature)
        # Recurse with a copy that excludes the used feature instead of
        # mutating the caller's list: the original `labels.remove(...)`
        # leaked removals across sibling branches and back to the caller.
        remaining = [label for label in labels if label != bestfeature]
        subnode = [
            self.createtree(subset, remaining)
            for subset in partitions.values()
        ]
        return Node(bestfeature, subnode)
|
||
|
||
def preorder(root):
    """Print the tree rooted at `root` in pre-order: node first, then children.

    Parameters
    ----------
    root : Node-like -- any object with `feature` and `subtree` attributes,
        where `subtree` is a list of children or None for a leaf.
    """
    print(root.feature)
    # `is not None` rather than `!= None`: identity, not equality, is the
    # idiomatic test for the leaf sentinel.
    if root.subtree is not None:
        for node in root.subtree:
            preorder(node)
|
||
|
||
# Build the decision tree from the Excel-loaded frame and print it pre-order.
TREE=DS_Tree(data_f)
TREE.root=TREE.createtree(dataSet=data_f,labels=labels)

preorder(TREE.root)

# (original note: "finally got it working -- tears of joy")
|
||
|
||
|
||
|
||
|
||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
#朴素贝叶斯 | ||
|
||
import numpy as np | ||
|
||
# Feature 1: integers in 0..5.
feature_1 = np.array([0, 1, 2, 3, 4, 5, 2, 3, 1, 4,
                      0, 5, 3, 2, 1, 4, 5, 0, 2, 1,
                      3, 4, 5, 1, 0, 4, 3, 2, 5, 1,
                      0, 2, 3, 4, 5, 0, 1, 2, 3, 4,
                      5, 3, 1, 4, 2, 5, 0, 1, 4, 3])

# Feature 2: one of 'S', 'M', 'L'.
feature_2 = np.array(['S', 'M', 'L', 'S', 'M', 'L', 'M', 'L', 'S', 'M',
                      'L', 'S', 'S', 'M', 'L', 'M', 'L', 'S', 'M', 'L',
                      'S', 'M', 'L', 'S', 'M', 'L', 'S', 'M', 'L', 'S',
                      'M', 'L', 'S', 'M', 'L', 'S', 'M', 'L', 'S', 'M',
                      'L', 'S', 'M', 'L', 'S', 'M', 'L', 'S', 'M', 'L'])

# Stack the two features column-wise into the design matrix.
data_x = np.column_stack((feature_1, feature_2))
# Class labels in {+1, -1}, aligned row-for-row with data_x.
data_y = np.array([1, -1, -1, -1, -1, -1, 1, -1, 1, -1,
                   1, -1, -1, -1, 1, -1, 1, -1, 1, -1,
                   1, -1, 1, -1, 1, -1, 1, -1, 1, -1,
                   1, -1, 1, -1, 1, -1, 1, -1, 1, -1,
                   1, -1, 1, -1, 1, -1, 1, -1, 1, -1])


# Goal: argmax over y of P(Y = y | X), i.e. max(P(Y=1|X), P(Y=-1|X)).


# Train / test split: first 40 rows train, last 10 test.
train_data=data_x[:40]
test_data=data_x[40:50]
train_labels=data_y[:40]
test_label=data_y[40:50]
|
||
|
||
#计算先验概率以及条件概率 | ||
|
||
|
||
|
||
def get_one(data):
    """Return the distinct values of `data` in first-appearance order.

    Parameters
    ----------
    data : iterable of hashables (list, ndarray, ...).

    Returns
    -------
    list -- each value once, ordered by first occurrence.
    """
    # dict preserves insertion order (3.7+) and gives O(1) membership tests,
    # replacing the original O(n^2) `not in list` scan (which also shadowed
    # the builtin name `list`).
    return list(dict.fromkeys(data))
# Distinct class labels and distinct values of each feature dimension --
# the supports over which priors and likelihoods will be estimated.
label=get_one(train_labels)
f1=get_one(feature_1)
f2=get_one(feature_2)
print(label)
print(f1)
print(f2)
|
||
|
||
|
||
|
||
|
||
|
||
#计算P(yi) | ||
def P_y(label, data):
    """Count the occurrences of each class value in `data`.

    Parameters
    ----------
    label : list -- the distinct class values, in a fixed order.
    data : iterable -- observed class values; every element must occur in
        `label` (otherwise `label.index` raises ValueError, as before).

    Returns
    -------
    numpy.ndarray -- pos[i] == number of times label[i] occurs in `data`.
        Note these are raw counts, not probabilities; divide by len(data)
        to obtain the priors P(y_i).
    """
    pos = np.zeros(len(label))
    # The original guard `if l == label[label.index(l)]` is always true
    # whenever `l` is present, so the dead check is dropped.
    for l in data:
        pos[label.index(l)] += 1
    return pos
|
||
#计算条件概率 | ||
def P_x_y(feature,label,data_feature): # feature: all values x takes in one dimension
    # NOTE(review): unfinished stub (the author's note below says it was
    # deferred).  `range(data_feature)` would raise TypeError for a
    # list/array argument -- presumably range(len(data_feature)) was meant;
    # confirm when implementing the conditional probability P(x | y).
    pos=np.zeros(len(data_feature))
    for i in range(data_feature):
        pass
|
||
|
||
#暂时略过,今天脑子有点瓦特了。 | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
Oops, something went wrong.