Commit 920c9f5: first commit
BlackTea-c committed Nov 26, 2023
Showing 8 changed files with 588 additions and 0 deletions.
87 changes: 87 additions & 0 deletions Decision Tree/CART.py
@@ -0,0 +1,87 @@




import numpy as np
#Regression tree (CART)
#The region-splitting problem is again handled recursively


data=[[1,4.5],[2,4.75],[3,4.91],[4,5.34],[5,5.8],[6,7.05],[7,7.90],[8,8.23],[9,8.70],[10,9.0]] #index -1 is the y value
#y is a continuous variable



def Split_find(dataSet):

    min_outside={}
    for split_pos in range(len(dataSet[0])-1): #choose the splitting variable j

        min_inside={}

        for split in range(len(dataSet)): #each row is [x1,x2,...,y]
            right_region=[item for item in dataSet if item[split_pos]>dataSet[split][split_pos]]
            left_region =[item for item in dataSet if item[split_pos]<=dataSet[split][split_pos]]
            if not right_region: #splitting at the maximum leaves one side empty; skip it
                continue
            right_region_y=[item[-1] for item in right_region]
            left_region_y=[item[-1] for item in left_region]
            c_right=np.mean(right_region_y)
            c_left=np.mean(left_region_y)
            #squared-error loss of the two regions around their means
            sum1=sum((y-c_right)**2 for y in right_region_y)
            sum2=sum((y-c_left)**2 for y in left_region_y)
            min_inside[split]=sum1+sum2

        best=min(min_inside,key=min_inside.get) #best split sample for this variable
        min_outside[(best,split_pos)]=min_inside[best]

    point,j=min(min_outside,key=min_outside.get)
    return point,j,min_outside[(point,j)]


print(Split_find(data))
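#For this toy data the best cut falls after x=5 (left mean 5.06, right mean
#8.18), so the call above should print roughly (4, 0, 3.36): the row index of
#the split sample, the split dimension, and the remaining squared-error loss.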









class Node:
    def __init__(self,split_point,right_region,left_region):

        self.split_point=split_point
        self.right_region=right_region
        self.left_region=left_region
        self.right_child=None #subtrees are attached by create_tree
        self.left_child=None

class regression_tree:
    def __init__(self,root=None):
        self.root=root


    def create_tree(self,dataset,eps=1e-6): #stopping condition: does a further split still lower the error?
        if len(dataset)<2:
            return None
        split_point,split_dim,loss_f=Split_find(dataset)
        ys=[item[-1] for item in dataset]
        if sum((y-np.mean(ys))**2 for y in ys)-loss_f<=eps:
            return None #the best split no longer reduces the error: leaf
        split_value=dataset[split_point][split_dim]
        node=Node(split_value,
                  [item for item in dataset if item[split_dim]>split_value],
                  [item for item in dataset if item[split_dim]<=split_value])
        node.right_child=self.create_tree(node.right_region,eps)
        node.left_child=self.create_tree(node.left_region,eps)
        return node
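
#A minimal usage sketch: grow the tree on the toy data, then answer a query
#by walking to a leaf and returning the mean y of the region it covers
#(assumes the single-feature data above, so every split is on x)
tree=regression_tree()
tree.root=tree.create_tree(data)

def tree_predict(node,x):
    if x<=node.split_point:
        region,child=node.left_region,node.left_child
    else:
        region,child=node.right_region,node.right_child
    if child is None: #reached a leaf: predict the region's mean y
        return np.mean([item[-1] for item in region])
    return tree_predict(child,x)

print(tree_predict(tree.root,6.5)) #a query between x=6 and x=7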















158 changes: 158 additions & 0 deletions Decision Tree/Dt.py
@@ -0,0 +1,158 @@
#Decision Tree

#The larger the entropy, the higher the uncertainty.
import numpy as np
import pandas as pd
import math
data_f=pd.read_excel('E:/Project_YOLO/Stattitiscs_learning_method/Decision Tree/data.xlsx')


#Information gain: the gain of feature A on training set D is G(D,A)=H(D)-H(D|A)

#Information entropy

print(data_f)

def entropy(X,data_): #X: the random variable (a column name)
    data=data_[X]
    l=[]
    for i in data:
        if i not in l:
            l.append(i) #collect the distinct values
    b=np.zeros(len(l))
    for i in data:
        b[l.index(i)]+=1 #count each value
    c=b/len(data) #empirical probabilities
    d=[]
    for i in c:
        if i==0:
            d.append(0) #convention: 0*log0 = 0
        else:
            d.append(math.log(i,2)) #log base 2
    d=np.array(d)

    return -np.sum(c*d)
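
#Quick sanity check: a fair coin has entropy exactly 1 bit
coin=pd.DataFrame({'toss':['H','T','H','T']})
print(entropy('toss',coin)) #prints 1.0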


def condi_entropy(condition,X,data): #H(X|condition)
    l=[]
    for i in data[condition]:
        if i not in l:
            l.append(i)
    b = np.zeros(len(l))
    for i in data[condition]:
        b[l.index(i)] += 1
    c = b / len(data[condition]) #weights |Di|/|D| of each subset

    d=[]
    for fe in l:
        d.append(entropy(X,data[data[condition]==fe])) #H(Di) of each subset

    d=np.array(d)
    return sum(c*d)




g1=entropy('是否发放贷款',data_f)-condi_entropy('年龄','是否发放贷款',data_f)
print(g1)
g2=entropy('是否发放贷款',data_f)-condi_entropy('有工作','是否发放贷款',data_f)
print(g2)
#....it works!!

features=data_f.columns[1:-1]
labels=[]
for label in features:
    labels.append(label)
#ID3 algorithm (C4.5 uses the information gain ratio to choose features instead)
def feature_get(dataset, labels): # takes the dataset and the candidate feature labels
    g_d_a = []
    for label in labels:
        g_d_a.append(entropy('是否发放贷款', dataset) - condi_entropy(label, '是否发放贷款', dataset))
    index = g_d_a.index(max(g_d_a))
    optimum_label = labels[index]
    return optimum_label, index, g_d_a[index]


def splitdata(dataSet, bestfeature): # split the dataset by optimum_label
    bestfeature_value = []
    splitset = {}
    for value in dataSet[bestfeature]:
        if value not in bestfeature_value:
            bestfeature_value.append(value)
    for condition in bestfeature_value:
        splitset[condition] = (dataSet[dataSet[bestfeature] == condition]) # gives {Di}

    return splitset

class Node:
    def __init__(self,feature,subtree):
        self.feature=feature # feature holds a feature name at internal nodes, or a class value at leaves
        self.subtree=subtree




class DS_Tree:
    def __init__(self,data):
        self.root=None
        k=len(data.columns)



    def createtree(self,dataSet, labels, thresh=0): # threshold defaults to 0
        # labels is the feature set used while growing downward; each time a
        # feature is used it has to be dropped before recursing

        # first, check whether all instances belong to a single class:
        ifallinclass=[item for item in dataSet['是否发放贷款']]
        if len(set(ifallinclass))==1:
            return Node(ifallinclass[0],subtree=None)
        # only one class present, so return that class directly

        # if no features are left, use the majority class as the node
        if len(labels)==0:
            return Node(max(ifallinclass, key=ifallinclass.count),subtree=None)

        bestfeature, i, score =feature_get(dataSet,labels)

        if(score>thresh):
            splitset=splitdata(dataSet,bestfeature)
            sublabels=[l for l in labels if l!=bestfeature] # copy, so sibling branches keep their own feature set
            subnode=[]
            for condition,dataset in splitset.items():
                subnode.append(self.createtree(dataset, sublabels))

            return Node(bestfeature,subnode)

        else:
            return Node(max(ifallinclass, key=ifallinclass.count),subtree=None)


def preorder(root):
    print(root.feature)
    if root.subtree is not None:
        for node in root.subtree:
            preorder(node)


TREE=DS_Tree(data_f)
TREE.root=TREE.createtree(dataSet=data_f,labels=labels)


preorder(TREE.root)


#Finally got it working... tears of joy = =||






Binary file added Decision Tree/data.xlsx
89 changes: 89 additions & 0 deletions Naive Byes/byes.py
@@ -0,0 +1,89 @@
#Naive Bayes

import numpy as np

# Feature 1 (integers 0 to 5)
feature_1 = np.array([0, 1, 2, 3, 4, 5, 2, 3, 1, 4,
0, 5, 3, 2, 1, 4, 5, 0, 2, 1,
3, 4, 5, 1, 0, 4, 3, 2, 5, 1,
0, 2, 3, 4, 5, 0, 1, 2, 3, 4,
5, 3, 1, 4, 2, 5, 0, 1, 4, 3])

# Feature 2 (one of S, M, L)
feature_2 = np.array(['S', 'M', 'L', 'S', 'M', 'L', 'M', 'L', 'S', 'M',
'L', 'S', 'S', 'M', 'L', 'M', 'L', 'S', 'M', 'L',
'S', 'M', 'L', 'S', 'M', 'L', 'S', 'M', 'L', 'S',
'M', 'L', 'S', 'M', 'L', 'S', 'M', 'L', 'S', 'M',
'L', 'S', 'M', 'L', 'S', 'M', 'L', 'S', 'M', 'L'])

# Stack the two features
data_x = np.column_stack((feature_1, feature_2))
# Corresponding labels
data_y = np.array([1, -1, -1, -1, -1, -1, 1, -1, 1, -1,
1, -1, -1, -1, 1, -1, 1, -1, 1, -1,
1, -1, 1, -1, 1, -1, 1, -1, 1, -1,
1, -1, 1, -1, 1, -1, 1, -1, 1, -1,
1, -1, 1, -1, 1, -1, 1, -1, 1, -1])


#Goal: pick max(P(Y=1|X), P(Y=-1|X)); by Bayes' rule and the naive
#independence assumption, P(Y=ck|X) is proportional to P(Y=ck) * prod_j P(Xj=xj|Y=ck)



#Train/validation split

train_data=data_x[:40]
test_data=data_x[40:50]
train_labels=data_y[:40]
test_label=data_y[40:50]


#Compute the prior and the conditional probabilities



def get_one(data): # distinct values, in order of first appearance
    values=[]
    for i in data:
        if i not in values:
            values.append(i)
    return values
label=get_one(train_labels)
f1=get_one(feature_1)
f2=get_one(feature_2)
print(label)
print(f1)
print(f2)






#Compute the prior P(Y=ck)
def P_y(label,data):
    pos=np.zeros(len(label))
    for l in data:
        pos[label.index(l)]+=1 # count each class
    return pos/len(data)      # normalize the counts into probabilities
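
#Sanity check: the priors should sum to 1. 17 of the 40 training labels are
#+1, so with label order [1, -1] this should print roughly [0.425 0.575]
priors=P_y(label,train_labels)
print(priors,priors.sum())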

#Compute the conditional probabilities P(X=aj|Y=ck) for one feature dimension
def P_x_y(feature,label,data_feature,data_labels): #feature: all values x takes in this dimension
    counts=np.zeros((len(label),len(feature)))
    for x,y in zip(data_feature,data_labels):
        counts[label.index(y)][feature.index(x)]+=1 # count value aj within class ck
    return counts/counts.sum(axis=1,keepdims=True)  # row k holds P(X=aj|Y=ck)


#Stopping here for today, brain is a bit fried.
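
#A minimal end-to-end sketch: classify a point with the naive Bayes rule
#argmax_k P(Y=ck) * P(X1=x1|Y=ck) * P(X2=x2|Y=ck), reusing priors from above
cond_1=P_x_y(f1,label,feature_1[:40],train_labels) # table for feature 1
cond_2=P_x_y(f2,label,feature_2[:40],train_labels) # table for feature 2

def nb_predict(x1,x2):
    scores=[priors[k]*cond_1[k][f1.index(x1)]*cond_2[k][f2.index(x2)]
            for k in range(len(label))]
    return label[int(np.argmax(scores))]

print(nb_predict(2,'S')) # class with the larger posterior for x=(2,'S')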










