-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 920c9f5
Showing
8 changed files
with
588 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
|
||
|
||
|
||
|
||
import numpy as np | ||
#Regression Tree | ||
#区域划分问题 还是要递归的放法 | ||
|
||
|
||
data=[[1,4.5],[2,4.75],[3,4.91],[4,5.34],[5,5.8],[6,7.05],[7,7.90],[8,8.23],[9,8.70],[10,9.0]] #INDEX -1 为Y值 | ||
#Y为连续变量 | ||
|
||
|
||
|
||
def Split_find(dataSet):
    """Find the best (split point, dimension) for a regression-tree split.

    Scans every feature dimension and every sample value as a candidate
    threshold, partitions the data into left (<=) / right (>) regions, and
    scores each candidate by the total squared error around the region means
    (the CART least-squares criterion).

    Parameters
    ----------
    dataSet : list[list[float]]
        Rows of [x1, ..., xk, y]; the last element of each row is the target.

    Returns
    -------
    (point, j, loss) : tuple
        point -- row index whose value in dimension ``j`` is the best threshold,
        j     -- index of the best splitting dimension,
        loss  -- the minimal total squared error over both regions.
    """
    min_outside = {}
    for dim in range(len(dataSet[0]) - 1):  # candidate splitting dimension
        min_inside = {}
        for split in range(len(dataSet)):  # threshold = dataSet[split][dim]
            thresh = dataSet[split][dim]
            right_y = [row[-1] for row in dataSet if row[dim] > thresh]
            left_y = [row[-1] for row in dataSet if row[dim] <= thresh]
            loss = 0.0
            # An empty region contributes zero error.  Guarding here avoids
            # the nan that np.mean([]) produced in the original when the
            # threshold was the dimension's maximum (empty right region),
            # which corrupted the min() comparisons below.
            for region in (right_y, left_y):
                if region:
                    c = np.mean(region)
                    loss += sum((y - c) ** 2 for y in region)
            min_inside[split] = loss
        best_split = min(min_inside, key=min_inside.get)
        min_outside[(best_split, dim)] = min_inside[best_split]
    point, j = min(min_outside, key=min_outside.get)
    return point, j, min_outside[(point, j)]
|
||
|
||
print(Split_find(data)) | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
class Node:
    """A regression-tree node: a split point plus its two child regions."""

    def __init__(self, split_point, right_region, left_region):
        # Threshold/sample at which this node splits, and the samples that
        # fall on the greater-than (right) and less-or-equal (left) sides.
        self.split_point, self.right_region, self.left_region = (
            split_point,
            right_region,
            left_region,
        )
|
||
class regression_tree:
    # NOTE(review): unfinished class -- create_tree computes a single split
    # and then stops; no Node objects are built and nothing is returned yet.
    def __init__(self,root):
        # Root node of the tree (expected to be a Node once built).
        self.root=root

    def create_tree(self,dataset): # stopping condition: check whether a further split still reduces the error
        # NOTE(review): calls Split_find on the module-level `data` instead of
        # the `dataset` parameter -- presumably a bug to fix when this method
        # is completed.
        split_point,split_dim,loss_f=Split_find(dataSet=data)
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
#Decision Tree | ||
|
||
#entropy越大 不确定性越高。 | ||
import math
from collections import Counter

import numpy as np
import pandas as pd
data_f=pd.read_excel('E:/Project_YOLO/Stattitiscs_learning_method/Decision Tree/data.xlsx') | ||
|
||
|
||
#信息增益: 特征A对训练数据集D的信息增益G(D,A)=H(D)-H(D|A) | ||
|
||
#信息熵 | ||
|
||
print(data_f) | ||
|
||
def entropy(X, data_):
    """Shannon entropy H(X) of column `X` in `data_`, in bits.

    Parameters
    ----------
    X : hashable
        Column label to read from `data_`.
    data_ : mapping of label -> iterable (e.g. a pandas DataFrame)
        Container whose ``data_[X]`` yields the observed values.

    Returns
    -------
    float
        H(X) = -sum_i p_i * log2(p_i) over the empirical distribution.
    """
    values = list(data_[X])
    total = len(values)
    # Counter replaces the original O(n^2) list.index() tallying.  Only
    # observed values appear in it, so every probability is > 0 and the
    # original's special case for p == 0 was dead code.
    counts = Counter(values)
    return -sum((n / total) * math.log2(n / total) for n in counts.values())
|
||
|
||
def condi_entropy(condition, X, data):
    """Conditional entropy H(X | condition) over the rows of `data`, in bits.

    Weights the entropy of `X` inside each subgroup (one subgroup per
    distinct value of the `condition` column) by that subgroup's empirical
    probability:  H(X|A) = sum_v P(A = v) * H(X | A = v).

    Parameters
    ----------
    condition : hashable -- column whose values define the partition.
    X : hashable -- column whose entropy is measured within each part.
    data : pandas.DataFrame -- rows to evaluate.

    Returns
    -------
    float -- the conditional entropy H(X | condition).
    """
    column = data[condition]
    total = len(column)
    # value -> occurrence count; replaces the original O(n^2) list.index()
    # tallying and its always-true `if i == l[l.index(i)]` guard.
    weights = Counter(column)
    return sum(
        (count / total) * entropy(X, data[column == value])
        for value, count in weights.items()
    )
|
||
|
||
|
||
|
||
# Information gain of each feature w.r.t. the loan-decision label column:
# G(D, A) = H(D) - H(D | A).
g1=entropy('是否发放贷款',data_f)-condi_entropy('年龄','是否发放贷款',data_f)
print(g1)
g2=entropy('是否发放贷款',data_f)-condi_entropy('有工作','是否发放贷款',data_f)
print(g2)
# (original note: "edited successfully!!")

# Candidate feature columns: everything between the first (id) column and
# the final label column.
features=data_f.columns[1:-1]
labels=[]
for label in features:
    labels.append(label)
#ID3算法 (C4.5算法则采用信息增益比来选择特征) | ||
def feature_get(dataset, labels, target='是否发放贷款'):
    """Pick the feature with the highest information gain (ID3 criterion).

    Parameters
    ----------
    dataset : pandas.DataFrame -- training rows.
    labels : sequence of column labels -- candidate features to score.
    target : column label -- the class column.  Defaults to the loan-decision
        column so existing two-argument callers keep working, but the
        function is no longer hard-wired to that dataset.

    Returns
    -------
    (optimum_label, index, gain) : the best feature, its position in
        `labels`, and its information gain G(D, A) = H(D) - H(D | A).
    """
    base = entropy(target, dataset)  # H(D) is loop-invariant; compute once
    g_d_a = [base - condi_entropy(label, target, dataset) for label in labels]
    index = g_d_a.index(max(g_d_a))
    return labels[index], index, g_d_a[index]
|
||
|
||
def splitdata(dataSet, bestfeature):
    """Partition `dataSet` by the distinct values of column `bestfeature`.

    Returns a dict mapping each distinct value (in first-appearance order)
    to the sub-DataFrame of rows holding that value -- the {Di} subsets.
    """
    # dict.fromkeys keeps first-seen order, matching the original's manual
    # "append if unseen" scan.
    distinct = dict.fromkeys(dataSet[bestfeature])
    return {
        value: dataSet[dataSet[bestfeature] == value]
        for value in distinct
    }
|
||
class Node:
    """A decision-tree node: a feature (or class value at a leaf) and children."""

    def __init__(self, feature, subtree):
        # Feature chosen at this node, or the class value when this is a leaf.
        # Subtree is the list of child Nodes, or None for a leaf.
        self.feature, self.subtree = feature, subtree
|
||
|
||
|
||
|
||
class DS_Tree:
    """ID3 decision tree over a DataFrame whose class column is '是否发放贷款'."""

    def __init__(self, data):
        # Root Node of the tree; filled in by createtree().
        # (The original also computed an unused local `k = len(data.columns)`.)
        self.root = None

    def createtree(self, dataSet, labels, thresh=0):
        """Recursively build the ID3 tree and return its root Node.

        Parameters
        ----------
        dataSet : pandas.DataFrame -- rows to classify at this subtree.
        labels : list of column labels -- features still available.
        thresh : float -- minimum information gain required to keep splitting.

        Returns
        -------
        Node -- an internal node holding the chosen feature, or a leaf
            (subtree=None) holding a class value.
        """
        classes = list(dataSet['是否发放贷款'])
        # Stop 1: every row already belongs to one class -> leaf with it.
        if len(set(classes)) == 1:
            return Node(classes[0], subtree=None)
        # Stop 2: no features left -> leaf labelled with the majority class.
        if len(labels) == 0:
            return Node(max(classes, key=classes.count), subtree=None)
        bestfeature, i, score = feature_get(dataSet, labels)
        # Stop 3: best achievable gain is below the threshold -> majority leaf.
        if score <= thresh:
            return Node(max(classes, key=classes.count), subtree=None)
        partitions = splitdata(dataSet, bestfeature)
        # Recurse with a copy that excludes the used feature instead of
        # mutating the caller's list: the original `labels.remove(...)`
        # leaked removals across sibling branches and back to the caller.
        remaining = [label for label in labels if label != bestfeature]
        subnode = [
            self.createtree(subset, remaining)
            for subset in partitions.values()
        ]
        return Node(bestfeature, subnode)
|
||
|
||
def preorder(root):
    """Print the tree rooted at `root` in pre-order: node first, then children.

    Parameters
    ----------
    root : Node-like -- any object with `feature` and `subtree` attributes,
        where `subtree` is a list of children or None for a leaf.
    """
    print(root.feature)
    # `is not None` rather than `!= None`: identity, not equality, is the
    # idiomatic test for the leaf sentinel.
    if root.subtree is not None:
        for node in root.subtree:
            preorder(node)
|
||
|
||
# Build the decision tree from the Excel-loaded frame and print it pre-order.
TREE=DS_Tree(data_f)
TREE.root=TREE.createtree(dataSet=data_f,labels=labels)

preorder(TREE.root)

# (original note: "finally got it working -- tears of joy")
|
||
|
||
|
||
|
||
|
||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
#朴素贝叶斯 | ||
|
||
import numpy as np | ||
|
||
# Feature 1: integers in 0..5.
feature_1 = np.array([0, 1, 2, 3, 4, 5, 2, 3, 1, 4,
                      0, 5, 3, 2, 1, 4, 5, 0, 2, 1,
                      3, 4, 5, 1, 0, 4, 3, 2, 5, 1,
                      0, 2, 3, 4, 5, 0, 1, 2, 3, 4,
                      5, 3, 1, 4, 2, 5, 0, 1, 4, 3])

# Feature 2: one of 'S', 'M', 'L'.
feature_2 = np.array(['S', 'M', 'L', 'S', 'M', 'L', 'M', 'L', 'S', 'M',
                      'L', 'S', 'S', 'M', 'L', 'M', 'L', 'S', 'M', 'L',
                      'S', 'M', 'L', 'S', 'M', 'L', 'S', 'M', 'L', 'S',
                      'M', 'L', 'S', 'M', 'L', 'S', 'M', 'L', 'S', 'M',
                      'L', 'S', 'M', 'L', 'S', 'M', 'L', 'S', 'M', 'L'])

# Stack the two features column-wise into the design matrix.
data_x = np.column_stack((feature_1, feature_2))
# Class labels in {+1, -1}, aligned row-for-row with data_x.
data_y = np.array([1, -1, -1, -1, -1, -1, 1, -1, 1, -1,
                   1, -1, -1, -1, 1, -1, 1, -1, 1, -1,
                   1, -1, 1, -1, 1, -1, 1, -1, 1, -1,
                   1, -1, 1, -1, 1, -1, 1, -1, 1, -1,
                   1, -1, 1, -1, 1, -1, 1, -1, 1, -1])


# Goal: argmax over y of P(Y = y | X), i.e. max(P(Y=1|X), P(Y=-1|X)).


# Train / test split: first 40 rows train, last 10 test.
train_data=data_x[:40]
test_data=data_x[40:50]
train_labels=data_y[:40]
test_label=data_y[40:50]
|
||
|
||
#计算先验概率以及条件概率 | ||
|
||
|
||
|
||
def get_one(data):
    """Return the distinct values of `data` in first-appearance order.

    Parameters
    ----------
    data : iterable of hashables (list, ndarray, ...).

    Returns
    -------
    list -- each value once, ordered by first occurrence.
    """
    # dict preserves insertion order (3.7+) and gives O(1) membership tests,
    # replacing the original O(n^2) `not in list` scan (which also shadowed
    # the builtin name `list`).
    return list(dict.fromkeys(data))
# Distinct class labels and distinct values of each feature dimension --
# the supports over which priors and likelihoods will be estimated.
label=get_one(train_labels)
f1=get_one(feature_1)
f2=get_one(feature_2)
print(label)
print(f1)
print(f2)
|
||
|
||
|
||
|
||
|
||
|
||
#计算P(yi) | ||
def P_y(label, data):
    """Count the occurrences of each class value in `data`.

    Parameters
    ----------
    label : list -- the distinct class values, in a fixed order.
    data : iterable -- observed class values; every element must occur in
        `label` (otherwise `label.index` raises ValueError, as before).

    Returns
    -------
    numpy.ndarray -- pos[i] == number of times label[i] occurs in `data`.
        Note these are raw counts, not probabilities; divide by len(data)
        to obtain the priors P(y_i).
    """
    pos = np.zeros(len(label))
    # The original guard `if l == label[label.index(l)]` is always true
    # whenever `l` is present, so the dead check is dropped.
    for l in data:
        pos[label.index(l)] += 1
    return pos
|
||
#计算条件概率 | ||
def P_x_y(feature,label,data_feature): # feature: all values x takes in one dimension
    # NOTE(review): unfinished stub (the author's note below says it was
    # deferred).  `range(data_feature)` would raise TypeError for a
    # list/array argument -- presumably range(len(data_feature)) was meant;
    # confirm when implementing the conditional probability P(x | y).
    pos=np.zeros(len(data_feature))
    for i in range(data_feature):
        pass
|
||
|
||
#暂时略过,今天脑子有点瓦特了。 | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
Oops, something went wrong.