diff --git a/Decision Tree/CART.py b/Decision Tree/CART.py
new file mode 100644
index 0000000..4555d98
--- /dev/null
+++ b/Decision Tree/CART.py
@@ -0,0 +1,87 @@
import numpy as np
# Regression tree (CART)
# The region-splitting problem is, again, solved recursively.


data = [[1, 4.5], [2, 4.75], [3, 4.91], [4, 5.34], [5, 5.8], [6, 7.05], [7, 7.90], [8, 8.23], [9, 8.70], [10, 9.0]]
# index -1 is the y value; y is a continuous variable


def Split_find(dataSet):
    # Scan every (splitting variable j, splitting point s) pair and return the
    # pair that minimises the squared error of the two resulting regions.
    min_outside = {}
    for j in range(len(dataSet[0]) - 1):            # choose the splitting variable
        min_inside = {}
        for split in range(len(dataSet)):           # every sample [x1, x2, ..., y] is a candidate splitting point
            right_region_y = [item[-1] for item in dataSet if item[j] > dataSet[split][j]]
            left_region_y = [item[-1] for item in dataSet if item[j] <= dataSet[split][j]]
            # guard against an empty region (np.mean([]) is nan)
            sum1 = sum((y - np.mean(right_region_y)) ** 2 for y in right_region_y) if right_region_y else 0
            sum2 = sum((y - np.mean(left_region_y)) ** 2 for y in left_region_y) if left_region_y else 0
            min_inside[split] = sum1 + sum2
        best_split = min(min_inside, key=min_inside.get)
        min_outside[(best_split, j)] = min_inside[best_split]
    point, j = min(min_outside, key=min_outside.get)
    return point, j, min_outside[(point, j)]


print(Split_find(data))


class Node:
    def __init__(self, split_point, right_region, left_region):
        self.split_point = split_point      # splitting value (internal node) or region mean (leaf)
        self.right_region = right_region    # right subtree, None for a leaf
        self.left_region = left_region      # left subtree, None for a leaf


class regression_tree:
    def __init__(self, root=None):
        self.root = root

    # The intended stopping rule was to check whether a further split still
    # lowers the error; a simpler size-based rule is used here so that the
    # recursion is guaranteed to terminate.
    def create_tree(self, dataset, min_size=2):
        ys = [item[-1] for item in dataset]
        if len(dataset) < min_size:
            return Node(np.mean(ys), None, None)      # leaf: predict the region mean
        point, j, loss = Split_find(dataset)          # note: passing the global `data` here would always split the full dataset
        split_value = dataset[point][j]
        left = [item for item in dataset if item[j] <= split_value]
        right = [item for item in dataset if item[j] > split_value]
        if not right:                                 # degenerate split, stop here
            return Node(np.mean(ys), None, None)
        return Node(split_value,
                    self.create_tree(right, min_size),
                    self.create_tree(left, min_size))
diff --git a/Decision Tree/Dt.py b/Decision Tree/Dt.py
new file mode 100644
index 0000000..8bbb05d
--- /dev/null
+++ b/Decision Tree/Dt.py
@@ -0,0 +1,158 @@
# Decision tree

# The larger the entropy, the higher the uncertainty.
import numpy as np
import pandas as pd
import math

data_f = pd.read_excel('E:/Project_YOLO/Stattitiscs_learning_method/Decision Tree/data.xlsx')


# Information gain: for feature A and training set D, G(D,A) = H(D) - H(D|A)

# Entropy

print(data_f)

def entropy(X, data_):          # X is the column name of the random variable
    data = data_[X]
    values = []
    for i in data:
        if i not in values:
            values.append(i)
    counts = np.zeros(len(values))
    for i in data:
        counts[values.index(i)] += 1    # (the original `if i == l[l.index(i)]` test was always true)
    c = counts / len(data)
    d = np.array([0 if p == 0 else math.log(p, 2) for p in c])   # log base 2
    return -np.sum(c * d)


def condi_entropy(condition, X, data):
    values = []
    for i in data[condition]:
        if i not in values:
            values.append(i)
    counts = np.zeros(len(values))
    for i in data[condition]:
        counts[values.index(i)] += 1
    c = counts / len(data[condition])   # the weights |Di| / |D|

    d = [entropy(X, data[data[condition] == fe]) for fe in values]   # H(X | condition = fe)
    return np.sum(c * np.array(d))


g1 = entropy('是否发放贷款', data_f) - condi_entropy('年龄', '是否发放贷款', data_f)
print(g1)
g2 = entropy('是否发放贷款', data_f) - condi_entropy('有工作', '是否发放贷款', data_f)
print(g2)
# .... it works!!
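
# Sanity check, assuming data.xlsx holds the classic loan-approval table from the
# book (15 rows, 9 positive / 6 negative). Under that assumption the reference
# values would be
#   H(D)          = -(9/15)*log2(9/15) - (6/15)*log2(6/15) ≈ 0.971
#   g(D, 年龄)    = H(D) - H(D|年龄)   ≈ 0.083
#   g(D, 有工作)  = H(D) - H(D|有工作) ≈ 0.324
# so the g1 and g2 printed above can be checked against them.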

features = data_f.columns[1:-1]
labels = list(features)

# ID3 algorithm (C4.5 would instead use the information gain ratio to pick features)
def feature_get(dataset, labels):       # pick the feature with the largest information gain
    g_d_a = []
    for label in labels:
        g_d_a.append(entropy('是否发放贷款', dataset) - condi_entropy(label, '是否发放贷款', dataset))
    index = g_d_a.index(max(g_d_a))
    optimum_label = labels[index]
    return optimum_label, index, g_d_a[index]


def splitdata(dataSet, bestfeature):    # split the dataset on the chosen feature
    bestfeature_value = []
    splitset = {}
    for value in dataSet[bestfeature]:
        if value not in bestfeature_value:
            bestfeature_value.append(value)
    for condition in bestfeature_value:
        splitset[condition] = dataSet[dataSet[bestfeature] == condition]   # the subsets {Di}

    return splitset


class Node:
    def __init__(self, feature, subtree):
        self.feature = feature    # a feature name for an internal node, or a class label for a leaf
        self.subtree = subtree


class DS_Tree:
    def __init__(self, data):
        self.root = None

    def createtree(self, dataSet, labels, thresh=0):    # threshold defaults to 0
        # sublabels is the feature set used further down the tree; each feature
        # is removed once it has been used.

        # First check whether all instances belong to a single class:
        ifallinclass = list(dataSet['是否发放贷款'])
        if len(set(ifallinclass)) == 1:
            return Node(ifallinclass[0], subtree=None)
        # if so, return that class directly

        # if no features are left, use the majority class as the node
        if len(labels) == 0:
            return Node(max(ifallinclass, key=ifallinclass.count), subtree=None)

        bestfeature, i, score = feature_get(dataSet, labels)

        if score > thresh:
            subsets = splitdata(dataSet, bestfeature)
            # note: labels.remove(bestfeature) would mutate the list shared by
            # sibling branches, so build a fresh sublabels list for the recursion
            sublabels = [label for label in labels if label != bestfeature]
            subnode = []
            for condition, subset in subsets.items():
                subnode.append(self.createtree(subset, sublabels))
            return Node(bestfeature, subnode)
        else:
            return Node(max(ifallinclass, key=ifallinclass.count), subtree=None)


def preorder(root):
    print(root.feature)
    if root.subtree is not None:
        for node in root.subtree:
            preorder(node)


TREE = DS_Tree(data_f)
TREE.root = TREE.createtree(dataSet=data_f, labels=labels)

preorder(TREE.root)

# finally got it working... almost in tears = =||
diff --git a/Decision Tree/data.xlsx b/Decision Tree/data.xlsx
new file mode 100644
index 0000000..807187e
Binary files /dev/null and b/Decision Tree/data.xlsx differ
diff --git a/Naive Byes/byes.py b/Naive Byes/byes.py
new file mode 100644
index 0000000..955fdcf
--- /dev/null
+++ b/Naive Byes/byes.py
@@ -0,0 +1,89 @@
# Naive Bayes

import numpy as np

# feature 1: integers from 0 to 5
feature_1 = np.array([0, 1, 2, 3, 4, 5, 2, 3, 1, 4,
                      0, 5, 3, 2, 1, 4, 5, 0, 2, 1,
                      3, 4, 5, 1, 0, 4, 3, 2, 5, 1,
                      0, 2, 3, 4, 5, 0, 1, 2, 3, 4,
                      5, 3, 1, 4, 2, 5, 0, 1, 4, 3])

# feature 2: one of S, M, L
feature_2 = np.array(['S', 'M', 'L', 'S', 'M', 'L', 'M', 'L', 'S', 'M',
                      'L', 'S', 'S', 'M', 'L', 'M', 'L', 'S', 'M', 'L',
                      'S', 'M', 'L', 'S', 'M', 'L', 'S', 'M', 'L', 'S',
                      'M', 'L', 'S', 'M', 'L', 'S', 'M', 'L', 'S', 'M',
                      'L', 'S', 'M', 'L', 'S', 'M', 'L', 'S', 'M', 'L'])

# stack the features (note: np.column_stack casts everything to strings here)
data_x = np.column_stack((feature_1, feature_2))
# the corresponding labels
data_y = np.array([1, -1, -1, -1, -1, -1, 1, -1, 1, -1,
                   1, -1, -1, -1, 1, -1, 1, -1, 1, -1,
                   1, -1, 1, -1, 1, -1, 1, -1, 1, -1,
                   1, -1, 1, -1, 1, -1, 1, -1, 1, -1,
                   1, -1, 1, -1, 1, -1, 1, -1, 1, -1])


# Goal: pick the larger of P(Y=1|X) and P(Y=-1|X)


# train / test split

train_data = data_x[:40]
test_data = data_x[40:50]
train_labels = data_y[:40]
test_labels = data_y[40:50]


# Compute the prior and the conditional probabilities

def get_one(data):      # unique values, in order of first appearance
    values = []
    for i in data:
        if i not in values:
            values.append(i)
    return values

label = get_one(train_labels)
f1 = get_one(feature_1)
f2 = get_one(feature_2)
print(label)
print(f1)
print(f2)


# prior P(Y = yi)
def P_y(label, data):
    pos = np.zeros(len(label))
    for l in data:
        pos[label.index(l)] += 1    # (the original `if l == label[label.index(l)]` test was always true)
    return pos / len(data)          # normalise counts into probabilities


# conditional probability P(X_j = a | Y = c) for every value a of feature j;
# the original body was left as a stub ("skipping for now — brain is a bit
# fried today"); this is a straightforward relative-frequency estimate
def P_x_y(feature_values, c, feature_column, label_column):
    mask = label_column == c
    pos = np.zeros(len(feature_values))
    for i, a in enumerate(feature_values):
        pos[i] = np.sum(feature_column[mask] == a) / np.sum(mask)
    return pos
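
# A minimal sketch of the decision rule the two estimators above feed into,
#   y = argmax_c  P(Y=c) * prod_j P(X_j = x_j | Y=c)
# `classify` and its argument layout are illustrative additions; features are
# compared as strings because column_stack cast data_x to strings.
def classify(x, train_data, train_labels):
    classes = get_one(train_labels)
    priors = P_y(classes, train_labels)
    best_c, best_p = None, -1.0
    for k, c in enumerate(classes):
        p = priors[k]
        for j in range(train_data.shape[1]):
            values = get_one(train_data[:, j])
            p *= P_x_y(values, c, train_data[:, j], train_labels)[values.index(x[j])]
        if p > best_p:
            best_c, best_p = c, p
    return best_c

# accuracy on the ten held-out samples
print(np.mean([classify(x, train_data, train_labels) == y_true
               for x, y_true in zip(test_data, test_labels)]))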
diff --git "a/k\350\277\221\351\202\273/K.py" "b/k\350\277\221\351\202\273/K.py"
new file mode 100644
index 0000000..3be635e
--- /dev/null
+++ "b/k\350\277\221\351\202\273/K.py"
@@ -0,0 +1,99 @@
# k-nearest neighbours

# Build the kd-tree.
import numpy as np


# dataset
x = np.array([[2, 3], [5, 4], [9, 6], [4, 7], [8, 1], [7, 2]])
y = np.array([1, 1, 2, 3, 3, 4])
k = 1

# distance metrics
def Euclidean_distance(x, y):       # Euclidean distance
    distance = 0
    for i in range(len(x)):
        distance += (x[i] - y[i]) ** 2
    return distance ** (1 / 2)

def Manhattan_distance(x, y):       # Manhattan distance
    distance = 0
    for i in range(len(x)):
        distance += abs(x[i] - y[i])
    return distance                 # (the return statement was missing)

def max_distance(x, y):             # L-infinity distance
    return max(abs(x - y))


# tree nodes
class KdNode(object):
    def __init__(self, dom_elt, dim=0, left=None, right=None):
        self.dom_elt = dom_elt  # the k-dimensional sample stored at this node
        self.dim = dim          # index of the splitting dimension
        self.left = left        # kd-tree of the left half-space
        self.right = right      # kd-tree of the right half-space


class KdTree(object):
    def __init__(self, data):
        k = len(data[0])  # dimensionality of the data

        # create a KdNode by splitting data_set on dimension dim
        def _CreateNode(dim, data_set):
            if not data_set:  # empty set
                return None

            # sort on the splitting dimension (key = sort criterion)
            data_set.sort(key=lambda x: x[dim])
            split_pos = len(data_set) // 2
            median = data_set[split_pos]     # split at the median
            split_next = (dim + 1) % k       # cycle through the coordinates

            # build the kd-tree recursively (a nested function)
            return KdNode(
                median,
                dim,
                _CreateNode(split_next, data_set[:split_pos]),       # left subtree
                _CreateNode(split_next, data_set[split_pos + 1:]))   # right subtree

        self.root = _CreateNode(0, data)  # start splitting on dimension 0


# preorder traversal of the kd-tree
def preorder(root):
    print(root.dom_elt)
    if root.left:   # node is not empty
        preorder(root.left)
    if root.right:
        preorder(root.right)

data = [[2, 3], [5, 4], [9, 6], [4, 7], [8, 1], [7, 2]]
kd = KdTree(data)
preorder(kd.root)   # (the original print(kd.root.right.left.left) only printed None)
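
# The file builds the tree but stops before the search step. Below is a minimal
# sketch of nearest-neighbour search on this KdTree, assuming Euclidean distance;
# the helper name `find_nearest` and the query point are illustrative additions.
def find_nearest(node, target, best=None):
    if node is None:
        return best
    d = Euclidean_distance(node.dom_elt, target)
    if best is None or d < best[1]:
        best = (node.dom_elt, d)        # best = (point, distance) so far
    dim = node.dim
    # descend first into the half-space that contains the target
    near, far = (node.left, node.right) if target[dim] <= node.dom_elt[dim] else (node.right, node.left)
    best = find_nearest(near, target, best)
    # the other half-space can only help if the splitting plane is closer
    # than the best distance found so far
    if abs(target[dim] - node.dom_elt[dim]) < best[1]:
        best = find_nearest(far, target, best)
    return best

print(find_nearest(kd.root, [3, 4.5]))  # for this dataset: ([2, 3], ~1.80)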
diff --git a/perceptron/README.md b/perceptron/README.md
new file mode 100644
index 0000000..002553c
--- /dev/null
+++ b/perceptron/README.md
@@ -0,0 +1,37 @@
# Perceptron Tutorial

## The perceptron
The perceptron is one of the simplest forms of artificial neural network and is used for binary classification.

## How the perceptron works

### Core idea
- The perceptron takes inputs (x1, x2, ..., xn), each with a corresponding weight (w1, w2, ..., wn).
- The weighted inputs are summed together with a bias term.
- The result is passed through an activation function to produce the final classification.

### The mathematics
In formulas:
- Weighted sum: \( \sum_{i=1}^{n} w_i \cdot x_i + b \)
- Activation: \( f(x) = \begin{cases} 1, & \text{if } \sum_{i=1}^{n} w_i \cdot x_i + b > 0 \\ 0, & \text{otherwise} \end{cases} \)

### Weight updates
Learning uses a simple update rule, driven by stochastic gradient descent, so that the weights and the bias reduce the classification error.
In each training round, for every input xi and its true label yi, the weights are updated as:
- \( w_i = w_i + \alpha \cdot (y_i - \hat{y_i}) \cdot x_i \), where \(\alpha\) is the learning rate and \(\hat{y_i}\) the predicted value, \(y_i\) the true value
- \( b = b + \alpha \cdot (y_i - \hat{y_i}) \)

### Dual form
The perceptron also has a dual form, in which the data and labels enter the weight updates only through inner (dot) products.
The updates keep the same shape:
- \( w_i = w_i + \alpha \cdot (y_i - \hat{y_i}) \cdot x_i \)
- \( b = b + \alpha \cdot (y_i - \hat{y_i}) \cdot 1 \), where \(x_i\) and \(1\) enter through the dot-product computation.

## Summary
The perceptron is one of the simplest models: by learning a weight vector and a bias it can solve simple binary classification problems.

## Files
eg1.py — the original form
eg_dual_form.py — the dual form; note the Gram matrix
diff --git a/perceptron/eg1.py b/perceptron/eg1.py
new file mode 100644
index 0000000..02b088b
--- /dev/null
+++ b/perceptron/eg1.py
@@ -0,0 +1,64 @@
# fit f(x) = sign(w*x + b), a linear classifier
import numpy as np
# data points
x = np.array([[3, 3], [4, 3], [1, 1]])
y = np.array([1, 1, -1])
# initial parameters (floats, so the gradient step keeps a single dtype)
w = np.array([0.0, 0.0])
b = 0
# dot product of two vectors of the same length
def muti_matrix(a, b):
    result = 0
    for i in range(len(a)):
        result += a[i] * b[i]
    return result

# decision function
def classifier_result(w, b, x):
    if (muti_matrix(w, x) + b) > 0:
        return 1
    if (muti_matrix(w, x) + b) < 0:
        return -1
    return 0    # exactly on the hyperplane


# loss function
def loss_function_L(w, b, x, y):
    L = 0
    record = []     # record the misclassified points
    for i in range(len(x)):
        if classifier_result(w, b, x[i]) != y[i]:
            L += -y[i] * (muti_matrix(w, x[i]) + b)
            record.append(i)
    return L, record


# optimizer
# nata: learning rate; only called while record != []
def optimizer(nata, record):
    # dL/dw (gradient); a numpy array, not a list, so that += accumulates
    # instead of extending a Python list
    Lw = np.zeros(len(w))
    # dL/db
    Lb = 0
    i = record[0]
    Lw += -y[i] * x[i]
    Lb += -y[i]
    return nata * Lw, nata * Lb


# main loop
L, record = loss_function_L(w, b, x, y)
while record != []:
    dw, db = optimizer(1, record)
    w -= dw     # step against the gradient!
    b -= db
    L, record = loss_function_L(w, b, x, y)
    print(w, b, L)
print("results:", L, " ", w, " ", b)
diff --git a/perceptron/eg_dual_form.py b/perceptron/eg_dual_form.py
new file mode 100644
index 0000000..d6af560
--- /dev/null
+++ b/perceptron/eg_dual_form.py
@@ -0,0 +1,54 @@
# recall: perceptron f = sign(w*x + b)

# loss function: -sum over misclassified points of yi*(w*xi + b)

import numpy as np
# data points
x = np.array([[3, 3], [4, 3], [1, 1]])
y = np.array([1, 1, -1])

def muti_matrix(a, b):      # dot product
    result = 0
    for i in range(len(a)):
        result += a[i] * b[i]
    return result

def opti_target(i, a, b, Gram):     # functional margin of sample xi in the dual form
    total = 0                       # (renamed from `sum`, which shadowed the builtin)
    for j in range(len(x)):
        total += a[j] * y[j] * Gram[j][i]
    return y[i] * (total + b)


# initial parameters
a = np.array([0, 0, 0])
b = 0

# compute the Gram matrix
Gram = np.zeros((len(x), len(x)))
for i in range(len(x)):
    for j in range(len(x)):
        Gram[i][j] = muti_matrix(x[i], x[j])


sp = False      # flag: True once a full pass finds no misclassified point
nata = 1        # learning rate
while sp == False:
    sp = True
    for i in range(len(x)):
        print(i, "opti:", opti_target(i, a, b, Gram))
        if opti_target(i, a, b, Gram) <= 0:
            a[i] = a[i] + nata
            b = b + nata * y[i]
            sp = False
            print("a:", a, "b:", b)
print("result:", a, b)
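
# Recovering the primal parameters from the dual solution (a short illustrative
# addition): w = sum_i a_i * y_i * x_i, while b is learned directly above. With
# eta = 1 and this visiting order, the loop ends at a = [2, 0, 5], b = -3, so the
# separating hyperplane is x1 + x2 - 3 = 0, matching the original form in eg1.py.
w = np.zeros(len(x[0]))
for i in range(len(x)):
    w += a[i] * y[i] * x[i]
print("w:", w, "b:", b)     # expected: w = [1. 1.], b = -3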