
Commit

zhu78244 committed Oct 5, 2023
2 parents b8597f5 + 86b0a07 commit fb5a8da
Showing 6 changed files with 335 additions and 3 deletions.
13 changes: 12 additions & 1 deletion README.md
@@ -22,7 +22,17 @@
![k_neighbors](https://github.com/jinxianwei/CloudImg/assets/81373517/4b25b680-c883-48e2-9846-357959fe7363)
- [x] SVM (classification task)
Visualization of the decision boundaries of different SVM classifiers on the test set
![svm](https://github.com/jinxianwei/CloudImg/assets/81373517/36703295-9af3-406e-b8bb-728c77852bb8)
![svm](https://github.com/jinxianwei/CloudImg/assets/81373517/2a154234-ba2a-45d8-88ef-0ea4bd59cabf)
- [x] LogisticRegression with PCA
Visualization of the first two principal components of the training and test features, colored by class
![x_train_pca](https://github.com/jinxianwei/CloudImg/assets/81373517/00878756-df1f-4e64-a04b-213371fda10b)
![x_test_pca](https://github.com/jinxianwei/CloudImg/assets/81373517/d14fa1de-e5bf-46f2-8707-91d86bb2be21)
- [x] Plot the predicted probabilities of different classifiers on the test set
![prob](https://github.com/jinxianwei/CloudImg/assets/81373517/b498966e-64c9-4c3f-88db-8ff114d29ec8)
- [x] Outlier detection
Visualization of the predicted outliers versus the ground-truth outliers
![outlier_detection_pred](https://github.com/jinxianwei/CloudImg/assets/81373517/0975ce3d-b0bc-41b3-ba28-9b9d7464fbe6)
![outlier_detection](https://github.com/jinxianwei/CloudImg/assets/81373517/09efcfd2-866f-4f9d-b0db-f6988a7855e1)

#### Deep learning
Relies on **Pytorch**, with the **Pytorch_Lightning** framework
@@ -44,6 +54,7 @@ python machine_learning/logistic_regression/train.py

# Adaboost (classification task)
python machine_learning/adaboost/adaboost_classifier.py
...

# Deep regression
python deep_learning/regression/train.py
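Note on the new scripts below: they all load the iris data through `utils.read_csv_data.read_csv_data`, which is not part of this diff. A minimal sketch of the interface those scripts appear to assume — a column-name dict plus a float array whose last column is a numeric class label — is given here for orientation; the actual helper in the repository may be implemented differently.

```python
# Hypothetical sketch of the read_csv_data contract assumed by the new scripts;
# the real utils/read_csv_data.py may differ.
import csv
import numpy as np

def read_csv_data(path):
    """Return ({column_index: column_name}, data array with the label in the last column)."""
    with open(path, newline="") as f:
        rows = list(csv.reader(f))
    header, body = rows[0], rows[1:]
    name_dict = {i: name for i, name in enumerate(header)}
    data = np.array(body, dtype=float)  # assumes the label column is already numeric
    return name_dict, data
```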
88 changes: 88 additions & 0 deletions machine_learning/pca/pca_cla.py
@@ -0,0 +1,88 @@
import os
import sys
current_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.insert(0, current_path)
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from utils.read_csv_data import read_csv_data

def main():
    # 1. Read the csv data
    name_dict, data = read_csv_data("dataset/iris.csv")
    # The three class labels of the iris dataset
    label_dict = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
    label_list = ['setosa', 'versicolor', 'virginica']

    # 2. Separate features and labels
    x = data[:, :-1]
    y = data[:, -1]

    # 3. Feature preprocessing
    # Before PCA the features must be standardized so that they all contribute on the same scale.
    # The scaler is fitted on the training split only (see step 4), to avoid leaking test-set statistics.

    # 4. Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.5, random_state=0)
    # Only the training set is available during training, so the test set must be
    # transformed with statistics fitted on the training set
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    # Apply PCA dimensionality reduction to the training and test sets
    k = 0.98  # fraction of variance to retain
    pca = PCA(n_components=k)
    x_train_pca = pca.fit_transform(x_train)  # fit on the training set and reduce it
    x_test_pca = pca.transform(x_test)        # reduce the test set with the fitted model
    print("Number of principal components: {}".format(pca.n_components_))
    # Meaning: reducing to this many dimensions retains 98% of the variance of the original features

    # 5. Build a logistic regression model on the PCA-reduced training set
model = LogisticRegression()
model.fit(x_train_pca, y_train)

    # 6. Classify the PCA-reduced test set and evaluate the model
y_pred = model.predict(x_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred, average='macro')
recall = recall_score(y_true=y_test, y_pred=y_pred, average='macro')
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
print(f"精确率为{accuracy}, 准确度为{precision}, 召回率为{recall}, F1分数为{f1}")

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

    # Visualize the classes in the first two principal components
plt.figure()
colors = ["navy", "turquoise", "darkorange"]
lw = 2
for color, i, target_name in zip(colors, [0, 1, 2], label_list):
plt.scatter(
x_train_pca[y_train == i, 0], x_train_pca[y_train == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
)
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("x_train_pca of IRIS dataset")
plt.savefig("./x_train_pca.png")

plt.figure()
colors = ["navy", "turquoise", "darkorange"]
lw = 2
for color, i, target_name in zip(colors, [0, 1, 2], label_list):
plt.scatter(
x_test_pca[y_test == i, 0], x_test_pca[y_test == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
)
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("x_test_pca of IRIS dataset")
plt.savefig("./x_test_pca.png")


if __name__ == "__main__":
main()
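As a side note on the `n_components=0.98` setting above: passing a float to `PCA` keeps the smallest number of components whose cumulative explained variance reaches that fraction. A minimal standalone sketch, using sklearn's built-in iris loader instead of the repository's CSV helper (an assumption made only to keep the example self-contained):

```python
# Minimal sketch: a fractional n_components keeps just enough components to
# reach that share of the total variance (here 98%).
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

x = StandardScaler().fit_transform(load_iris().data)
pca = PCA(n_components=0.98).fit(x)
print("components kept:", pca.n_components_)
print("per-component variance ratio:", np.round(pca.explained_variance_ratio_, 3))
print("cumulative variance retained:", round(float(pca.explained_variance_ratio_.sum()), 3))
```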
101 changes: 101 additions & 0 deletions machine_learning/plot_classification_probability/plt_probability.py
@@ -0,0 +1,101 @@
import os
import sys
current_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.insert(0, current_path)

import matplotlib.pyplot as plt
import numpy as np

from sklearn import datasets
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from utils.read_csv_data import read_csv_data
from sklearn.model_selection import train_test_split

def main():
    # 1. Read the csv data
    name_dict, data = read_csv_data("dataset/iris.csv")
    # The three class labels of the iris dataset
    label_dict = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
    label_list = ['setosa', 'versicolor', 'virginica']
    feature_name = [val for key, val in name_dict.items()]

    # 2. Separate features and labels
    x = data[:, :2]  # use only the first two features to keep the visualization simple
y = data[:, -1]

    # 3. Split into training and test sets
x_train, x_test, y_train, y_test= train_test_split(x, y, stratify=y, test_size=0.5, random_state=0)

n_features = x.shape[1]

    # 4. Create the models
    C = 10
    kernel = 1.0 * RBF([1.0, 1.0])  # for GPC
    # Build the different classifiers
classifiers = {
"L1 logistic": LogisticRegression(
C=C, penalty="l1", solver="saga", multi_class="multinomial", max_iter=10000
),
"L2 logistic (Multinomial)": LogisticRegression(
C=C, penalty="l2", solver="saga", multi_class="multinomial", max_iter=10000
),
"L2 logistic (OvR)": LogisticRegression(
C=C, penalty="l2", solver="saga", multi_class="ovr", max_iter=10000
),
"Linear SVC": SVC(kernel="linear", C=C, probability=True, random_state=0),
"GPC": GaussianProcessClassifier(kernel),
}

n_classifiers = len(classifiers)

plt.figure(figsize=(3 * 2, n_classifiers * 2))
plt.subplots_adjust(bottom=0.2, top=0.95)

    xx = np.linspace(3, 9, 100)    # roughly x[:, 0].min() ~ x[:, 0].max()
    yy = np.linspace(1, 5, 100).T  # roughly x[:, 1].min() ~ x[:, 1].max(); the grid must cover the feature ranges to visualize the sample space
    xx, yy = np.meshgrid(xx, yy)
    Xfull = np.c_[xx.ravel(), yy.ravel()]  # every grid point of the sample space to be predicted

    # 5. Train each model and plot its class probabilities
    for index, (name, classifier) in enumerate(classifiers.items()):
        # Train the model
        classifier.fit(x_train, y_train)
        # Predict on the test set
        y_pred = classifier.predict(x_test)
        accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
        print("Accuracy (test) for %s: %0.1f%% " % (name, accuracy * 100))

        # View probabilities:
        # Predict class probabilities for every point of the grid
probas = classifier.predict_proba(Xfull)
n_classes = np.unique(y_pred).size
for k in range(n_classes):
plt.subplot(n_classifiers, n_classes, index * n_classes + k + 1)
plt.title("{}".format(label_dict[k]))
if k == 0:
plt.ylabel(name)
imshow_handle = plt.imshow(
probas[:, k].reshape((100, 100)), extent=(3, 9, 1, 5), origin="lower"
)
plt.xticks(())
plt.yticks(())
            idx = y_pred == k  # boolean mask of test samples predicted as class k (check x_test[idx, 0].shape)
            if idx.any():
                # Scatter the test samples that were predicted as class k
plt.scatter(x_test[idx, 0], x_test[idx, 1], marker="o", c="w", edgecolor="k")
if k == 2:
plt.xlabel(feature_name[0])
plt.ylabel(feature_name[1])

ax = plt.axes([0.15, 0.04, 0.7, 0.05])
plt.title("Probability")
plt.colorbar(imshow_handle, cax=ax, orientation="horizontal")

plt.savefig('./prob.png')

if __name__ == "__main__":
main()
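For reference, the grid/probability-map pattern used above — flatten a meshgrid into an (n_points, 2) array for `predict_proba`, then reshape one class's probability column back to the grid for `imshow` — in a minimal self-contained sketch on synthetic data (variable names here are illustrative only):

```python
# Minimal sketch of the meshgrid -> predict_proba -> reshape pattern, on synthetic data.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
x = rng.randn(60, 2)
y = (x[:, 0] + x[:, 1] > 0).astype(int)
clf = LogisticRegression().fit(x, y)

# 100x100 grid over the feature ranges, flattened to (10000, 2) query points
xx, yy = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))
grid = np.c_[xx.ravel(), yy.ravel()]
probas = clf.predict_proba(grid)           # shape (10000, n_classes)
prob_map = probas[:, 1].reshape(xx.shape)  # back to (100, 100) for plt.imshow
print(prob_map.shape)
```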
5 changes: 3 additions & 2 deletions machine_learning/svm/svm.py
@@ -23,6 +23,7 @@ def main():
    # The three class labels of the iris dataset
label_dict = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
label_list = ['setosa', 'versicolor', 'virginica']
feature_name = [val for key, val in name_dict.items()]

    # 2. Separate features and labels
    x = data[:, :2]  # use only the first two features to keep the visualization simple
@@ -75,8 +76,8 @@ def main():
cmap=plt.cm.coolwarm,
alpha=0.8,
ax=ax,
xlabel=label_list[0],
ylabel=label_list[1],
xlabel=feature_name[0],
ylabel=feature_name[1],
)
ax.scatter(x_test[:, 0], x_test[:, 1], c=y_test, cmap=plt.cm.coolwarm, s=20, edgecolors="k")
ax.set_xticks(())
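The keyword arguments in the hunk above (`cmap`, `alpha`, `ax`, `xlabel`, `ylabel`) match `sklearn.inspection.DecisionBoundaryDisplay.from_estimator`; assuming that is the call being edited, the change swaps class names for feature names on the axes. A minimal sketch of the same pattern on synthetic data (names and labels here are placeholders, not the repository's):

```python
# Hedged sketch of the DecisionBoundaryDisplay pattern the svm.py diff appears to touch.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.svm import SVC

rng = np.random.RandomState(0)
x = rng.randn(100, 2)
y = (x[:, 0] > 0).astype(int)
clf = SVC(kernel="linear").fit(x, y)

fig, ax = plt.subplots()
DecisionBoundaryDisplay.from_estimator(
    clf, x,
    cmap=plt.cm.coolwarm, alpha=0.8, ax=ax,
    xlabel="feature 0",  # axis labels should name features, not classes
    ylabel="feature 1",
)
ax.scatter(x[:, 0], x[:, 1], c=y, cmap=plt.cm.coolwarm, s=20, edgecolors="k")
plt.savefig("decision_boundary_sketch.png")
```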
94 changes: 94 additions & 0 deletions utils/outlier_detection.py
@@ -0,0 +1,94 @@
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerPathCollection

def update_legend_marker_size(handle, orig):
"Customize size of the legend marker"
handle.update_from(orig)
handle.set_sizes([20])

# Outlier detection on synthetic data
def main():
    # 1. Construct the data
    np.random.seed(42)
    # Generate inlier samples
    X_inliers = 0.3 * np.random.randn(100, 2)
    X_inliers = np.r_[X_inliers + 2, X_inliers - 2]
    # Generate outlier samples
    X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
    # Concatenate them
    X = np.r_[X_inliers, X_outliers]

    # Ground-truth labels (1: inlier, -1: outlier)
    n_outliers = len(X_outliers)
    ground_truth = np.ones(len(X), dtype=int)
    ground_truth[-n_outliers:] = -1

    # 2. Build the outlier detection model
    clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
    # Predict which samples are outliers (1: inlier, -1: outlier)
    y_pred = clf.fit_predict(X)
    # Inspect the number and indices of the predicted outliers
    outlier_indices = np.where(y_pred == -1)
    num_pred_outlier = outlier_indices[0].size
    print("The model flags {} samples as outliers, at indices {}".format(num_pred_outlier, outlier_indices[0]))
    # Count how many predictions disagree with the ground truth
    n_errors = (y_pred != ground_truth).sum()
    # Outlier score of each sample (the more negative, the more abnormal)
    X_scores = clf.negative_outlier_factor_

    # 3. Visualize the ground truth and the predictions
    # 3.1 Inliers in red, outliers in blue (ground truth)
    plt.scatter(X[:, 0], X[:, 1], c=ground_truth, cmap=plt.cm.coolwarm, s=10, label="Data points")
    # plot circles with radius proportional to the outlier scores
    radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
    # Each point gets a circle whose radius reflects its outlier score
    # (the larger the circle, the more likely the point is an outlier)
scatter = plt.scatter(
X[:, 0],
X[:, 1],
s=1000 * radius,
edgecolors="r",
facecolors="none",
label="Outlier scores",
)
plt.axis("tight")
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.xlabel("prediction errors: %d" % (n_errors))
plt.legend(
handler_map={scatter: HandlerPathCollection(update_func=update_legend_marker_size)}
)
plt.title("Local Outlier Factor (LOF)")
plt.savefig('./outlier_detection.png')

    plt.clf()  # clear the figure

    # 3.2 Inliers in red, outliers in blue (predictions)
    plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap=plt.cm.coolwarm, s=10, label="Data points")
    # plot circles with radius proportional to the outlier scores
    radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
    # Each point gets a circle whose radius reflects its outlier score
    # (the larger the circle, the more likely the point is an outlier)
scatter = plt.scatter(
X[:, 0],
X[:, 1],
s=1000 * radius,
edgecolors="r",
facecolors="none",
label="Outlier scores",
)
plt.axis("tight")
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.xlabel("prediction errors: %d" % (n_errors))
plt.legend(
handler_map={scatter: HandlerPathCollection(update_func=update_legend_marker_size)}
)
plt.title("Local Outlier Factor (LOF)")
plt.savefig('./outlier_detection_pred.png')



if __name__ == "__main__":
main()
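A related pattern, noted here for reference only (it is not used by the script above): with `novelty=True`, `LocalOutlierFactor` can score points that were not part of the fitted data. A minimal sketch on synthetic data:

```python
# Minimal sketch: LOF in novelty mode scores data unseen at fit time.
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.RandomState(42)
X_train = 0.3 * rng.randn(200, 2)            # assumed to be mostly inliers
X_new = np.array([[0.1, 0.0], [4.0, 4.0]])   # one normal-looking point, one far-away point

lof = LocalOutlierFactor(n_neighbors=20, novelty=True)
lof.fit(X_train)
print(lof.predict(X_new))            # 1 for inlier, -1 for outlier
print(lof.decision_function(X_new))  # negative values indicate outliers
```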
37 changes: 37 additions & 0 deletions utils/outlier_detection_iris.py
@@ -0,0 +1,37 @@
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from read_csv_data import read_csv_data
from sklearn.model_selection import ParameterGrid

def main():
    # 1. Read the csv data
    name_dict, data = read_csv_data("dataset/iris.csv")
    # Use only the feature columns; the last column is the class label
    x = data[:, :-1]
    # 2. Build the outlier detection model; its parameters still need tuning
    clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
    # Predict which samples are outliers (1: inlier, -1: outlier)
    y_pred = clf.fit_predict(x)
    # Inspect the number and indices of the predicted outliers
    outlier_indices = np.where(y_pred == -1)
    num_pred_outlier = outlier_indices[0].size
    print("The model flags {} samples as outliers, at indices {}".format(num_pred_outlier, outlier_indices[0]))

    # TODO 3. Find a reasonable set of model parameters
    # LOF with the default novelty=False is unsupervised and exposes no predict/score
    # method, so GridSearchCV cannot be applied directly here. As a simple heuristic,
    # sweep the candidate parameters and report how many samples each setting flags,
    # together with the range of the LOF scores.
    # Define the parameter ranges
    param_grid = {'n_neighbors': [5, 10, 15, 20],
                  'contamination': [0.05, 0.1, 0.15, 0.2]}
    for params in ParameterGrid(param_grid):
        lof = LocalOutlierFactor(**params)
        pred = lof.fit_predict(x)
        n_flagged = int((pred == -1).sum())
        score_min = lof.negative_outlier_factor_.min()
        score_max = lof.negative_outlier_factor_.max()
        print("params={}, flagged outliers={}, LOF score range=[{:.3f}, {:.3f}]".format(
            params, n_flagged, score_min, score_max))

if __name__ == "__main__":
main()
