
Commit

zhu78244 committed Oct 5, 2023
2 parents b8597f5 + 86b0a07 commit fb5a8da
Showing 6 changed files with 335 additions and 3 deletions.
13 changes: 12 additions & 1 deletion README.md
@@ -22,7 +22,17 @@
![k_neighbors](https://github.com/jinxianwei/CloudImg/assets/81373517/4b25b680-c883-48e2-9846-357959fe7363)
- [x] SVM (classification task)
Visualization of the decision boundaries of different SVM classifiers on the test set
![svm](https://github.com/jinxianwei/CloudImg/assets/81373517/36703295-9af3-406e-b8bb-728c77852bb8)
![svm](https://github.com/jinxianwei/CloudImg/assets/81373517/2a154234-ba2a-45d8-88ef-0ea4bd59cabf)
- [x] LogisticRegression with PCA
Visualization of the first two principal components of the training and test features, colored by class
![x_train_pca](https://github.com/jinxianwei/CloudImg/assets/81373517/00878756-df1f-4e64-a04b-213371fda10b)
![x_test_pca](https://github.com/jinxianwei/CloudImg/assets/81373517/d14fa1de-e5bf-46f2-8707-91d86bb2be21)
- [x] Plot the predicted probabilities of different classifiers on the test set
![prob](https://github.com/jinxianwei/CloudImg/assets/81373517/b498966e-64c9-4c3f-88db-8ff114d29ec8)
- [x] Outlier detection
Visualization of the predicted outliers versus the ground-truth outliers
![outlier_detection_pred](https://github.com/jinxianwei/CloudImg/assets/81373517/0975ce3d-b0bc-41b3-ba28-9b9d7464fbe6)
![outlier_detection](https://github.com/jinxianwei/CloudImg/assets/81373517/09efcfd2-866f-4f9d-b0db-f6988a7855e1)

#### Deep learning
Relies on **Pytorch**, with the **Pytorch_Lightning** framework
@@ -44,6 +54,7 @@ python machine_learning/logistic_regression/train.py

# Adaboost (classification task)
python machine_learning/adaboost/adaboost_classifier.py
...

# Deep regression
python deep_learning/regression/train.py
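Note on the new scripts below: they all load the iris data through `utils.read_csv_data.read_csv_data`, which is not part of this diff. A minimal sketch of the interface those scripts appear to assume — a column-name dict plus a float array whose last column is a numeric class label — is given here for orientation; the actual helper in the repository may be implemented differently.

```python
# Hypothetical sketch of the read_csv_data contract assumed by the new scripts;
# the real utils/read_csv_data.py may differ.
import csv
import numpy as np

def read_csv_data(path):
    """Return ({column_index: column_name}, data array with the label in the last column)."""
    with open(path, newline="") as f:
        rows = list(csv.reader(f))
    header, body = rows[0], rows[1:]
    name_dict = {i: name for i, name in enumerate(header)}
    data = np.array(body, dtype=float)  # assumes the label column is already numeric
    return name_dict, data
```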
88 changes: 88 additions & 0 deletions machine_learning/pca/pca_cla.py
@@ -0,0 +1,88 @@
import os
import sys
current_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.insert(0, current_path)
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from utils.read_csv_data import read_csv_data

def main():
    # 1. Read the csv data
    name_dict, data = read_csv_data("dataset/iris.csv")
    # The three class labels of the iris dataset
    label_dict = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
    label_list = ['setosa', 'versicolor', 'virginica']

    # 2. Separate features and labels
    x = data[:, :-1]
    y = data[:, -1]

    # 3. Feature preprocessing
    # Before PCA the features must be standardized so that they all contribute on the same scale.
    # The scaler is fitted on the training split only (see step 4), to avoid leaking test-set statistics.

    # 4. Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.5, random_state=0)
    # Only the training set is available during training, so the test set must be
    # transformed with statistics fitted on the training set
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    # Apply PCA dimensionality reduction to the training and test sets
    k = 0.98  # fraction of variance to retain
    pca = PCA(n_components=k)
    x_train_pca = pca.fit_transform(x_train)  # fit on the training set and reduce it
    x_test_pca = pca.transform(x_test)        # reduce the test set with the fitted model
    print("Number of principal components: {}".format(pca.n_components_))
    # Meaning: reducing to this many dimensions retains 98% of the variance of the original features

    # 5. Build a logistic regression model on the PCA-reduced training set
model = LogisticRegression()
model.fit(x_train_pca, y_train)

    # 6. Classify the PCA-reduced test set and evaluate the model
y_pred = model.predict(x_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred, average='macro')
recall = recall_score(y_true=y_test, y_pred=y_pred, average='macro')
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
print(f"精确率为{accuracy}, 准确度为{precision}, 召回率为{recall}, F1分数为{f1}")

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

    # Visualize the classes in the first two principal components
plt.figure()
colors = ["navy", "turquoise", "darkorange"]
lw = 2
for color, i, target_name in zip(colors, [0, 1, 2], label_list):
plt.scatter(
x_train_pca[y_train == i, 0], x_train_pca[y_train == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
)
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("x_train_pca of IRIS dataset")
plt.savefig("./x_train_pca.png")

plt.figure()
colors = ["navy", "turquoise", "darkorange"]
lw = 2
for color, i, target_name in zip(colors, [0, 1, 2], label_list):
plt.scatter(
x_test_pca[y_test == i, 0], x_test_pca[y_test == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
)
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("x_test_pca of IRIS dataset")
plt.savefig("./x_test_pca.png")


if __name__ == "__main__":
main()
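As a side note on the `n_components=0.98` setting above: passing a float to `PCA` keeps the smallest number of components whose cumulative explained variance reaches that fraction. A minimal standalone sketch, using sklearn's built-in iris loader instead of the repository's CSV helper (an assumption made only to keep the example self-contained):

```python
# Minimal sketch: a fractional n_components keeps just enough components to
# reach that share of the total variance (here 98%).
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

x = StandardScaler().fit_transform(load_iris().data)
pca = PCA(n_components=0.98).fit(x)
print("components kept:", pca.n_components_)
print("per-component variance ratio:", np.round(pca.explained_variance_ratio_, 3))
print("cumulative variance retained:", round(float(pca.explained_variance_ratio_.sum()), 3))
```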
101 changes: 101 additions & 0 deletions machine_learning/plot_classification_probability/plt_probability.py
@@ -0,0 +1,101 @@
import os
import sys
current_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.insert(0, current_path)

import matplotlib.pyplot as plt
import numpy as np

from sklearn import datasets
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from utils.read_csv_data import read_csv_data
from sklearn.model_selection import train_test_split

def main():
    # 1. Read the csv data
    name_dict, data = read_csv_data("dataset/iris.csv")
    # The three class labels of the iris dataset
    label_dict = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
    label_list = ['setosa', 'versicolor', 'virginica']
    feature_name = [val for key, val in name_dict.items()]

    # 2. Separate features and labels
    x = data[:, :2]  # use only the first two features to keep the visualization simple
y = data[:, -1]

    # 3. Split into training and test sets
x_train, x_test, y_train, y_test= train_test_split(x, y, stratify=y, test_size=0.5, random_state=0)

n_features = x.shape[1]

    # 4. Create the models
    C = 10
    kernel = 1.0 * RBF([1.0, 1.0])  # for GPC
    # Build the different classifiers
classifiers = {
"L1 logistic": LogisticRegression(
C=C, penalty="l1", solver="saga", multi_class="multinomial", max_iter=10000
),
"L2 logistic (Multinomial)": LogisticRegression(
C=C, penalty="l2", solver="saga", multi_class="multinomial", max_iter=10000
),
"L2 logistic (OvR)": LogisticRegression(
C=C, penalty="l2", solver="saga", multi_class="ovr", max_iter=10000
),
"Linear SVC": SVC(kernel="linear", C=C, probability=True, random_state=0),
"GPC": GaussianProcessClassifier(kernel),
}

n_classifiers = len(classifiers)

plt.figure(figsize=(3 * 2, n_classifiers * 2))
plt.subplots_adjust(bottom=0.2, top=0.95)

    xx = np.linspace(3, 9, 100)    # roughly x[:, 0].min() ~ x[:, 0].max()
    yy = np.linspace(1, 5, 100).T  # roughly x[:, 1].min() ~ x[:, 1].max(); the grid must cover the feature ranges to visualize the sample space
    xx, yy = np.meshgrid(xx, yy)
    Xfull = np.c_[xx.ravel(), yy.ravel()]  # every grid point of the sample space to be predicted

    # 5. Train each model and plot its class probabilities
    for index, (name, classifier) in enumerate(classifiers.items()):
        # Train the model
        classifier.fit(x_train, y_train)
        # Predict on the test set
        y_pred = classifier.predict(x_test)
        accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
        print("Accuracy (test) for %s: %0.1f%% " % (name, accuracy * 100))

        # View probabilities:
        # Predict class probabilities for every point of the grid
probas = classifier.predict_proba(Xfull)
n_classes = np.unique(y_pred).size
for k in range(n_classes):
plt.subplot(n_classifiers, n_classes, index * n_classes + k + 1)
plt.title("{}".format(label_dict[k]))
if k == 0:
plt.ylabel(name)
imshow_handle = plt.imshow(
probas[:, k].reshape((100, 100)), extent=(3, 9, 1, 5), origin="lower"
)
plt.xticks(())
plt.yticks(())
            idx = y_pred == k  # boolean mask of test samples predicted as class k (check x_test[idx, 0].shape)
            if idx.any():
                # Scatter the test samples that were predicted as class k
plt.scatter(x_test[idx, 0], x_test[idx, 1], marker="o", c="w", edgecolor="k")
if k == 2:
plt.xlabel(feature_name[0])
plt.ylabel(feature_name[1])

ax = plt.axes([0.15, 0.04, 0.7, 0.05])
plt.title("Probability")
plt.colorbar(imshow_handle, cax=ax, orientation="horizontal")

plt.savefig('./prob.png')

if __name__ == "__main__":
main()
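For reference, the grid/probability-map pattern used above — flatten a meshgrid into an (n_points, 2) array for `predict_proba`, then reshape one class's probability column back to the grid for `imshow` — in a minimal self-contained sketch on synthetic data (variable names here are illustrative only):

```python
# Minimal sketch of the meshgrid -> predict_proba -> reshape pattern, on synthetic data.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
x = rng.randn(60, 2)
y = (x[:, 0] + x[:, 1] > 0).astype(int)
clf = LogisticRegression().fit(x, y)

# 100x100 grid over the feature ranges, flattened to (10000, 2) query points
xx, yy = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))
grid = np.c_[xx.ravel(), yy.ravel()]
probas = clf.predict_proba(grid)           # shape (10000, n_classes)
prob_map = probas[:, 1].reshape(xx.shape)  # back to (100, 100) for plt.imshow
print(prob_map.shape)
```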
5 changes: 3 additions & 2 deletions machine_learning/svm/svm.py
@@ -23,6 +23,7 @@ def main():
    # The three class labels of the iris dataset
label_dict = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
label_list = ['setosa', 'versicolor', 'virginica']
feature_name = [val for key, val in name_dict.items()]

    # 2. Separate features and labels
    x = data[:, :2]  # use only the first two features to keep the visualization simple
@@ -75,8 +76,8 @@ def main():
cmap=plt.cm.coolwarm,
alpha=0.8,
ax=ax,
xlabel=label_list[0],
ylabel=label_list[1],
xlabel=feature_name[0],
ylabel=feature_name[1],
)
ax.scatter(x_test[:, 0], x_test[:, 1], c=y_test, cmap=plt.cm.coolwarm, s=20, edgecolors="k")
ax.set_xticks(())
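The keyword arguments in the hunk above (`cmap`, `alpha`, `ax`, `xlabel`, `ylabel`) match `sklearn.inspection.DecisionBoundaryDisplay.from_estimator`; assuming that is the call being edited, the change swaps class names for feature names on the axes. A minimal sketch of the same pattern on synthetic data (names and labels here are placeholders, not the repository's):

```python
# Hedged sketch of the DecisionBoundaryDisplay pattern the svm.py diff appears to touch.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.svm import SVC

rng = np.random.RandomState(0)
x = rng.randn(100, 2)
y = (x[:, 0] > 0).astype(int)
clf = SVC(kernel="linear").fit(x, y)

fig, ax = plt.subplots()
DecisionBoundaryDisplay.from_estimator(
    clf, x,
    cmap=plt.cm.coolwarm, alpha=0.8, ax=ax,
    xlabel="feature 0",  # axis labels should name features, not classes
    ylabel="feature 1",
)
ax.scatter(x[:, 0], x[:, 1], c=y, cmap=plt.cm.coolwarm, s=20, edgecolors="k")
plt.savefig("decision_boundary_sketch.png")
```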
94 changes: 94 additions & 0 deletions utils/outlier_detection.py
@@ -0,0 +1,94 @@
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerPathCollection

def update_legend_marker_size(handle, orig):
"Customize size of the legend marker"
handle.update_from(orig)
handle.set_sizes([20])

# Outlier detection on synthetic data
def main():
    # 1. Construct the data
    np.random.seed(42)
    # Generate inlier samples
    X_inliers = 0.3 * np.random.randn(100, 2)
    X_inliers = np.r_[X_inliers + 2, X_inliers - 2]
    # Generate outlier samples
    X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
    # Concatenate them
    X = np.r_[X_inliers, X_outliers]

    # Ground-truth labels (1: inlier, -1: outlier)
    n_outliers = len(X_outliers)
    ground_truth = np.ones(len(X), dtype=int)
    ground_truth[-n_outliers:] = -1

    # 2. Build the outlier detection model
    clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
    # Predict which samples are outliers (1: inlier, -1: outlier)
    y_pred = clf.fit_predict(X)
    # Inspect the number and indices of the predicted outliers
    outlier_indices = np.where(y_pred == -1)
    num_pred_outlier = outlier_indices[0].size
    print("The model flags {} samples as outliers, at indices {}".format(num_pred_outlier, outlier_indices[0]))
    # Count how many predictions disagree with the ground truth
    n_errors = (y_pred != ground_truth).sum()
    # Outlier score of each sample (the more negative, the more abnormal)
    X_scores = clf.negative_outlier_factor_

    # 3. Visualize the ground truth and the predictions
    # 3.1 Inliers in red, outliers in blue (ground truth)
    plt.scatter(X[:, 0], X[:, 1], c=ground_truth, cmap=plt.cm.coolwarm, s=10, label="Data points")
    # plot circles with radius proportional to the outlier scores
    radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
    # Each point gets a circle whose radius reflects its outlier score
    # (the larger the circle, the more likely the point is an outlier)
scatter = plt.scatter(
X[:, 0],
X[:, 1],
s=1000 * radius,
edgecolors="r",
facecolors="none",
label="Outlier scores",
)
plt.axis("tight")
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.xlabel("prediction errors: %d" % (n_errors))
plt.legend(
handler_map={scatter: HandlerPathCollection(update_func=update_legend_marker_size)}
)
plt.title("Local Outlier Factor (LOF)")
plt.savefig('./outlier_detection.png')

    plt.clf()  # clear the figure

    # 3.2 Inliers in red, outliers in blue (predictions)
    plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap=plt.cm.coolwarm, s=10, label="Data points")
    # plot circles with radius proportional to the outlier scores
    radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
    # Each point gets a circle whose radius reflects its outlier score
    # (the larger the circle, the more likely the point is an outlier)
scatter = plt.scatter(
X[:, 0],
X[:, 1],
s=1000 * radius,
edgecolors="r",
facecolors="none",
label="Outlier scores",
)
plt.axis("tight")
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.xlabel("prediction errors: %d" % (n_errors))
plt.legend(
handler_map={scatter: HandlerPathCollection(update_func=update_legend_marker_size)}
)
plt.title("Local Outlier Factor (LOF)")
plt.savefig('./outlier_detection_pred.png')



if __name__ == "__main__":
main()
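A related pattern, noted here for reference only (it is not used by the script above): with `novelty=True`, `LocalOutlierFactor` can score points that were not part of the fitted data. A minimal sketch on synthetic data:

```python
# Minimal sketch: LOF in novelty mode scores data unseen at fit time.
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.RandomState(42)
X_train = 0.3 * rng.randn(200, 2)            # assumed to be mostly inliers
X_new = np.array([[0.1, 0.0], [4.0, 4.0]])   # one normal-looking point, one far-away point

lof = LocalOutlierFactor(n_neighbors=20, novelty=True)
lof.fit(X_train)
print(lof.predict(X_new))            # 1 for inlier, -1 for outlier
print(lof.decision_function(X_new))  # negative values indicate outliers
```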
37 changes: 37 additions & 0 deletions utils/outlier_detection_iris.py
@@ -0,0 +1,37 @@
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from read_csv_data import read_csv_data
from sklearn.model_selection import ParameterGrid

def main():
    # 1. Read the csv data
    name_dict, data = read_csv_data("dataset/iris.csv")
    # Use only the feature columns; the last column is the class label
    x = data[:, :-1]
    # 2. Build the outlier detection model; its parameters still need tuning
    clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
    # Predict which samples are outliers (1: inlier, -1: outlier)
    y_pred = clf.fit_predict(x)
    # Inspect the number and indices of the predicted outliers
    outlier_indices = np.where(y_pred == -1)
    num_pred_outlier = outlier_indices[0].size
    print("The model flags {} samples as outliers, at indices {}".format(num_pred_outlier, outlier_indices[0]))

    # TODO 3. Find a reasonable set of model parameters
    # LOF with the default novelty=False is unsupervised and exposes no predict/score
    # method, so GridSearchCV cannot be applied directly here. As a simple heuristic,
    # sweep the candidate parameters and report how many samples each setting flags,
    # together with the range of the LOF scores.
    # Define the parameter ranges
    param_grid = {'n_neighbors': [5, 10, 15, 20],
                  'contamination': [0.05, 0.1, 0.15, 0.2]}
    for params in ParameterGrid(param_grid):
        lof = LocalOutlierFactor(**params)
        pred = lof.fit_predict(x)
        n_flagged = int((pred == -1).sum())
        score_min = lof.negative_outlier_factor_.min()
        score_max = lof.negative_outlier_factor_.max()
        print("params={}, flagged outliers={}, LOF score range=[{:.3f}, {:.3f}]".format(
            params, n_flagged, score_min, score_max))

if __name__ == "__main__":
main()
