Commit
Merge branch 'main' of https://github.com/jinxianwei/machine2deeplear…
Showing 6 changed files with 335 additions and 3 deletions.
@@ -0,0 +1,88 @@
import os
import sys
current_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.insert(0, current_path)

import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from utils.read_csv_data import read_csv_data


def main():
    # 1. Read the csv data
    name_dict, data = read_csv_data("dataset/iris.csv")
    # The three class labels of the iris dataset
    label_dict = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
    label_list = ['setosa', 'versicolor', 'virginica']

    # 2. Separate features and labels
    x = data[:, :-1]
    y = data[:, -1]

    # 3. Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.5, random_state=0)

    # 4. Standardize the features
    # Before PCA the features must be standardized so that they are all on the same scale.
    # During training only the training set is available, so the scaler is fit on the
    # training set and the test set is transformed with that same fit.
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    # Reduce the dimensionality of the training and test sets with PCA
    k = 0.98  # fraction of the variance to retain
    pca = PCA(n_components=k)
    x_train_pca = pca.fit_transform(x_train)  # fit on the training set and reduce it
    x_test_pca = pca.transform(x_test)  # reduce the test set with the same fit
    print("Number of principal components: {}".format(pca.n_components_))
    # Interpretation: after reducing to this many dimensions, 98% of the
    # information (variance) in the original features is retained.

    # 5. Fit a logistic regression model on the reduced training set
    model = LogisticRegression()
    model.fit(x_train_pca, y_train)

    # 6. Classify the reduced test set and evaluate the model
    y_pred = model.predict(x_test_pca)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    precision = precision_score(y_true=y_test, y_pred=y_pred, average='macro')
    recall = recall_score(y_true=y_test, y_pred=y_pred, average='macro')
    f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
    print(f"Accuracy: {accuracy}, precision: {precision}, recall: {recall}, F1 score: {f1}")

    report = classification_report(y_true=y_test, y_pred=y_pred)
    print(report)

    # Visualize the classes on the first two principal components (training set)
    plt.figure()
    colors = ["navy", "turquoise", "darkorange"]
    lw = 2
    for color, i, target_name in zip(colors, [0, 1, 2], label_list):
        plt.scatter(
            x_train_pca[y_train == i, 0], x_train_pca[y_train == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
        )
    plt.legend(loc="best", shadow=False, scatterpoints=1)
    plt.title("x_train_pca of IRIS dataset")
    plt.savefig("./x_train_pca.png")

    # Same visualization for the test set
    plt.figure()
    for color, i, target_name in zip(colors, [0, 1, 2], label_list):
        plt.scatter(
            x_test_pca[y_test == i, 0], x_test_pca[y_test == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
        )
    plt.legend(loc="best", shadow=False, scatterpoints=1)
    plt.title("x_test_pca of IRIS dataset")
    plt.savefig("./x_test_pca.png")


if __name__ == "__main__":
    main()
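The scripts in this commit depend on utils.read_csv_data.read_csv_data, which is not part of this diff. For reference, a minimal sketch of what such a reader might look like, assuming iris.csv has a header row, numeric feature columns, and a numeric class label in the last column (the exact file layout is an assumption, not something this commit shows):

import csv
import numpy as np

def read_csv_data(csv_path):
    # Hypothetical reader: returns a {column_index: column_name} dict built
    # from the header row, plus all remaining rows as a float ndarray.
    with open(csv_path, newline="") as f:
        rows = list(csv.reader(f))
    header, body = rows[0], rows[1:]
    name_dict = {i: name for i, name in enumerate(header)}
    data = np.array(body, dtype=float)
    return name_dict, data

This signature matches how the scripts use it: name_dict, data = read_csv_data("dataset/iris.csv"), with features in data[:, :-1] and labels in data[:, -1].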
machine_learning/plot_classification_probability/plt_probability.py (101 additions, 0 deletions)
@@ -0,0 +1,101 @@
import os
import sys
current_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.insert(0, current_path)

import matplotlib.pyplot as plt
import numpy as np

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from utils.read_csv_data import read_csv_data


def main():
    # 1. Read the csv data
    name_dict, data = read_csv_data("dataset/iris.csv")
    # The three class labels of the iris dataset
    label_dict = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
    label_list = ['setosa', 'versicolor', 'virginica']
    feature_name = [val for key, val in name_dict.items()]

    # 2. Separate features and labels
    x = data[:, :2]  # use only the first two features, to keep the result easy to visualize
    y = data[:, -1]

    # 3. Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.5, random_state=0)

    n_features = x.shape[1]

    # 4. Create the models
    C = 10
    kernel = 1.0 * RBF([1.0, 1.0])  # for GPC
    # Classifiers of several different kinds
    classifiers = {
        "L1 logistic": LogisticRegression(
            C=C, penalty="l1", solver="saga", multi_class="multinomial", max_iter=10000
        ),
        "L2 logistic (Multinomial)": LogisticRegression(
            C=C, penalty="l2", solver="saga", multi_class="multinomial", max_iter=10000
        ),
        "L2 logistic (OvR)": LogisticRegression(
            C=C, penalty="l2", solver="saga", multi_class="ovr", max_iter=10000
        ),
        "Linear SVC": SVC(kernel="linear", C=C, probability=True, random_state=0),
        "GPC": GaussianProcessClassifier(kernel),
    }

    n_classifiers = len(classifiers)

    plt.figure(figsize=(3 * 2, n_classifiers * 2))
    plt.subplots_adjust(bottom=0.2, top=0.95)

    # To visualize the sample space, the grid has to cover the min~max range of each feature
    xx = np.linspace(3, 9, 100)  # roughly x[:, 0].min() ~ x[:, 0].max()
    yy = np.linspace(1, 5, 100).T  # roughly x[:, 1].min() ~ x[:, 1].max()
    xx, yy = np.meshgrid(xx, yy)
    Xfull = np.c_[xx.ravel(), yy.ravel()]  # the cross product gives every grid point to be predicted

    # 5. Train each model
    for index, (name, classifier) in enumerate(classifiers.items()):
        # Train the model
        classifier.fit(x_train, y_train)
        # Predict on the test set
        y_pred = classifier.predict(x_test)
        accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
        print("Accuracy (test) for %s: %0.1f%% " % (name, accuracy * 100))

        # View probabilities:
        # Predict class probabilities for every grid point in the sample space
        probas = classifier.predict_proba(Xfull)
        n_classes = np.unique(y).size  # use the true label set, in case some class is never predicted
        for k in range(n_classes):
            plt.subplot(n_classifiers, n_classes, index * n_classes + k + 1)
            plt.title("{}".format(label_dict[k]))
            if k == 0:
                plt.ylabel(name)
            imshow_handle = plt.imshow(
                probas[:, k].reshape((100, 100)), extent=(3, 9, 1, 5), origin="lower"
            )
            plt.xticks(())
            plt.yticks(())
            idx = y_pred == k  # boolean mask of the test samples predicted as class k (check x_test[idx, 0].shape)
            if idx.any():
                # Scatter the test samples predicted as class k
                plt.scatter(x_test[idx, 0], x_test[idx, 1], marker="o", c="w", edgecolor="k")
            if k == 2:
                plt.xlabel(feature_name[0])
                plt.ylabel(feature_name[1])

    ax = plt.axes([0.15, 0.04, 0.7, 0.05])
    plt.title("Probability")
    plt.colorbar(imshow_handle, cax=ax, orientation="horizontal")

    plt.savefig('./prob.png')


if __name__ == "__main__":
    main()
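The grid-prediction idiom used above (meshgrid, ravel, predict_proba, reshape) is the standard way to raster a classifier's probability field. A minimal self-contained sketch of the same idiom on synthetic data (make_blobs and the grid resolution here are illustrative assumptions, not part of the commit):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression

X, y = make_blobs(n_samples=200, centers=2, random_state=0)
clf = LogisticRegression().fit(X, y)

# Build a 100x100 grid spanning the data, flatten it to an (n, 2) array,
# predict, then fold the probabilities back into the grid shape for imshow.
xx, yy = np.meshgrid(np.linspace(X[:, 0].min(), X[:, 0].max(), 100),
                     np.linspace(X[:, 1].min(), X[:, 1].max(), 100))
grid = np.c_[xx.ravel(), yy.ravel()]
proba = clf.predict_proba(grid)[:, 1].reshape(xx.shape)
plt.imshow(proba, extent=(xx.min(), xx.max(), yy.min(), yy.max()), origin="lower")
plt.colorbar()
plt.savefig("./grid_proba.png")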
@@ -0,0 +1,94 @@
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerPathCollection


def update_legend_marker_size(handle, orig):
    """Customize the size of the legend marker."""
    handle.update_from(orig)
    handle.set_sizes([20])


# Outlier detection on synthetic data
def main():
    # 1. Construct the data
    np.random.seed(42)
    # Inliers: two Gaussian clusters
    X_inliers = 0.3 * np.random.randn(100, 2)
    X_inliers = np.r_[X_inliers + 2, X_inliers - 2]
    # Outliers: uniform noise
    X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
    # Concatenate the two
    X = np.r_[X_inliers, X_outliers]

    # Ground-truth labels (1: inlier, -1: outlier)
    n_outliers = len(X_outliers)
    ground_truth = np.ones(len(X), dtype=int)
    ground_truth[-n_outliers:] = -1

    # 2. Build the outlier detection model
    clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
    # Predict which points are outliers (1: inlier, -1: outlier)
    y_pred = clf.fit_predict(X)
    # Inspect the indices and the number of predicted outliers
    outlier_indices = np.where(y_pred == -1)
    num_pred_outlier = outlier_indices[0].shape
    print("The model flags {} points as outliers, at indices {}".format(num_pred_outlier, outlier_indices))
    # Count the prediction errors
    n_errors = (y_pred != ground_truth).sum()
    # Outlier score of every point (the lower, the more abnormal)
    X_scores = clf.negative_outlier_factor_

    # 3. Visualize the ground truth and the predictions
    # 3.1 Inliers in red, outliers in blue (ground truth)
    plt.scatter(X[:, 0], X[:, 1], c=ground_truth, cmap=plt.cm.coolwarm, s=10, label="Data points")
    # Plot circles with radius proportional to the outlier scores
    radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
    # Draw each point's outlier score as a circle around it (the larger the
    # radius, the more likely the point is an outlier)
    scatter = plt.scatter(
        X[:, 0],
        X[:, 1],
        s=1000 * radius,
        edgecolors="r",
        facecolors="none",
        label="Outlier scores",
    )
    plt.axis("tight")
    plt.xlim((-5, 5))
    plt.ylim((-5, 5))
    plt.xlabel("prediction errors: %d" % (n_errors))
    plt.legend(
        handler_map={scatter: HandlerPathCollection(update_func=update_legend_marker_size)}
    )
    plt.title("Local Outlier Factor (LOF)")
    plt.savefig('./outlier_detection.png')

    plt.clf()  # clear the figure

    # 3.2 Inliers in red, outliers in blue (model prediction y_pred)
    plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap=plt.cm.coolwarm, s=10, label="Data points")
    # Same outlier-score circles as above
    scatter = plt.scatter(
        X[:, 0],
        X[:, 1],
        s=1000 * radius,
        edgecolors="r",
        facecolors="none",
        label="Outlier scores",
    )
    plt.axis("tight")
    plt.xlim((-5, 5))
    plt.ylim((-5, 5))
    plt.xlabel("prediction errors: %d" % (n_errors))
    plt.legend(
        handler_map={scatter: HandlerPathCollection(update_func=update_legend_marker_size)}
    )
    plt.title("Local Outlier Factor (LOF)")
    plt.savefig('./outlier_detection_pred.png')


if __name__ == "__main__":
    main()
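As used above, LocalOutlierFactor only scores the data it was fit on (via fit_predict); it has no predict method for unseen points unless it is constructed with novelty=True. A short sketch of that mode (the particular train/new-point split is illustrative, not from this commit):

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.RandomState(42)
X_train = 0.3 * rng.randn(100, 2)  # inlier-like data used to fit the model
X_new = np.array([[0.0, 0.0], [4.0, 4.0]])  # one near the cluster, one far away

# novelty=True enables predict/score_samples on data not seen during fit
lof = LocalOutlierFactor(n_neighbors=20, novelty=True).fit(X_train)
print(lof.predict(X_new))        # 1 = inlier, -1 = outlier
print(lof.score_samples(X_new))  # lower scores mean more abnormal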
@@ -0,0 +1,37 @@
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from read_csv_data import read_csv_data


def main():
    # 1. Read the csv data
    name_dict, data = read_csv_data("dataset/iris.csv")
    # 2. Build the outlier detection model; its parameters need some tuning
    clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
    # Predict which points are outliers (1: inlier, -1: outlier)
    y_pred = clf.fit_predict(data)
    # Inspect the indices and the number of predicted outliers
    outlier_indices = np.where(y_pred == -1)
    num_pred_outlier = outlier_indices[0].shape
    print("The model flags {} points as outliers, at indices {}".format(num_pred_outlier, outlier_indices))

    # TODO 3. Find a reasonably good set of model parameters
    # GridSearchCV cannot be used directly here: with the default novelty=False,
    # LocalOutlierFactor exposes only fit_predict, and with no labels there is
    # nothing for a supervised scorer such as 'neg_mean_squared_error' to score.
    # A simple alternative is a manual sweep. Note that contamination fixes the
    # fraction of points flagged, so varying it mainly moves the threshold,
    # while n_neighbors changes which points are considered outliers.
    param_grid = {'n_neighbors': [5, 10, 15, 20],
                  'contamination': [0.05, 0.1, 0.15, 0.2]}
    for n_neighbors in param_grid['n_neighbors']:
        for contamination in param_grid['contamination']:
            lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
            pred = lof.fit_predict(data)
            flagged = np.where(pred == -1)[0]
            print("n_neighbors={}, contamination={}: {} outliers flagged".format(
                n_neighbors, contamination, flagged.size))


if __name__ == "__main__":
    main()
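When ground-truth labels are available, as in the synthetic LOF example in the previous file, the same parameter sweep can be scored directly rather than just counted. A hedged sketch reusing that file's data construction:

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

np.random.seed(42)
X_inliers = 0.3 * np.random.randn(100, 2)
X = np.r_[X_inliers + 2, X_inliers - 2, np.random.uniform(-4, 4, (20, 2))]
ground_truth = np.ones(len(X), dtype=int)
ground_truth[-20:] = -1  # the last 20 points are the injected outliers

best = None
for n_neighbors in [5, 10, 15, 20]:
    # Score each setting by how many points it mislabels against ground truth
    pred = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=0.1).fit_predict(X)
    errors = (pred != ground_truth).sum()
    print("n_neighbors={}: {} errors".format(n_neighbors, errors))
    if best is None or errors < best[1]:
        best = (n_neighbors, errors)
print("best n_neighbors:", best[0])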