From 3ed3ea71f302c5f9f570e5bae9bdac74f29b9395 Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 28 Dec 2023 19:26:58 +0800 Subject: [PATCH] perf:Improved the common function for clustering --- .../data_mining/model/clustering.py | 225 ++++++-------- .../model/func/algo_clustering/_common.py | 291 ++++++++++++++++-- .../model/func/algo_clustering/_dbscan.py | 94 ------ .../model/func/algo_clustering/_kmeans.py | 161 ---------- 4 files changed, 352 insertions(+), 419 deletions(-) diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py index 93bdf85b..8aeb3cd0 100644 --- a/geochemistrypi/data_mining/model/clustering.py +++ b/geochemistrypi/data_mining/model/clustering.py @@ -12,9 +12,9 @@ from ..constants import MLFLOW_ARTIFACT_DATA_PATH, MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH from ..utils.base import clear_output, save_data, save_fig, save_text from ._base import WorkflowBase -from .func.algo_clustering._common import plot_results, plot_silhouette_diagram, score -from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters, dbscan_result_plot -from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters, plot_silhouette_diagram_kmeans, scatter2d, scatter3d +from .func.algo_clustering._common import plot_silhouette_diagram, plot_silhouette_value_diagram, scatter2d, scatter3d, score +from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters +from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters class ClusteringWorkflowBase(WorkflowBase): @@ -64,26 +64,44 @@ def _score(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, store_ mlflow.log_metrics(scores) @staticmethod - def _plot_results(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, cluster_centers_: pd.DataFrame, local_path: str, mlflow_path: str) -> None: - """Plot the cluster_results .""" - print("-----* results diagram *-----") - plot_results(data, labels, algorithm_name, cluster_centers_) - save_fig(f"results - {algorithm_name}", local_path, mlflow_path) - data = pd.concat([data, labels], axis=1) - save_data(data, f"results - {algorithm_name}", local_path, mlflow_path) + def _scatter2d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + """Plot the two-dimensional diagram of the clustering result.""" + print("-----* Cluster Two-Dimensional Diagram *-----") + scatter2d(data, labels, algorithm_name) + save_fig(f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) + data_with_labels = pd.concat([data, labels], axis=1) + save_data(data_with_labels, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) + + @staticmethod + def _scatter3d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + """Plot the three-dimensional diagram of the clustering result.""" + print("-----* Cluster Three-Dimensional Diagram *-----") + scatter3d(data, labels, algorithm_name) + save_fig(f"Cluster Three-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) + data_with_labels = pd.concat([data, labels], axis=1) + save_data(data_with_labels, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) @staticmethod - def _plot_silhouette_diagram(data: pd.DataFrame, labels: pd.DataFrame, cluster_centers_: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_silhouette_diagram(data: pd.DataFrame, labels: pd.DataFrame, model: object, cluster_centers_: np.ndarray, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the silhouette diagram of the clustering result.""" print("-----* Silhouette Diagram *-----") - plot_silhouette_diagram(data, labels, algorithm_name) + plot_silhouette_diagram(data, labels, cluster_centers_, model, algorithm_name) save_fig(f"Silhouette Diagram - {algorithm_name}", local_path, mlflow_path) data_with_labels = pd.concat([data, labels], axis=1) save_data(data_with_labels, "Silhouette Diagram - Data With Labels", local_path, mlflow_path) - if isinstance(cluster_centers_, pd.DataFrame): + if not isinstance(cluster_centers_, str): cluster_center_data = pd.DataFrame(cluster_centers_, columns=data.columns) save_data(cluster_center_data, "Silhouette Diagram - Cluster Centers", local_path, mlflow_path) + @staticmethod + def _plot_silhouette_value_diagram(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + """Plot the silhouette value diagram of the clustering result.""" + print("-----* Silhouette value Diagram *-----") + plot_silhouette_value_diagram(data, labels, algorithm_name) + save_fig(f"Silhouette value Diagram - {algorithm_name}", local_path, mlflow_path) + data_with_labels = pd.concat([data, labels], axis=1) + save_data(data_with_labels, "Silhouette value Diagram - Data With Labels", local_path, mlflow_path) + def common_components(self) -> None: """Invoke all common application functions for clustering algorithms.""" GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH") @@ -94,18 +112,68 @@ def common_components(self) -> None: algorithm_name=self.naming, store_path=GEOPI_OUTPUT_METRICS_PATH, ) - # self._plot_results( - # data=self.X, - # labels=self.clustering_result["clustering result"], - # cluster_centers_=self.get_cluster_centers(), - # algorithm_name=self.naming, - # local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - # mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - # ) + if self.X.shape[1] >= 3: + # choose two of dimensions to draw + two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(self.X, 2) + self._scatter2d( + data=two_dimen_data, + labels=self.clustering_result["clustering result"], + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + ) + + # choose three of dimensions to draw + three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(self.X, 3) + self._scatter3d( + data=three_dimen_data, + labels=self.clustering_result["clustering result"], + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + ) + elif self.X.shape[1] == 3: + # choose two of dimensions to draw + two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(self.X, 2) + self._scatter2d( + data=two_dimen_data, + labels=self.clustering_result["clustering result"], + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + ) + + # no need to choose + self._scatter3d( + data=self.X, + labels=self.clustering_result["clustering result"], + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + ) + elif self.X.shape[1] == 2: + self._scatter2d( + data=self.X, + labels=self.clustering_result["clustering result"], + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + ) + else: + pass + self._plot_silhouette_diagram( data=self.X, labels=self.clustering_result["clustering result"], cluster_centers_=self.get_cluster_centers(), + model=self.model, + algorithm_name=self.naming, + local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, + mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, + ) + self._plot_silhouette_value_diagram( + data=self.X, + labels=self.clustering_result["clustering result"], algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -244,111 +312,13 @@ def manual_hyper_parameters(cls) -> Dict: clear_output() return hyper_parameters - @staticmethod - def _plot_silhouette_diagram_kmeans( - data: pd.DataFrame, - cluster_labels: pd.DataFrame, - cluster_centers_: np.ndarray, - n_clusters: int, - algorithm_name: str, - local_path: str, - mlflow_path: str, - ) -> None: - """Plot the silhouette diagram of the clustering result.""" - print("-----* KMeans's Silhouette Diagram *-----") - plot_silhouette_diagram_kmeans(data, cluster_labels, cluster_centers_, n_clusters, algorithm_name) - save_fig(f"KMeans's Silhouette Diagram - {algorithm_name}", local_path, mlflow_path) - data_with_labels = pd.concat([data, cluster_labels], axis=1) - save_data(data_with_labels, "KMeans's Silhouette Diagram - Data With Labels", local_path, mlflow_path) - cluster_center_data = pd.DataFrame(cluster_centers_, columns=data.columns) - save_data(cluster_center_data, "KMeans's Silhouette Diagram - Cluster Centers", local_path, mlflow_path) - - @staticmethod - def _scatter2d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: - """Plot the two-dimensional diagram of the clustering result.""" - print("-----* Cluster Two-Dimensional Diagram *-----") - scatter2d(data, cluster_labels, algorithm_name) - save_fig(f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) - data_with_labels = pd.concat([data, cluster_labels], axis=1) - save_data(data_with_labels, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) - - @staticmethod - def _scatter3d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: - """Plot the three-dimensional diagram of the clustering result.""" - print("-----* Cluster Three-Dimensional Diagram *-----") - scatter3d(data, cluster_labels, algorithm_name) - save_fig(f"Cluster Three-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) - data_with_labels = pd.concat([data, cluster_labels], axis=1) - save_data(data_with_labels, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) - def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None: """Invoke all special application functions for this algorithms by Scikit-learn framework.""" GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH") - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._get_inertia_scores( algorithm_name=self.naming, store_path=GEOPI_OUTPUT_METRICS_PATH, ) - self._plot_silhouette_diagram_kmeans( - data=self.X, - cluster_labels=self.clustering_result["clustering result"], - cluster_centers_=self.get_cluster_centers(), - n_clusters=self.n_clusters, - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - - # Draw graphs when the number of principal components > 3 - if self.X.shape[1] >= 3: - # choose two of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(self.X, 2) - self._scatter2d( - data=two_dimen_data, - cluster_labels=self.clustering_result["clustering result"], - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - - # choose three of dimensions to draw - three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(self.X, 3) - self._scatter3d( - data=three_dimen_data, - cluster_labels=self.clustering_result["clustering result"], - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif self.X.shape[1] == 3: - # choose two of dimensions to draw - two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(self.X, 2) - self._scatter2d( - data=two_dimen_data, - cluster_labels=self.clustering_result["clustering result"], - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - - # no need to choose - self._scatter3d( - data=self.X, - cluster_labels=self.clustering_result["clustering result"], - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - elif self.X.shape[1] == 2: - self._scatter2d( - data=self.X, - cluster_labels=self.clustering_result["clustering result"], - algorithm_name=self.naming, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) - else: - pass class DBSCANClustering(ClusteringWorkflowBase): @@ -440,25 +410,8 @@ def manual_hyper_parameters(cls) -> Dict: clear_output() return hyper_parameters - @staticmethod - def _clustering_result_plot(X: pd.DataFrame, trained_model: any, algorithm_name: str, imag_config: dict, local_path: str, mlflow_path: str) -> None: - """Plot the clustering result in 2D graph.""" - print("-------** Cluster Two-Dimensional Diagram **----------") - dbscan_result_plot(X, trained_model, imag_config, algorithm_name) - save_fig(f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) - save_data(X, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) - def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None: """Invoke all special application functions for this algorithms by Scikit-learn framework.""" - GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") - self._clustering_result_plot( - X=self.X, - trained_model=self.model, - algorithm_name=self.naming, - imag_config=self.image_config, - local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, - mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, - ) class AffinityPropagationClustering(ClusteringWorkflowBase): diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_common.py b/geochemistrypi/data_mining/model/func/algo_clustering/_common.py index 9a58d65f..855d6a3f 100644 --- a/geochemistrypi/data_mining/model/func/algo_clustering/_common.py +++ b/geochemistrypi/data_mining/model/func/algo_clustering/_common.py @@ -1,31 +1,34 @@ # -*- coding: utf-8 -*- +from itertools import cycle from typing import Dict +import matplotlib.cm as cm import matplotlib.pyplot as plt +import numpy as np import pandas as pd import seaborn as sns from rich import print from sklearn.metrics import calinski_harabasz_score, silhouette_samples, silhouette_score -def score(X: pd.DataFrame, labels: pd.DataFrame) -> Dict: +def score(data: pd.DataFrame, labels: pd.DataFrame) -> Dict: """Calculate the scores of the clustering model. Parameters ---------- - X : pd.DataFrame (n_samples, n_components) + data : pd.DataFrame (n_samples, n_components) The true values. - label : pd.DataFrame (n_samples, n_components) - The labels values. + labels : pd.DataFrame (n_samples, n_components) + Labels of each point. Returns ------- scores : dict The scores of the clustering model. """ - silhouette = silhouette_score(X, labels) - calinski_harabaz = calinski_harabasz_score(X, labels) + silhouette = silhouette_score(data, labels) + calinski_harabaz = calinski_harabasz_score(data, labels) print("silhouette_score: ", silhouette) print("calinski_harabasz_score:", calinski_harabaz) scores = { @@ -35,49 +38,281 @@ def score(X: pd.DataFrame, labels: pd.DataFrame) -> Dict: return scores -def plot_results(X, labels, algorithm_name: str, cluster_centers_=None) -> None: - """Plot clustering results of the clustering model. +def scatter2d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str) -> None: + """ + Draw the result-2D diagram for analysis. Parameters ---------- - X : pd.DataFrame (n_samples, n_components) - The true values. + data : pd.DataFrame (n_samples, n_components) + The true values. - label : pd.DataFrame (n_samples, n_components) - The labels values. + labels : pd.DataFrame (n_samples,) + Labels of each point. algorithm_name : str - The name of the algorithm model. + the name of the algorithm + """ + markers = ["+", "v", ".", "d", "o", "s", "1", "D", "X", "^", "p", "<", "*", "H", "3", "P"] + colors = [ + "#1f77b4", + "#ff7f0e", + "#2ca02c", + "#d62728", + "#9467bd", + "#8c564b", + "#e377c2", + "#7f7f7f", + "#bcbd22", + "#17becf", + "#33a02c", + "#1f77b4", + "#ff7f0e", + "#2ca02c", + "#d62728", + "#9467bd", + "#8c564b", + "#e377c2", + "#7f7f7f", + "#bcbd22", + ] + + marker_cycle = cycle(markers) + color_cycle = cycle(colors) + + fig = plt.figure() + fig.set_size_inches(18, 10) + plt.subplot(111) + for i, label in enumerate(set(labels)): + cluster_data = data[labels == label] + color = next(color_cycle) + marker = next(marker_cycle) + plt.scatter(cluster_data.iloc[:, 0], cluster_data.iloc[:, 1], c=color, marker=marker) + plt.xlabel(f"{data.columns[0]}") + plt.ylabel(f"{data.columns[1]}") + plt.title(f"Cluster Data Bi-plot - {algorithm_name}") + + +def scatter3d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str) -> None: + """ + Draw the result-3D diagram for analysis. + + Parameters + ---------- + data : pd.DataFrame (n_samples, n_components) + The true values. + + labels : pd.DataFrame (n_samples,) + Labels of each point. - cluster_centers - The center of the algorithm model. + algorithm_name : str + the name of the algorithm """ - sns.scatterplot(x=X.iloc[:, 0], y=X.iloc[:, 1], hue=labels, palette="viridis", s=50, alpha=0.8) + plt.figure() + namelist = data.columns.values.tolist() + fig = plt.figure(figsize=(12, 6), facecolor="w") + plt.subplots_adjust(left=0.05, right=0.95, bottom=0.05, top=0.9) + + ax = fig.add_subplot(121, projection="3d") + ax.scatter(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2], alpha=0.3, c="#FF0000", s=6) + ax.set_xlabel(namelist[0]) + ax.set_ylabel(namelist[1]) + ax.set_zlabel(namelist[2]) + plt.grid(True) + + ax2 = fig.add_subplot(122, projection="3d") + markers = ["+", "v", ".", "d", "o", "s", "1", "D", "X", "^", "p", "<", "*", "H", "3", "P"] + colors = [ + "#1f77b4", + "#ff7f0e", + "#2ca02c", + "#d62728", + "#9467bd", + "#8c564b", + "#e377c2", + "#7f7f7f", + "#bcbd22", + "#17becf", + "#33a02c", + "#1f77b4", + "#ff7f0e", + "#2ca02c", + "#d62728", + "#9467bd", + "#8c564b", + "#e377c2", + "#7f7f7f", + "#bcbd22", + ] + marker_cycle = cycle(markers) + color_cycle = cycle(colors) + + for i, label in enumerate(set(labels)): + cluster_data = data[labels == label] + color = next(color_cycle) + marker = next(marker_cycle) + ax2.scatter(cluster_data.iloc[:, 0], cluster_data.iloc[:, 1], cluster_data.iloc[:, 2], c=color, marker=marker, s=6, cmap=plt.cm.Paired, edgecolors="none") + + ax2.set_xlabel(namelist[0]) + ax2.set_ylabel(namelist[1]) + ax2.set_zlabel(namelist[2]) + plt.grid(True) + ax.set_title(f"Base Data Tri-plot - {algorithm_name}") + ax2.set_title(f"Cluster Data Tri-plot - {algorithm_name}") + + +def plot_silhouette_diagram(data: pd.DataFrame, labels: pd.DataFrame, cluster_centers_: np.ndarray, model: object, algorithm_name: str) -> None: + """ + Draw the silhouette diagram for analysis. + + Parameters + ---------- + data : pd.DataFrame (n_samples, n_components) + The true values. + + labels : pd.DataFrame (n_samples,) + Labels of each point. + + cluster_centers_: np.ndarray (n_samples,) + Coordinates of cluster centers. If the algorithm stops before fully converging (see tol and max_iter), these will not be consistent with labels_. + + model : sklearn algorithm model + The sklearn algorithm model trained with X. + + algorithm_name : str + the name of the algorithm + + References + ---------- + Silhouette analysis can be used to study the separation distance between the resulting clusters. + The silhouette plot displays a measure of how close each point in one cluster is to other points in the + neighboring clusters and thus provides a way to assess parameters like number of clusters visually. + This measure has a range of [-1, 1]. + + https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html + """ + if hasattr(model, "n_clusters"): + n_clusters = model.n_clusters + else: + n_clusters = len(set(labels)) - (1 if -1 in labels else 0) + + # Create a subplot with 1 row and 2 columns + fig, (ax1, ax2) = plt.subplots(1, 2) + fig.set_size_inches(18, 10) + + # The 1st subplot is the silhouette plot + # The silhouette coefficient can range from -1, 1 but in this example all + # lie within [-0.1, 1] + ax1.set_xlim([-0.1, 1]) + # The (n_clusters+1)*10 is for inserting blank space between silhouette + # plots of individual clusters, to demarcate them clearly. + ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10]) + + # The silhouette_score gives the average value for all the samples. + # This gives a perspective into the density and separation of the formed + # clusters + silhouette_avg = silhouette_score(data, labels) + print( + "For n_clusters =", + n_clusters, + "The average silhouette_score is :", + silhouette_avg, + ) + + # Compute the silhouette scores for each sample + sample_silhouette_values = silhouette_samples(data, labels) + + if n_clusters >= 20: + Fontsize = 5 + y_long = 7 + else: + Fontsize = None + y_long = 10 + + y_lower = 10 + for i in range(n_clusters): + # Aggregate the silhouette scores for samples belonging to + # cluster i, and sort them + ith_cluster_silhouette_values = sample_silhouette_values[labels == i] + + ith_cluster_silhouette_values.sort() + + size_cluster_i = ith_cluster_silhouette_values.shape[0] + y_upper = y_lower + size_cluster_i + + color = cm.nipy_spectral(float(i) / n_clusters) + ax1.fill_betweenx( + np.arange(y_lower, y_upper), + 0, + ith_cluster_silhouette_values, + facecolor=color, + edgecolor=color, + alpha=0.7, + ) + + # Label the silhouette plots with their cluster numbers at the middle + ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i), fontsize=Fontsize) + + # Compute the new y_lower for next plot + y_lower = y_upper + y_long # 10 for the 0 samples + + ax1.set_title("The silhouette plot for the various clusters.") + ax1.set_xlabel("The silhouette coefficient values") + ax1.set_ylabel("Cluster label") + + # The vertical line for average silhouette score of all the values + ax1.axvline(x=silhouette_avg, color="red", linestyle="--") + + ax1.set_yticks([]) # Clear the yaxis labels / ticks + ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) + + # 2nd Plot showing the actual clusters formed + colors = cm.nipy_spectral(labels.astype(float) / n_clusters) + ax2.scatter(data.iloc[:, 0], data.iloc[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k") + if not isinstance(cluster_centers_, str): - plt.scatter(cluster_centers_[:, 0], cluster_centers_[:, 1], c="red", marker="X", s=200, label="Cluster Centers") - plt.title(f"results - {algorithm_name}") - plt.xlabel("Feature 1") - plt.ylabel("Feature 2") - plt.legend() + # Labeling the clusters + centers = cluster_centers_ + # Draw white circles at cluster centers + ax2.scatter( + centers[:, 0], + centers[:, 1], + marker="o", + c="white", + alpha=1, + s=200, + edgecolor="k", + ) + + for i, c in enumerate(centers): + ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k") + + ax2.set_title("The visualization of the clustered data.") + ax2.set_xlabel("Feature space for the 1st feature") + ax2.set_ylabel("Feature space for the 2nd feature") + plt.suptitle( + f"Silhouette analysis for clustering on sample data with n_clusters = %d - {algorithm_name}" % n_clusters, + fontsize=14, + fontweight="bold", + ) -def plot_silhouette_diagram(X, labels, algorithm_name: str): +def plot_silhouette_value_diagram(data, labels, algorithm_name: str): """Calculate the scores of the clustering model. Parameters ---------- - X : pd.DataFrame (n_samples, n_components) + data : pd.DataFrame (n_samples, n_components) The true values. - label : pd.DataFrame (n_samples, n_components) - The labels values. + labels : pd.DataFrame (n_samples, n_components) + Labels of each point. algorithm_name : str The name of the algorithm model. """ - silhouette_values = silhouette_samples(X, labels) + silhouette_values = silhouette_samples(data, labels) sns.histplot(silhouette_values, bins=30, kde=True) - plt.title(f"Silhouette Diagram - {algorithm_name}") + plt.title(f"Silhouette value Diagram - {algorithm_name}") plt.xlabel("Silhouette Coefficient") plt.ylabel("Frequency") - plt.legend() diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py b/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py index 951dca13..003334bf 100644 --- a/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py +++ b/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py @@ -1,8 +1,5 @@ from typing import Dict -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd from rich import print from ....constants import SECTION @@ -47,94 +44,3 @@ def dbscan_manual_hyper_parameters() -> Dict: "p": p, } return hyper_parameters - - -def dbscan_result_plot(data: pd.DataFrame, trained_model: any, image_config: dict, algorithm_name: str) -> None: - """ - Draw the clustering result diagram for analysis. - - Parameters - ---------- - data: pd.DataFrame (n_samples, n_components) - Data for silhouette. - - trained_model: any - The algorithm which to be used - - algorithm_name : str - the name of the algorithm - - References - ---------- - The DBSCAN algorithm is deterministic, always generating the same clusters when given the same data in the same order. - - https://scikit-learn.org/stable/modules/clustering.html/dbscan - - """ - db = trained_model.fit(data) - labels = trained_model.labels_ - core_samples_mask = np.zeros_like(db.labels_, dtype=bool) - core_samples_mask[db.core_sample_indices_] = True - n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) - print("Estimated number of clusters: %d" % n_clusters_) - unique_labels = set(labels) - - # create drawing canvas - fig, ax = plt.subplots(figsize=(image_config["width"], image_config["height"]), dpi=image_config["dpi"]) - - # draw the main content - colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] - for k, col in zip(unique_labels, colors): - if k == -1: - col = [0, 0, 0, 1] - class_member_mask = labels == k - xy = data[class_member_mask & core_samples_mask] - ax.plot( - xy.iloc[:, 0], - xy.iloc[:, 1], - image_config["marker_angle"], - markerfacecolor=tuple(col), - markeredgecolor=image_config["edgecolor"], - markersize=image_config["markersize1"], - alpha=image_config["alpha1"], - ) - xy = data[class_member_mask & ~core_samples_mask] - ax.plot( - xy.iloc[:, 0], - xy.iloc[:, 1], - image_config["marker_circle"], - markerfacecolor=tuple(col), - markeredgecolor=image_config["edgecolor"], - markersize=image_config["markersize2"], - alpha=image_config["alpha2"], - ) - - # automatically optimize picture layout structure - fig.tight_layout() - xmin, xmax = ax.get_xlim() - ymin, ymax = ax.get_ylim() - x_adjustment = (xmax - xmin) * 0.1 - y_adjustment = (ymax - ymin) * 0.1 - ax.axis([xmin - x_adjustment, xmax + x_adjustment, ymin - y_adjustment, ymax + y_adjustment]) - - # convert the font of the axes - plt.tick_params(labelsize=image_config["labelsize"]) # adjust the font size of the axis label - # plt.setp(ax.get_xticklabels(), rotation=image_config['xrotation'], ha=image_config['xha'], - # rotation_mode="anchor") # axis label rotation Angle - # plt.setp(ax.get_yticklabels(), rotation=image_config['rot'], ha=image_config['yha'], - # rotation_mode="anchor") # axis label rotation Angle - x1_label = ax.get_xticklabels() # adjust the axis label font - [x1_label_temp.set_fontname(image_config["axislabelfont"]) for x1_label_temp in x1_label] - y1_label = ax.get_yticklabels() - [y1_label_temp.set_fontname(image_config["axislabelfont"]) for y1_label_temp in y1_label] - - ax.set_title( - label=algorithm_name, - fontdict={ - "size": image_config["title_size"], - "color": image_config["title_color"], - "family": image_config["title_font"], - }, - loc=image_config["title_location"], - pad=image_config["title_pad"], - ) diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py b/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py index 5e3b2654..82668a55 100644 --- a/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py +++ b/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py @@ -1,11 +1,6 @@ from typing import Dict -import matplotlib.cm as cm -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd from rich import print -from sklearn.metrics import silhouette_samples, silhouette_score from ....constants import SECTION from ....data.data_readiness import float_input, num_input, str_input @@ -38,159 +33,3 @@ def kmeans_manual_hyper_parameters() -> Dict: algorithm = str_input(algorithms, SECTION[2]) hyper_parameters = {"n_clusters": n_clusters, "init": init, "max_iter": max_iters, "tol": tol, "algorithm": algorithm} return hyper_parameters - - -def plot_silhouette_diagram_kmeans(data: pd.DataFrame, cluster_labels: pd.DataFrame, cluster_centers_: np.ndarray, n_clusters: int, algorithm_name: str) -> None: - """ - Draw the silhouette diagram for analysis. - - Parameters - ---------- - data: pd.DataFrame (n_samples, n_components) - Data for silhouette. - - cluster_labels: pd.DataFrame (n_samples,) - Labels of each point. - - cluster_centers_: np.ndarray (n_samples,) - Coordinates of cluster centers. If the algorithm stops before fully converging (see tol and max_iter), these will not be consistent with labels_. - - n_clusters: int - Number of features seen during fit. - - algorithm_name : str - the name of the algorithm - - References - ---------- - Silhouette analysis can be used to study the separation distance between the resulting clusters. - The silhouette plot displays a measure of how close each point in one cluster is to other points in the - neighboring clusters and thus provides a way to assess parameters like number of clusters visually. - This measure has a range of [-1, 1]. - - https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html - """ - # Create a subplot with 1 row and 2 columns - fig, (ax1, ax2) = plt.subplots(1, 2) - fig.set_size_inches(18, 7) - - # The 1st subplot is the silhouette plot - # The silhouette coefficient can range from -1, 1 but in this example all - # lie within [-0.1, 1] - ax1.set_xlim([-0.1, 1]) - # The (n_clusters+1)*10 is for inserting blank space between silhouette - # plots of individual clusters, to demarcate them clearly. - ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10]) - - # The silhouette_score gives the average value for all the samples. - # This gives a perspective into the density and separation of the formed - # clusters - silhouette_avg = silhouette_score(data, cluster_labels) - print( - "For n_clusters =", - n_clusters, - "The average silhouette_score is :", - silhouette_avg, - ) - - # Compute the silhouette scores for each sample - sample_silhouette_values = silhouette_samples(data, cluster_labels) - - y_lower = 10 - for i in range(n_clusters): - # Aggregate the silhouette scores for samples belonging to - # cluster i, and sort them - ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i] - - ith_cluster_silhouette_values.sort() - - size_cluster_i = ith_cluster_silhouette_values.shape[0] - y_upper = y_lower + size_cluster_i - - color = cm.nipy_spectral(float(i) / n_clusters) - ax1.fill_betweenx( - np.arange(y_lower, y_upper), - 0, - ith_cluster_silhouette_values, - facecolor=color, - edgecolor=color, - alpha=0.7, - ) - - # Label the silhouette plots with their cluster numbers at the middle - ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) - - # Compute the new y_lower for next plot - y_lower = y_upper + 10 # 10 for the 0 samples - - ax1.set_title("The silhouette plot for the various clusters.") - ax1.set_xlabel("The silhouette coefficient values") - ax1.set_ylabel("Cluster label") - - # The vertical line for average silhouette score of all the values - ax1.axvline(x=silhouette_avg, color="red", linestyle="--") - - ax1.set_yticks([]) # Clear the yaxis labels / ticks - ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) - - # 2nd Plot showing the actual clusters formed - colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters) - ax2.scatter(data.iloc[:, 0], data.iloc[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k") - - # Labeling the clusters - centers = cluster_centers_ - # Draw white circles at cluster centers - ax2.scatter( - centers[:, 0], - centers[:, 1], - marker="o", - c="white", - alpha=1, - s=200, - edgecolor="k", - ) - - for i, c in enumerate(centers): - ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k") - - ax2.set_title("The visualization of the clustered data.") - ax2.set_xlabel("Feature space for the 1st feature") - ax2.set_ylabel("Feature space for the 2nd feature") - plt.suptitle( - f"Silhouette analysis for KMeans clustering on sample data with n_clusters = %d - {algorithm_name}" % n_clusters, - fontsize=14, - fontweight="bold", - ) - - -def scatter2d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name: str) -> None: - plt.figure() - plt.subplot(111) - plt.scatter(data.iloc[:, 0], data.iloc[:, 1], c=cluster_labels) - - plt.xlabel(f"{data.columns[0]}") - plt.ylabel(f"{data.columns[1]}") - plt.title(f"Cluster Data Bi-plot - {algorithm_name}") - - -def scatter3d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name: str) -> None: - plt.figure() - namelist = data.columns.values.tolist() - fig = plt.figure(figsize=(12, 6), facecolor="w") - plt.subplots_adjust(left=0.05, right=0.95, bottom=0.05, top=0.9) - - ax = fig.add_subplot(121, projection="3d") - ax.scatter(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2], alpha=0.3, c="#FF0000", s=6) - ax.set_xlabel(namelist[0]) - ax.set_ylabel(namelist[1]) - ax.set_zlabel(namelist[2]) - plt.grid(True) - - ax2 = fig.add_subplot(122, projection="3d") - ax2.scatter(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2], c=cluster_labels, s=6, cmap=plt.cm.Paired, edgecolors="none") - ax2.set_xlabel(namelist[0]) - ax2.set_ylabel(namelist[1]) - ax2.set_zlabel(namelist[2]) - plt.grid(True) - ax.set_title(f"Base Data Tri-plot - {algorithm_name}") - ax2.set_title(f"Cluster Data Tri-plot - {algorithm_name}")