Revert "feat: add agglomerative clustering algorithm"

This reverts commit f8c559b.
ZJUEarthData · Jan 1, 2024 · 6921a5c
1 parent f8c559b
commit 6921a5c
Show file tree

Hide file tree

Showing 5 changed files with 41 additions and 167 deletions.
diff --git a/README.md b/README.md
@@ -205,7 +205,7 @@ The whole package is under construction and the documentation is progressively e
   Email: [email protected]
 
 **Technical Group:**
-+ Jianming Zhao (Jamie, Zhejiang University, China)
++ Jianming Zhao (Jamie, Jilin University, Changchun, China)
 + Jianhao Sun (Jin, China University of Geosciences, Wuhan, China)
 + Kaixin Zheng (Hayne, Sun Yat-sen University, China)
 + Jianing Wang (National University of Singapore, Singapore)

diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py
@@ -64,7 +64,7 @@
     # "Decision Tree",
     # Histogram-based Gradient Boosting,
 ]
-CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative"]
+CLUSTERING_MODELS = ["KMeans", "DBSCAN"]
 DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"]
 
 # The model can deal with missing values

diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py
@@ -5,18 +5,16 @@
 
 import mlflow
 import numpy as np
-from numpy.typing import ArrayLike
 import pandas as pd
 from rich import print
-from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
+from sklearn.cluster import DBSCAN, AffinityPropagation, KMeans
 
 from ..constants import MLFLOW_ARTIFACT_DATA_PATH, MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH
 from ..utils.base import clear_output, save_data, save_fig, save_text
 from ._base import WorkflowBase
 from .func.algo_clustering._common import plot_results, plot_silhouette_diagram, score
 from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters, dbscan_result_plot
 from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters, plot_silhouette_diagram_kmeans, scatter2d, scatter3d
-from .func.algo_clustering._agglomerative import agglomerative_manual_hyper_parameters
 
 
 class ClusteringWorkflowBase(WorkflowBase):
@@ -199,7 +197,7 @@ def __init__(
             might change in the future for a better heuristic.
 
         References
-        ----------
+        ----------------------------------------
         Scikit-learn API: sklearn.cluster.KMeans
         https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
         """
@@ -406,7 +404,7 @@ def __init__(
             The number of parallel jobs to run. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details.
 
         References
-        ----------
+        ----------------------------------------
         Scikit-learn API: sklearn.cluster.DBSCAN
         https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
         """
@@ -463,138 +461,43 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
         )
 
 
-class Agglomerative(ClusteringWorkflowBase):
-    """The automation workflow of using Agglomerative Clustering to make insightful products."""
-
-    name = "Agglomerative"
-    special_function = []
+class AffinityPropagationClustering(ClusteringWorkflowBase):
+    name = "AffinityPropagation"
 
     def __init__(
         self,
-        n_clusters: int = 2,
-        affinity: str = "euclidean",
-        metric: str = None,
-        memory: str = None,
-        connectivity: ArrayLike = None,
-        compute_full_tree: str = "auto",
-        linkage: str = "ward",
-        distance_threshold: float = None,
-        compute_distances: bool = False,
-    ) -> None:
-        """
-        Parameters
-        ----------
-        n_clusters : int or None, default=2
-            The number of clusters to find. It must be ``None`` if
-            ``distance_threshold`` is not ``None``.
-
-        affinity : str or callable, default='euclidean'
-            Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
-            "manhattan", "cosine", or "precomputed".
-            If linkage is "ward", only "euclidean" is accepted.
-            If "precomputed", a distance matrix (instead of a similarity matrix)
-            is needed as input for the fit method.
-
-        memory : str or object with the joblib.Memory interface, default=None
-            Used to cache the output of the computation of the tree.
-            By default, no caching is done. If a string is given, it is the
-            path to the caching directory.
-
-        connectivity : array-like or callable, default=None
-            Connectivity matrix. Defines for each sample the neighboring
-            samples following a given structure of the data.
-            This can be a connectivity matrix itself or a callable that transforms
-            the data into a connectivity matrix, such as derived from
-            `kneighbors_graph`. Default is ``None``, i.e, the
-            hierarchical clustering algorithm is unstructured.
-
-        compute_full_tree : 'auto' or bool, default='auto'
-            Stop early the construction of the tree at ``n_clusters``. This is
-            useful to decrease computation time if the number of clusters is not
-            small compared to the number of samples. This option is useful only
-            when specifying a connectivity matrix. Note also that when varying the
-            number of clusters and using caching, it may be advantageous to compute
-            the full tree. It must be ``True`` if ``distance_threshold`` is not
-            ``None``. By default `compute_full_tree` is "auto", which is equivalent
-            to `True` when `distance_threshold` is not `None` or that `n_clusters`
-            is inferior to the maximum between 100 or `0.02 * n_samples`.
-            Otherwise, "auto" is equivalent to `False`.
-
-        linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
-            Which linkage criterion to use. The linkage criterion determines which
-            distance to use between sets of observation. The algorithm will merge
-            the pairs of cluster that minimize this criterion.
-
-            - 'ward' minimizes the variance of the clusters being merged.
-            - 'average' uses the average of the distances of each observation of
-            the two sets.
-            - 'complete' or 'maximum' linkage uses the maximum distances between
-            all observations of the two sets.
-            - 'single' uses the minimum of the distances between all observations
-            of the two sets.
-
-            .. versionadded:: 0.20
-                Added the 'single' option
-
-        distance_threshold : float, default=None
-            The linkage distance threshold above which, clusters will not be
-            merged. If not ``None``, ``n_clusters`` must be ``None`` and
-            ``compute_full_tree`` must be ``True``.
-
-            .. versionadded:: 0.21
-
-        compute_distances : bool, default=False
-            Computes distances between clusters even if `distance_threshold` is not
-            used. This can be used to make dendrogram visualization, but introduces
-            a computational and memory overhead.
-
-            .. versionadded:: 0.24
-
-        References
-        ----------
-        sklearn.cluster.AgglomerativeClustering
-        https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
-        """
+        *,
+        damping=0.5,
+        max_iter=200,
+        convergence_iter=15,
+        copy=True,
+        preference=None,
+        affinity="euclidean",
+        verbose=False,
+        random_state=None,
+    ):
 
         super().__init__()
-        self.n_clusters = n_clusters
-        self.distance_threshold = distance_threshold
-        self.memory = memory
-        self.connectivity = connectivity
-        self.compute_full_tree = compute_full_tree
-        self.linkage = linkage
+        self.damping = damping
+        self.max_iter = max_iter
+        self.convergence_iter = convergence_iter
+        self.copy = copy
+        self.verbose = verbose
+        self.preference = preference
         self.affinity = affinity
-        self.metric = metric
-        self.compute_distances = compute_distances
-
-        self.model = AgglomerativeClustering(
-            n_clusters=self.n_clusters,
-            affinity=self.affinity,
-            memory=self.memory,
-            connectivity=self.connectivity,
-            compute_full_tree=self.compute_full_tree,
-            linkage=self.linkage,
-            distance_threshold=self.distance_threshold,
-            compute_distances=self.compute_distances,
+        self.random_state = random_state
+        self.model = AffinityPropagation(
+            damping=self.damping,
+            max_iter=self.max_iter,
+            convergence_iter=self.convergence_iter,
+            copy=self.copy,
+            preference=None,
+            affinity="euclidean",
+            verbose=False,
+            random_state=None,
         )
+        self.naming = AffinityPropagationClustering.name
 
-        self.naming = Agglomerative.name
-
-    @classmethod
-    def manual_hyper_parameters(cls) -> Dict:
-        """Manual hyper-parameters specification."""
-        print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-")
-        hyper_parameters = agglomerative_manual_hyper_parameters()
-        clear_output()
-        return hyper_parameters
-
-    def special_components(self, **kwargs) -> None:
-        """Invoke all special application functions for this algorithms by Scikit-learn framework."""
-        pass
-
-
-class AffinityPropagationClustering(ClusteringWorkflowBase):
-    name = "AffinityPropagation"
     pass
 
 
@@ -613,6 +516,11 @@ class WardHierarchicalClustering(ClusteringWorkflowBase):
     pass
 
 
+class AgglomerativeClustering(ClusteringWorkflowBase):
+    name = "Agglomerative"
+    pass
+
+
 class OPTICSClustering(ClusteringWorkflowBase):
     name = "OPTICS"
     pass

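diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py b/geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py
(no hunk shown for this file; presumably it was deleted by the revert, since the import of `agglomerative_manual_hyper_parameters` from this module is removed in clustering.py above)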
diff --git a/geochemistrypi/data_mining/process/cluster.py b/geochemistrypi/data_mining/process/cluster.py
@@ -3,7 +3,8 @@
 from typing import Optional
 
 import pandas as pd
-from ..model.clustering import ClusteringWorkflowBase, DBSCANClustering, KMeansClustering, Agglomerative
+
+from ..model.clustering import ClusteringWorkflowBase, DBSCANClustering, KMeansClustering
 from ._base import ModelSelectionBase
 
 
@@ -47,12 +48,6 @@ def activate(
                 leaf_size=hyper_parameters["leaf_size"],
                 p=hyper_parameters["p"],
             )
-        elif self.model_name == "Agglomerative":
-            hyper_parameters = Agglomerative.manual_hyper_parameters()
-            self.clt_workflow = Agglomerative(
-                n_clusters=hyper_parameters["n_clusters"],
-                linkage=hyper_parameters["linkage"],
-            )
         elif self.model_name == "":
             pass