Skip to content

Commit

Permalink
Revert "feat: add agglomerative clustering algorithm"
Browse files Browse the repository at this point in the history
This reverts commit f8c559b.
  • Loading branch information
PotatoXi committed Jan 1, 2024
1 parent f8c559b commit 6921a5c
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 167 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ The whole package is under construction and the documentation is progressively e
Email: [email protected]

**Technical Group:**
+ Jianming Zhao (Jamie, Zhejiang University, China)
+ Jianming Zhao (Jamie, Jilin University, Changchun, China)
+ Jianhao Sun (Jin, China University of Geosciences, Wuhan, China)
+ Kaixin Zheng (Hayne, Sun Yat-sen University, China)
+ Jianing Wang (National University of Singapore, Singapore)
Expand Down
2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
# "Decision Tree",
# Histogram-based Gradient Boosting,
]
CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative"]
CLUSTERING_MODELS = ["KMeans", "DBSCAN"]
DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"]

# The model can deal with missing values
Expand Down
166 changes: 37 additions & 129 deletions geochemistrypi/data_mining/model/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@

import mlflow
import numpy as np
from numpy.typing import ArrayLike
import pandas as pd
from rich import print
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
from sklearn.cluster import DBSCAN, AffinityPropagation, KMeans

from ..constants import MLFLOW_ARTIFACT_DATA_PATH, MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH
from ..utils.base import clear_output, save_data, save_fig, save_text
from ._base import WorkflowBase
from .func.algo_clustering._common import plot_results, plot_silhouette_diagram, score
from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters, dbscan_result_plot
from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters, plot_silhouette_diagram_kmeans, scatter2d, scatter3d
from .func.algo_clustering._agglomerative import agglomerative_manual_hyper_parameters


class ClusteringWorkflowBase(WorkflowBase):
Expand Down Expand Up @@ -199,7 +197,7 @@ def __init__(
might change in the future for a better heuristic.
References
----------
----------------------------------------
Scikit-learn API: sklearn.cluster.KMeans
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
"""
Expand Down Expand Up @@ -406,7 +404,7 @@ def __init__(
The number of parallel jobs to run. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details.
References
----------
----------------------------------------
Scikit-learn API: sklearn.cluster.DBSCAN
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
"""
Expand Down Expand Up @@ -463,138 +461,43 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
)


class Agglomerative(ClusteringWorkflowBase):
"""The automation workflow of using Agglomerative Clustering to make insightful products."""

name = "Agglomerative"
special_function = []
class AffinityPropagationClustering(ClusteringWorkflowBase):
name = "AffinityPropagation"

def __init__(
self,
n_clusters: int = 2,
affinity: str = "euclidean",
metric: str = None,
memory: str = None,
connectivity: ArrayLike = None,
compute_full_tree: str = "auto",
linkage: str = "ward",
distance_threshold: float = None,
compute_distances: bool = False,
) -> None:
"""
Parameters
----------
n_clusters : int or None, default=2
The number of clusters to find. It must be ``None`` if
``distance_threshold`` is not ``None``.
affinity : str or callable, default='euclidean'
Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
"manhattan", "cosine", or "precomputed".
If linkage is "ward", only "euclidean" is accepted.
If "precomputed", a distance matrix (instead of a similarity matrix)
is needed as input for the fit method.
memory : str or object with the joblib.Memory interface, default=None
Used to cache the output of the computation of the tree.
By default, no caching is done. If a string is given, it is the
path to the caching directory.
connectivity : array-like or callable, default=None
Connectivity matrix. Defines for each sample the neighboring
samples following a given structure of the data.
This can be a connectivity matrix itself or a callable that transforms
the data into a connectivity matrix, such as derived from
`kneighbors_graph`. Default is ``None``, i.e., the
hierarchical clustering algorithm is unstructured.
compute_full_tree : 'auto' or bool, default='auto'
Stop the construction of the tree early at ``n_clusters``. This is
useful to decrease computation time if the number of clusters is not
small compared to the number of samples. This option is useful only
when specifying a connectivity matrix. Note also that when varying the
number of clusters and using caching, it may be advantageous to compute
the full tree. It must be ``True`` if ``distance_threshold`` is not
``None``. By default `compute_full_tree` is "auto", which is equivalent
to `True` when `distance_threshold` is not `None` or when `n_clusters`
is less than the maximum of 100 and `0.02 * n_samples`.
Otherwise, "auto" is equivalent to `False`.
linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
Which linkage criterion to use. The linkage criterion determines which
distance to use between sets of observations. The algorithm will merge
the pairs of clusters that minimize this criterion.
- 'ward' minimizes the variance of the clusters being merged.
- 'average' uses the average of the distances of each observation of
the two sets.
- 'complete' or 'maximum' linkage uses the maximum distances between
all observations of the two sets.
- 'single' uses the minimum of the distances between all observations
of the two sets.
.. versionadded:: 0.20
Added the 'single' option
distance_threshold : float, default=None
The linkage distance threshold at or above which clusters will not be
merged. If not ``None``, ``n_clusters`` must be ``None`` and
``compute_full_tree`` must be ``True``.
.. versionadded:: 0.21
compute_distances : bool, default=False
Computes distances between clusters even if `distance_threshold` is not
used. This can be used to make dendrogram visualization, but introduces
a computational and memory overhead.
.. versionadded:: 0.24
References
----------
sklearn.cluster.AgglomerativeClustering
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
"""
*,
damping=0.5,
max_iter=200,
convergence_iter=15,
copy=True,
preference=None,
affinity="euclidean",
verbose=False,
random_state=None,
):

super().__init__()
self.n_clusters = n_clusters
self.distance_threshold = distance_threshold
self.memory = memory
self.connectivity = connectivity
self.compute_full_tree = compute_full_tree
self.linkage = linkage
self.damping = damping
self.max_iter = max_iter
self.convergence_iter = convergence_iter
self.copy = copy
self.verbose = verbose
self.preference = preference
self.affinity = affinity
self.metric = metric
self.compute_distances = compute_distances

self.model = AgglomerativeClustering(
n_clusters=self.n_clusters,
affinity=self.affinity,
memory=self.memory,
connectivity=self.connectivity,
compute_full_tree=self.compute_full_tree,
linkage=self.linkage,
distance_threshold=self.distance_threshold,
compute_distances=self.compute_distances,
self.random_state = random_state
self.model = AffinityPropagation(
damping=self.damping,
max_iter=self.max_iter,
convergence_iter=self.convergence_iter,
copy=self.copy,
preference=None,
affinity="euclidean",
verbose=False,
random_state=None,
)
self.naming = AffinityPropagationClustering.name

self.naming = Agglomerative.name

@classmethod
def manual_hyper_parameters(cls) -> Dict:
"""Manual hyper-parameters specification."""
print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-")
hyper_parameters = agglomerative_manual_hyper_parameters()
clear_output()
return hyper_parameters

def special_components(self, **kwargs) -> None:
"""Invoke all special application functions for this algorithms by Scikit-learn framework."""
pass


class AffinityPropagationClustering(ClusteringWorkflowBase):
name = "AffinityPropagation"
pass


Expand All @@ -613,6 +516,11 @@ class WardHierarchicalClustering(ClusteringWorkflowBase):
pass


class AgglomerativeClustering(ClusteringWorkflowBase):
name = "Agglomerative"
pass


class OPTICSClustering(ClusteringWorkflowBase):
name = "OPTICS"
pass
Expand Down

This file was deleted.

9 changes: 2 additions & 7 deletions geochemistrypi/data_mining/process/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from typing import Optional

import pandas as pd
from ..model.clustering import ClusteringWorkflowBase, DBSCANClustering, KMeansClustering, Agglomerative

from ..model.clustering import ClusteringWorkflowBase, DBSCANClustering, KMeansClustering
from ._base import ModelSelectionBase


Expand Down Expand Up @@ -47,12 +48,6 @@ def activate(
leaf_size=hyper_parameters["leaf_size"],
p=hyper_parameters["p"],
)
elif self.model_name == "Agglomerative":
hyper_parameters = Agglomerative.manual_hyper_parameters()
self.clt_workflow = Agglomerative(
n_clusters=hyper_parameters["n_clusters"],
linkage=hyper_parameters["linkage"],
)
elif self.model_name == "":
pass

Expand Down

0 comments on commit 6921a5c

Please sign in to comment.