Merge pull request #305 from ZJUEarthData/dev/Yongkang

feat: Added AffinityPropagation algorithm in clustering
ZJUEarthData · Feb 27, 2024 · f329ae8 · f329ae8
2 parents 6fd12db + ca96d0c
commit f329ae8
Show file tree

Hide file tree

Showing 4 changed files with 108 additions and 12 deletions.
diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py
@@ -65,7 +65,7 @@
     # "Decision Tree",
     # Histogram-based Gradient Boosting,
 ]
-CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative"]
+CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative", "AffinityPropagation"]
 DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"]
 
 # The model can deal with missing values

diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py
@@ -13,6 +13,7 @@
 from ..constants import MLFLOW_ARTIFACT_DATA_PATH, MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH
 from ..utils.base import clear_output, save_data, save_fig, save_text
 from ._base import WorkflowBase
+from .func.algo_clustering._affinitypropagation import affinitypropagation_manual_hyper_parameters
 from .func.algo_clustering._agglomerative import agglomerative_manual_hyper_parameters
 from .func.algo_clustering._common import plot_silhouette_diagram, plot_silhouette_value_diagram, scatter2d, scatter3d, score
 from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters
@@ -557,15 +558,63 @@ class AffinityPropagationClustering(ClusteringWorkflowBase):
     def __init__(
         self,
         *,
-        damping=0.5,
-        max_iter=200,
-        convergence_iter=15,
-        copy=True,
-        preference=None,
-        affinity="euclidean",
-        verbose=False,
-        random_state=None,
-    ):
+        damping: float = 0.5,
+        max_iter: int = 200,
+        convergence_iter: int = 15,
+        copy: bool = True,
+        preference: Optional[Dict] = None,
+        affinity: str = "euclidean",
+        verbose: bool = False,
+        random_state: Optional[Dict] = None,
+    ) -> None:
+        """
+        Parameters
+        ----------
+        damping : float, default=0.5
+            Damping factor in the range `[0.5, 1.0)` is the extent to
+            which the current value is maintained relative to
+            incoming values (weighted 1 - damping). This in order
+            to avoid numerical oscillations when updating these
+            values (messages).
+
+        max_iter : int, default=200
+            Maximum number of iterations.
+
+        convergence_iter : int, default=15
+            Number of iterations with no change in the number
+            of estimated clusters that stops the convergence.
+
+        copy : bool, default=True
+            Make a copy of input data.
+
+        preference : array-like of shape (n_samples,) or float, default=None
+            Preferences for each point - points with larger values of
+            preferences are more likely to be chosen as exemplars. The number
+            of exemplars, ie of clusters, is influenced by the input
+            preferences value. If the preferences are not passed as arguments,
+            they will be set to the median of the input similarities.
+
+        affinity : {'euclidean', 'precomputed'}, default='euclidean'
+            Which affinity to use. At the moment 'precomputed' and
+            ``euclidean`` are supported. 'euclidean' uses the
+            negative squared euclidean distance between points.
+
+        verbose : bool, default=False
+            Whether to be verbose.
+
+        random_state : int, RandomState instance or None, default=None
+            Pseudo-random number generator to control the starting state.
+            Use an int for reproducible results across function calls.
+            See the :term:`Glossary <random_state>`.
+
+            .. versionadded:: 0.23
+                this parameter was previously hardcoded as 0.
+
+        References
+        ----------------------------------------
+        Scikit-learn API: sklearn.cluster.AffinityPropagation
+        https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation
+        """
 
         super().__init__()
         self.damping = damping
@@ -592,7 +641,16 @@ def __init__(
         )
         self.naming = AffinityPropagationClustering.name
 
-    pass
+    @classmethod
+    def manual_hyper_parameters(cls) -> Dict:
+        """Print a header, gather the hyper-parameters via the helper, call clear_output() and return the dict."""
+        print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-")
+        user_specified = affinitypropagation_manual_hyper_parameters()
+        clear_output()
+        return user_specified
+
+    def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
+        """Invoke all special application functions for this algorithm by Scikit-learn framework (none are implemented here yet)."""
 
 
 class MeanShiftClustering(ClusteringWorkflowBase):

diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_affinitypropagation.py b/geochemistrypi/data_mining/model/func/algo_clustering/_affinitypropagation.py
@@ -0,0 +1,30 @@
+from typing import Dict
+
+from rich import print
+
+from ....constants import SECTION
+from ....data.data_readiness import float_input, num_input, str_input
+
+
+def affinitypropagation_manual_hyper_parameters() -> Dict:
+    """Manually set the AffinityPropagation hyper-parameters, printing guidance before each input helper call.
+
+    Returns
+    -------
+    hyper_parameters : dict
+        Mapping with the keys "damping", "max_iter", "convergence_iter" and "affinity".
+    """
+    print("damping: The extent to which the current value is maintained relative to incoming values ")
+    print("Please specify the damping factor for AffinityPropagation. A good starting range could be between 0.5 and 1 (exclusive), such as 0.5.")
+    damping = float_input(0.5, SECTION[2], "damping: ")
+    print("Max Iter: Maximum number of iterations of the algorithm for a single run.")
+    print("Please specify the maximum number of iterations of the affinitypropagation algorithm for a single run. A good starting range could be between 100 and 400, such as 200.")
+    max_iter = num_input(SECTION[2], "Max Iter: ")
+    print("convergence_iter: Number of iterations with no change in the number of estimated clusters that stops the convergence.")
+    print("Please specify the convergence number of iterations of the affinitypropagation algorithm. A good starting range could be between 10 and 15, such as 15.")
+    convergence_iter = num_input(SECTION[2], "convergence_iter: ")
+    print("affinity: Different affinity methods for affinitypropagation in clustering.")
+    print("Please specify the affinity to use for the computation. It is generally recommended to leave it as 'euclidean'.")
+    affinity_options = ["euclidean", "precomputed"]
+    affinity = str_input(affinity_options, SECTION[2])
+    return {"damping": damping, "max_iter": max_iter, "convergence_iter": convergence_iter, "affinity": affinity}
diff --git a/geochemistrypi/data_mining/process/cluster.py b/geochemistrypi/data_mining/process/cluster.py
@@ -4,7 +4,7 @@
 
 import pandas as pd
 
-from ..model.clustering import Agglomerative, ClusteringWorkflowBase, DBSCANClustering, KMeansClustering
+from ..model.clustering import AffinityPropagationClustering, Agglomerative, ClusteringWorkflowBase, DBSCANClustering, KMeansClustering
 from ._base import ModelSelectionBase
 
 
@@ -54,6 +54,14 @@ def activate(
                 n_clusters=hyper_parameters["n_clusters"],
                 linkage=hyper_parameters["linkage"],
             )
+        elif self.model_name == "AffinityPropagation":
+            hyper_parameters = AffinityPropagationClustering.manual_hyper_parameters()
+            self.clt_workflow = AffinityPropagationClustering(
+                damping=hyper_parameters["damping"],
+                max_iter=hyper_parameters["max_iter"],
+                convergence_iter=hyper_parameters["convergence_iter"],
+                affinity=hyper_parameters["affinity"],
+            )
         elif self.model_name == "":
             pass