From ca96d0c40425e8f435edcd8cffb8c3846fa3af56 Mon Sep 17 00:00:00 2001
From: unknown
Date: Tue, 30 Jan 2024 15:33:24 +0800
Subject: [PATCH] feat: Added AffinityPropagation algorithm in clustering

Register "AffinityPropagation" in CLUSTERING_MODELS, document and annotate
AffinityPropagationClustering.__init__, add an interactive hyper-parameter
prompt (damping, max_iter, convergence_iter, affinity) in
func/algo_clustering/_affinitypropagation.py, and wire the model into
ClusteringModelSelection.activate.

---
 geochemistrypi/data_mining/constants.py       |  2 +-
 .../data_mining/model/clustering.py           | 78 ++++++++++++++++---
 .../algo_clustering/_affinitypropagation.py   | 30 +++++++
 geochemistrypi/data_mining/process/cluster.py | 10 ++-
 4 files changed, 108 insertions(+), 12 deletions(-)
 create mode 100644 geochemistrypi/data_mining/model/func/algo_clustering/_affinitypropagation.py

diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py
index 6ccc7a23..00e1c8dc 100644
--- a/geochemistrypi/data_mining/constants.py
+++ b/geochemistrypi/data_mining/constants.py
@@ -64,7 +64,7 @@
     # "Decision Tree",
     # Histogram-based Gradient Boosting,
 ]
-CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative"]
+CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative", "AffinityPropagation"]
 DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"]
 
 # The model can deal with missing values
diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py
index fbbe8afa..6dc456ae 100644
--- a/geochemistrypi/data_mining/model/clustering.py
+++ b/geochemistrypi/data_mining/model/clustering.py
@@ -13,6 +13,7 @@
 from ..constants import MLFLOW_ARTIFACT_DATA_PATH, MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH
 from ..utils.base import clear_output, save_data, save_fig, save_text
 from ._base import WorkflowBase
+from .func.algo_clustering._affinitypropagation import affinitypropagation_manual_hyper_parameters
 from .func.algo_clustering._agglomerative import agglomerative_manual_hyper_parameters
 from .func.algo_clustering._common import plot_silhouette_diagram, plot_silhouette_value_diagram, scatter2d, scatter3d, score
 from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters
@@ -557,15 +558,63 @@ class AffinityPropagationClustering(ClusteringWorkflowBase):
     def __init__(
         self,
         *,
-        damping=0.5,
-        max_iter=200,
-        convergence_iter=15,
-        copy=True,
-        preference=None,
-        affinity="euclidean",
-        verbose=False,
-        random_state=None,
-    ):
+        damping: float = 0.5,
+        max_iter: int = 200,
+        convergence_iter: int = 15,
+        copy: bool = True,
+        preference: Optional[Union[float, np.ndarray]] = None,
+        affinity: str = "euclidean",
+        verbose: bool = False,
+        random_state: Optional[int] = None,
+    ) -> None:
+        """
+        Parameters
+        ----------
+        damping : float, default=0.5
+            Damping factor in the range `[0.5, 1.0)` is the extent to
+            which the current value is maintained relative to
+            incoming values (weighted 1 - damping). This in order
+            to avoid numerical oscillations when updating these
+            values (messages).
+
+        max_iter : int, default=200
+            Maximum number of iterations.
+
+        convergence_iter : int, default=15
+            Number of iterations with no change in the number
+            of estimated clusters that stops the convergence.
+
+        copy : bool, default=True
+            Make a copy of input data.
+
+        preference : array-like of shape (n_samples,) or float, default=None
+            Preferences for each point - points with larger values of
+            preferences are more likely to be chosen as exemplars. The number
+            of exemplars, ie of clusters, is influenced by the input
+            preferences value. If the preferences are not passed as arguments,
+            they will be set to the median of the input similarities.
+
+        affinity : {'euclidean', 'precomputed'}, default='euclidean'
+            Which affinity to use. At the moment 'precomputed' and
+            ``euclidean`` are supported. 'euclidean' uses the
+            negative squared euclidean distance between points.
+
+        verbose : bool, default=False
+            Whether to be verbose.
+
+        random_state : int, RandomState instance or None, default=None
+            Pseudo-random number generator to control the starting state.
+            Use an int for reproducible results across function calls.
+            See the :term:`Glossary <random_state>`.
+
+            .. versionadded:: 0.23
+                this parameter was previously hardcoded as 0.
+
+        References
+        ----------------------------------------
+        Scikit-learn API: sklearn.cluster.AffinityPropagation
+        https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation
+        """
         super().__init__()
 
         self.damping = damping
@@ -592,7 +641,16 @@ def __init__(
         )
         self.naming = AffinityPropagationClustering.name
 
-    pass
+    @classmethod
+    def manual_hyper_parameters(cls) -> Dict:
+        """Manual hyper-parameters specification."""
+        print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-")
+        hyper_parameters = affinitypropagation_manual_hyper_parameters()
+        clear_output()
+        return hyper_parameters
+
+    def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
+        """Invoke all special application functions for this algorithm by Scikit-learn framework."""
 
 
 class MeanShiftClustering(ClusteringWorkflowBase):
diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_affinitypropagation.py b/geochemistrypi/data_mining/model/func/algo_clustering/_affinitypropagation.py
new file mode 100644
index 00000000..bb2da18b
--- /dev/null
+++ b/geochemistrypi/data_mining/model/func/algo_clustering/_affinitypropagation.py
@@ -0,0 +1,30 @@
+from typing import Dict
+
+from rich import print
+
+from ....constants import SECTION
+from ....data.data_readiness import float_input, num_input, str_input
+
+
+def affinitypropagation_manual_hyper_parameters() -> Dict:
+    """Manually set hyperparameters.
+
+    Returns
+    -------
+    hyper_parameters : dict
+    """
+    print("damping: The extent to which the current value is maintained relative to incoming values ")
+    print("Please specify the damping factor for AffinityPropagation. A good starting range could be between 0.5 (inclusive) and 1 (exclusive), such as 0.5.")
+    damping = float_input(0.5, SECTION[2], "damping: ")
+    print("Max Iter: Maximum number of iterations of the algorithm for a single run.")
+    print("Please specify the maximum number of iterations of the affinitypropagation algorithm for a single run. A good starting range could be between 100 and 400, such as 200.")
+    max_iter = num_input(SECTION[2], "Max Iter: ")
+    print("convergence_iter: Number of iterations with no change in the number of estimated clusters that stops the convergence.")
+    print("Please specify the convergence number of iterations of the affinitypropagation algorithm. A good starting range could be between 10 and 15, such as 15.")
+    convergence_iter = num_input(SECTION[2], "convergence_iter: ")
+    print("affinity: Different affinity methods for affinitypropagation in clustering.")
+    print("Please specify the affinity to use for the computation. It is generally recommended to leave it as 'euclidean', because 'precomputed' requires the input data to be a square similarity matrix.")
+    affinities = ["euclidean", "precomputed"]
+    affinity = str_input(affinities, SECTION[2])
+    hyper_parameters = {"damping": damping, "max_iter": max_iter, "convergence_iter": convergence_iter, "affinity": affinity}
+    return hyper_parameters
diff --git a/geochemistrypi/data_mining/process/cluster.py b/geochemistrypi/data_mining/process/cluster.py
index 03686770..db87eb41 100644
--- a/geochemistrypi/data_mining/process/cluster.py
+++ b/geochemistrypi/data_mining/process/cluster.py
@@ -4,7 +4,7 @@
 
 import pandas as pd
 
-from ..model.clustering import Agglomerative, ClusteringWorkflowBase, DBSCANClustering, KMeansClustering
+from ..model.clustering import AffinityPropagationClustering, Agglomerative, ClusteringWorkflowBase, DBSCANClustering, KMeansClustering
 from ._base import ModelSelectionBase
 
 
@@ -54,6 +54,14 @@ def activate(
                 n_clusters=hyper_parameters["n_clusters"],
                 linkage=hyper_parameters["linkage"],
             )
+        elif self.model_name == "AffinityPropagation":
+            hyper_parameters = AffinityPropagationClustering.manual_hyper_parameters()
+            self.clt_workflow = AffinityPropagationClustering(
+                damping=hyper_parameters["damping"],
+                max_iter=hyper_parameters["max_iter"],
+                convergence_iter=hyper_parameters["convergence_iter"],
+                affinity=hyper_parameters["affinity"],
+            )
         elif self.model_name == "":
             pass
 