From f8c559bcfe19696182dfd8e5d0a5cde2831a6c85 Mon Sep 17 00:00:00 2001 From: jmz <45778832+PotatoXi@users.noreply.github.com> Date: Mon, 1 Jan 2024 15:37:17 +0800 Subject: [PATCH 1/4] feat: add agglomerative clustering algorithm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. add agglomerative framwork 2. agglomerative manual hyperparameters (2 hyperparameters can be adjusted. ) --- README.md | 2 +- geochemistrypi/data_mining/constants.py | 2 +- .../data_mining/model/clustering.py | 166 ++++++++++++++---- .../func/algo_clustering/_agglomerative.py | 29 +++ geochemistrypi/data_mining/process/cluster.py | 9 +- 5 files changed, 167 insertions(+), 41 deletions(-) create mode 100644 geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py diff --git a/README.md b/README.md index 51892e14..f1ce6c77 100644 --- a/README.md +++ b/README.md @@ -205,7 +205,7 @@ The whole package is under construction and the documentation is progressively e Email: sanyhew1097618435@163.com **Technical Group:** -+ Jianming Zhao (Jamie, Jilin University, Changchun, China) ++ Jianming Zhao (Jamie, Zhejiang University, China) + Jianhao Sun (Jin, China University of Geosciences, Wuhan, China) + Kaixin Zheng (Hayne, Sun Yat-sen University, China) + Jianing Wang (National University of Singapore, Singapore) diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py index 2b8d1fc6..8822ea74 100644 --- a/geochemistrypi/data_mining/constants.py +++ b/geochemistrypi/data_mining/constants.py @@ -64,7 +64,7 @@ # "Decision Tree", # Histogram-based Gradient Boosting, ] -CLUSTERING_MODELS = ["KMeans", "DBSCAN"] +CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative"] DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"] # The model can deal with missing values diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py index 93bdf85b..1d1cbff0 100644 --- a/geochemistrypi/data_mining/model/clustering.py +++ b/geochemistrypi/data_mining/model/clustering.py @@ -5,9 +5,10 @@ import mlflow import numpy as np +from numpy.typing import ArrayLike import pandas as pd from rich import print -from sklearn.cluster import DBSCAN, AffinityPropagation, KMeans +from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering from ..constants import MLFLOW_ARTIFACT_DATA_PATH, MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH from ..utils.base import clear_output, save_data, save_fig, save_text @@ -15,6 +16,7 @@ from .func.algo_clustering._common import plot_results, plot_silhouette_diagram, score from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters, dbscan_result_plot from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters, plot_silhouette_diagram_kmeans, scatter2d, scatter3d +from .func.algo_clustering._agglomerative import agglomerative_manual_hyper_parameters class ClusteringWorkflowBase(WorkflowBase): @@ -197,7 +199,7 @@ def __init__( might change in the future for a better heuristic. References - ---------------------------------------- + ---------- Scikit-learn API: sklearn.cluster.KMeans https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html """ @@ -404,7 +406,7 @@ def __init__( The number of parallel jobs to run. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details. 
References - ---------------------------------------- + ---------- Scikit-learn API: sklearn.cluster.DBSCAN https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html """ @@ -461,43 +463,138 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None: ) -class AffinityPropagationClustering(ClusteringWorkflowBase): - name = "AffinityPropagation" +class Agglomerative(ClusteringWorkflowBase): + """The automation workflow of using Agglomerative Clustering to make insightful products.""" + + name = "Agglomerative" + special_function = [] def __init__( self, - *, - damping=0.5, - max_iter=200, - convergence_iter=15, - copy=True, - preference=None, - affinity="euclidean", - verbose=False, - random_state=None, - ): + n_clusters: int = 2, + affinity: str = "euclidean", + metric: str = None, + memory: str = None, + connectivity: ArrayLike = None, + compute_full_tree: str = "auto", + linkage: str = "ward", + distance_threshold: float = None, + compute_distances: bool = False, + ) -> None: + """ + Parameters + ---------- + n_clusters : int or None, default=2 + The number of clusters to find. It must be ``None`` if + ``distance_threshold`` is not ``None``. + + affinity : str or callable, default='euclidean' + Metric used to compute the linkage. Can be "euclidean", "l1", "l2", + "manhattan", "cosine", or "precomputed". + If linkage is "ward", only "euclidean" is accepted. + If "precomputed", a distance matrix (instead of a similarity matrix) + is needed as input for the fit method. + + memory : str or object with the joblib.Memory interface, default=None + Used to cache the output of the computation of the tree. + By default, no caching is done. If a string is given, it is the + path to the caching directory. + + connectivity : array-like or callable, default=None + Connectivity matrix. Defines for each sample the neighboring + samples following a given structure of the data. + This can be a connectivity matrix itself or a callable that transforms + the data into a connectivity matrix, such as derived from + `kneighbors_graph`. Default is ``None``, i.e, the + hierarchical clustering algorithm is unstructured. + + compute_full_tree : 'auto' or bool, default='auto' + Stop early the construction of the tree at ``n_clusters``. This is + useful to decrease computation time if the number of clusters is not + small compared to the number of samples. This option is useful only + when specifying a connectivity matrix. Note also that when varying the + number of clusters and using caching, it may be advantageous to compute + the full tree. It must be ``True`` if ``distance_threshold`` is not + ``None``. By default `compute_full_tree` is "auto", which is equivalent + to `True` when `distance_threshold` is not `None` or that `n_clusters` + is inferior to the maximum between 100 or `0.02 * n_samples`. + Otherwise, "auto" is equivalent to `False`. + + linkage : {'ward', 'complete', 'average', 'single'}, default='ward' + Which linkage criterion to use. The linkage criterion determines which + distance to use between sets of observation. The algorithm will merge + the pairs of cluster that minimize this criterion. + + - 'ward' minimizes the variance of the clusters being merged. + - 'average' uses the average of the distances of each observation of + the two sets. + - 'complete' or 'maximum' linkage uses the maximum distances between + all observations of the two sets. + - 'single' uses the minimum of the distances between all observations + of the two sets. + + .. 
versionadded:: 0.20 + Added the 'single' option + + distance_threshold : float, default=None + The linkage distance threshold above which, clusters will not be + merged. If not ``None``, ``n_clusters`` must be ``None`` and + ``compute_full_tree`` must be ``True``. + + .. versionadded:: 0.21 + + compute_distances : bool, default=False + Computes distances between clusters even if `distance_threshold` is not + used. This can be used to make dendrogram visualization, but introduces + a computational and memory overhead. + + .. versionadded:: 0.24 + + References + ---------- + sklearn.cluster.AgglomerativeClustering + https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html + """ super().__init__() - self.damping = damping - self.max_iter = max_iter - self.convergence_iter = convergence_iter - self.copy = copy - self.verbose = verbose - self.preference = preference + self.n_clusters = n_clusters + self.distance_threshold = distance_threshold + self.memory = memory + self.connectivity = connectivity + self.compute_full_tree = compute_full_tree + self.linkage = linkage self.affinity = affinity - self.random_state = random_state - self.model = AffinityPropagation( - damping=self.damping, - max_iter=self.max_iter, - convergence_iter=self.convergence_iter, - copy=self.copy, - preference=None, - affinity="euclidean", - verbose=False, - random_state=None, + self.metric = metric + self.compute_distances = compute_distances + + self.model = AgglomerativeClustering( + n_clusters=self.n_clusters, + affinity=self.affinity, + memory=self.memory, + connectivity=self.connectivity, + compute_full_tree=self.compute_full_tree, + linkage=self.linkage, + distance_threshold=self.distance_threshold, + compute_distances=self.compute_distances, ) - self.naming = AffinityPropagationClustering.name + self.naming = Agglomerative.name + + @classmethod + def manual_hyper_parameters(cls) -> Dict: + """Manual hyper-parameters specification.""" + print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") + hyper_parameters = agglomerative_manual_hyper_parameters() + clear_output() + return hyper_parameters + + def special_components(self, **kwargs) -> None: + """Invoke all special application functions for this algorithms by Scikit-learn framework.""" + pass + + +class AffinityPropagationClustering(ClusteringWorkflowBase): + name = "AffinityPropagation" pass @@ -516,11 +613,6 @@ class WardHierarchicalClustering(ClusteringWorkflowBase): pass -class AgglomerativeClustering(ClusteringWorkflowBase): - name = "Agglomerative" - pass - - class OPTICSClustering(ClusteringWorkflowBase): name = "OPTICS" pass diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py b/geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py new file mode 100644 index 00000000..5d8336a2 --- /dev/null +++ b/geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py @@ -0,0 +1,29 @@ +from typing import Dict + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from rich import print + +from ....constants import SECTION +from ....data.data_readiness import float_input, num_input, str_input + +def agglomerative_manual_hyper_parameters() -> Dict: + """Manually set hyperparameters. + + Returns + ------- + hyper_parameters : dict + """ + print("N Clusters: The number of clusters to form as well as the number of centroids to generate.") + print("Please specify the number of clusters for agglomerative. 
A good starting range could be between 2 and 10, such as '4'.") + n_clusters = num_input(SECTION[2], "N Clusters: ") + print("linkage: The linkage criterion determines which distance to use between sets of observation. ") + print("Please specify the linkage criterion. It is generally recommended to leave it set to 'ward'.") + linkages = ["ward", "complete", "average", "single"] + linkage = str_input(linkages, SECTION[2]) + hyper_parameters = { + "n_clusters": n_clusters, + "linkage": linkage, + } + return hyper_parameters \ No newline at end of file diff --git a/geochemistrypi/data_mining/process/cluster.py b/geochemistrypi/data_mining/process/cluster.py index e7a38ed5..bf1184ea 100644 --- a/geochemistrypi/data_mining/process/cluster.py +++ b/geochemistrypi/data_mining/process/cluster.py @@ -3,8 +3,7 @@ from typing import Optional import pandas as pd - -from ..model.clustering import ClusteringWorkflowBase, DBSCANClustering, KMeansClustering +from ..model.clustering import ClusteringWorkflowBase, DBSCANClustering, KMeansClustering, Agglomerative from ._base import ModelSelectionBase @@ -48,6 +47,12 @@ def activate( leaf_size=hyper_parameters["leaf_size"], p=hyper_parameters["p"], ) + elif self.model_name == "Agglomerative": + hyper_parameters = Agglomerative.manual_hyper_parameters() + self.clt_workflow = Agglomerative( + n_clusters=hyper_parameters["n_clusters"], + linkage=hyper_parameters["linkage"], + ) elif self.model_name == "": pass From 6921a5c4b8c01161c86eb986cb62d8a26d4c15a4 Mon Sep 17 00:00:00 2001 From: jmz <45778832+PotatoXi@users.noreply.github.com> Date: Mon, 1 Jan 2024 15:57:25 +0800 Subject: [PATCH 2/4] Revert "feat: add agglomerative clustering algorithm" This reverts commit f8c559bcfe19696182dfd8e5d0a5cde2831a6c85. --- README.md | 2 +- geochemistrypi/data_mining/constants.py | 2 +- .../data_mining/model/clustering.py | 166 ++++-------------- .../func/algo_clustering/_agglomerative.py | 29 --- geochemistrypi/data_mining/process/cluster.py | 9 +- 5 files changed, 41 insertions(+), 167 deletions(-) delete mode 100644 geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py diff --git a/README.md b/README.md index f1ce6c77..51892e14 100644 --- a/README.md +++ b/README.md @@ -205,7 +205,7 @@ The whole package is under construction and the documentation is progressively e Email: sanyhew1097618435@163.com **Technical Group:** -+ Jianming Zhao (Jamie, Zhejiang University, China) ++ Jianming Zhao (Jamie, Jilin University, Changchun, China) + Jianhao Sun (Jin, China University of Geosciences, Wuhan, China) + Kaixin Zheng (Hayne, Sun Yat-sen University, China) + Jianing Wang (National University of Singapore, Singapore) diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py index 8822ea74..2b8d1fc6 100644 --- a/geochemistrypi/data_mining/constants.py +++ b/geochemistrypi/data_mining/constants.py @@ -64,7 +64,7 @@ # "Decision Tree", # Histogram-based Gradient Boosting, ] -CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative"] +CLUSTERING_MODELS = ["KMeans", "DBSCAN"] DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"] # The model can deal with missing values diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py index 1d1cbff0..93bdf85b 100644 --- a/geochemistrypi/data_mining/model/clustering.py +++ b/geochemistrypi/data_mining/model/clustering.py @@ -5,10 +5,9 @@ import mlflow import numpy as np -from numpy.typing import ArrayLike import pandas as pd from rich import print 
-from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering +from sklearn.cluster import DBSCAN, AffinityPropagation, KMeans from ..constants import MLFLOW_ARTIFACT_DATA_PATH, MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH from ..utils.base import clear_output, save_data, save_fig, save_text @@ -16,7 +15,6 @@ from .func.algo_clustering._common import plot_results, plot_silhouette_diagram, score from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters, dbscan_result_plot from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters, plot_silhouette_diagram_kmeans, scatter2d, scatter3d -from .func.algo_clustering._agglomerative import agglomerative_manual_hyper_parameters class ClusteringWorkflowBase(WorkflowBase): @@ -199,7 +197,7 @@ def __init__( might change in the future for a better heuristic. References - ---------- + ---------------------------------------- Scikit-learn API: sklearn.cluster.KMeans https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html """ @@ -406,7 +404,7 @@ def __init__( The number of parallel jobs to run. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details. References - ---------- + ---------------------------------------- Scikit-learn API: sklearn.cluster.DBSCAN https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html """ @@ -463,138 +461,43 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None: ) -class Agglomerative(ClusteringWorkflowBase): - """The automation workflow of using Agglomerative Clustering to make insightful products.""" - - name = "Agglomerative" - special_function = [] +class AffinityPropagationClustering(ClusteringWorkflowBase): + name = "AffinityPropagation" def __init__( self, - n_clusters: int = 2, - affinity: str = "euclidean", - metric: str = None, - memory: str = None, - connectivity: ArrayLike = None, - compute_full_tree: str = "auto", - linkage: str = "ward", - distance_threshold: float = None, - compute_distances: bool = False, - ) -> None: - """ - Parameters - ---------- - n_clusters : int or None, default=2 - The number of clusters to find. It must be ``None`` if - ``distance_threshold`` is not ``None``. - - affinity : str or callable, default='euclidean' - Metric used to compute the linkage. Can be "euclidean", "l1", "l2", - "manhattan", "cosine", or "precomputed". - If linkage is "ward", only "euclidean" is accepted. - If "precomputed", a distance matrix (instead of a similarity matrix) - is needed as input for the fit method. - - memory : str or object with the joblib.Memory interface, default=None - Used to cache the output of the computation of the tree. - By default, no caching is done. If a string is given, it is the - path to the caching directory. - - connectivity : array-like or callable, default=None - Connectivity matrix. Defines for each sample the neighboring - samples following a given structure of the data. - This can be a connectivity matrix itself or a callable that transforms - the data into a connectivity matrix, such as derived from - `kneighbors_graph`. Default is ``None``, i.e, the - hierarchical clustering algorithm is unstructured. - - compute_full_tree : 'auto' or bool, default='auto' - Stop early the construction of the tree at ``n_clusters``. This is - useful to decrease computation time if the number of clusters is not - small compared to the number of samples. This option is useful only - when specifying a connectivity matrix. 
Note also that when varying the - number of clusters and using caching, it may be advantageous to compute - the full tree. It must be ``True`` if ``distance_threshold`` is not - ``None``. By default `compute_full_tree` is "auto", which is equivalent - to `True` when `distance_threshold` is not `None` or that `n_clusters` - is inferior to the maximum between 100 or `0.02 * n_samples`. - Otherwise, "auto" is equivalent to `False`. - - linkage : {'ward', 'complete', 'average', 'single'}, default='ward' - Which linkage criterion to use. The linkage criterion determines which - distance to use between sets of observation. The algorithm will merge - the pairs of cluster that minimize this criterion. - - - 'ward' minimizes the variance of the clusters being merged. - - 'average' uses the average of the distances of each observation of - the two sets. - - 'complete' or 'maximum' linkage uses the maximum distances between - all observations of the two sets. - - 'single' uses the minimum of the distances between all observations - of the two sets. - - .. versionadded:: 0.20 - Added the 'single' option - - distance_threshold : float, default=None - The linkage distance threshold above which, clusters will not be - merged. If not ``None``, ``n_clusters`` must be ``None`` and - ``compute_full_tree`` must be ``True``. - - .. versionadded:: 0.21 - - compute_distances : bool, default=False - Computes distances between clusters even if `distance_threshold` is not - used. This can be used to make dendrogram visualization, but introduces - a computational and memory overhead. - - .. versionadded:: 0.24 - - References - ---------- - sklearn.cluster.AgglomerativeClustering - https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html - """ + *, + damping=0.5, + max_iter=200, + convergence_iter=15, + copy=True, + preference=None, + affinity="euclidean", + verbose=False, + random_state=None, + ): super().__init__() - self.n_clusters = n_clusters - self.distance_threshold = distance_threshold - self.memory = memory - self.connectivity = connectivity - self.compute_full_tree = compute_full_tree - self.linkage = linkage + self.damping = damping + self.max_iter = max_iter + self.convergence_iter = convergence_iter + self.copy = copy + self.verbose = verbose + self.preference = preference self.affinity = affinity - self.metric = metric - self.compute_distances = compute_distances - - self.model = AgglomerativeClustering( - n_clusters=self.n_clusters, - affinity=self.affinity, - memory=self.memory, - connectivity=self.connectivity, - compute_full_tree=self.compute_full_tree, - linkage=self.linkage, - distance_threshold=self.distance_threshold, - compute_distances=self.compute_distances, + self.random_state = random_state + self.model = AffinityPropagation( + damping=self.damping, + max_iter=self.max_iter, + convergence_iter=self.convergence_iter, + copy=self.copy, + preference=None, + affinity="euclidean", + verbose=False, + random_state=None, ) + self.naming = AffinityPropagationClustering.name - self.naming = Agglomerative.name - - @classmethod - def manual_hyper_parameters(cls) -> Dict: - """Manual hyper-parameters specification.""" - print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-") - hyper_parameters = agglomerative_manual_hyper_parameters() - clear_output() - return hyper_parameters - - def special_components(self, **kwargs) -> None: - """Invoke all special application functions for this algorithms by Scikit-learn framework.""" - pass - - -class 
AffinityPropagationClustering(ClusteringWorkflowBase): - name = "AffinityPropagation" pass @@ -613,6 +516,11 @@ class WardHierarchicalClustering(ClusteringWorkflowBase): pass +class AgglomerativeClustering(ClusteringWorkflowBase): + name = "Agglomerative" + pass + + class OPTICSClustering(ClusteringWorkflowBase): name = "OPTICS" pass diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py b/geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py deleted file mode 100644 index 5d8336a2..00000000 --- a/geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py +++ /dev/null @@ -1,29 +0,0 @@ -from typing import Dict - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from rich import print - -from ....constants import SECTION -from ....data.data_readiness import float_input, num_input, str_input - -def agglomerative_manual_hyper_parameters() -> Dict: - """Manually set hyperparameters. - - Returns - ------- - hyper_parameters : dict - """ - print("N Clusters: The number of clusters to form as well as the number of centroids to generate.") - print("Please specify the number of clusters for agglomerative. A good starting range could be between 2 and 10, such as '4'.") - n_clusters = num_input(SECTION[2], "N Clusters: ") - print("linkage: The linkage criterion determines which distance to use between sets of observation. ") - print("Please specify the linkage criterion. It is generally recommended to leave it set to 'ward'.") - linkages = ["ward", "complete", "average", "single"] - linkage = str_input(linkages, SECTION[2]) - hyper_parameters = { - "n_clusters": n_clusters, - "linkage": linkage, - } - return hyper_parameters \ No newline at end of file diff --git a/geochemistrypi/data_mining/process/cluster.py b/geochemistrypi/data_mining/process/cluster.py index bf1184ea..e7a38ed5 100644 --- a/geochemistrypi/data_mining/process/cluster.py +++ b/geochemistrypi/data_mining/process/cluster.py @@ -3,7 +3,8 @@ from typing import Optional import pandas as pd -from ..model.clustering import ClusteringWorkflowBase, DBSCANClustering, KMeansClustering, Agglomerative + +from ..model.clustering import ClusteringWorkflowBase, DBSCANClustering, KMeansClustering from ._base import ModelSelectionBase @@ -47,12 +48,6 @@ def activate( leaf_size=hyper_parameters["leaf_size"], p=hyper_parameters["p"], ) - elif self.model_name == "Agglomerative": - hyper_parameters = Agglomerative.manual_hyper_parameters() - self.clt_workflow = Agglomerative( - n_clusters=hyper_parameters["n_clusters"], - linkage=hyper_parameters["linkage"], - ) elif self.model_name == "": pass From 3630f3a7bffad98b70e0e9239a889ab6216cf451 Mon Sep 17 00:00:00 2001 From: jmz <45778832+PotatoXi@users.noreply.github.com> Date: Sat, 13 Jan 2024 18:35:58 +0800 Subject: [PATCH 3/4] feat: add agglomerative clustering algorithm --- README.md | 72 +++++---- geochemistrypi/data_mining/constants.py | 2 +- .../data_mining/model/clustering.py | 140 +++++++++++++++++- .../func/algo_clustering/_agglomerative.py | 27 ++++ geochemistrypi/data_mining/process/cluster.py | 8 +- 5 files changed, 213 insertions(+), 36 deletions(-) create mode 100644 geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py diff --git a/README.md b/README.md index 51892e14..fdf81145 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,12 @@
--- -**Documentation**: https://geochemistrypi.readthedocs.io -**Source Code**: https://github.com/ZJUEarthData/geochemistrypi -___ +**Documentation**: ``https://geochemistrypi.readthedocs.io`` + +**Source Code**: ``https://github.com/ZJUEarthData/geochemistrypi`` + +--- Geochemistry π is an **open-sourced highly automated machine learning Python framework** dedicating to build up MLOps level 1 software product for data-driven geochemistry discovery on tabular data. @@ -24,19 +26,20 @@ Core capabilities are: + **Model Inference** Key features are: + + **Easy to use:** The automation of data mining process provides the users with simple number options to choose. + **Extensible:** It allows appending new algorithms through Scikit-learn with automatic hyper parameter searching by FLAML and Ray. + **Traceable**: It integrates MLflow to build special storage mechanism to streamline the end-to-end machine learning lifecycle. Latest Update: follow up by clicking `Starred` and `Watch` on our [GitHub repository](https://github.com/ZJUEarthData/geochemistrypi), then get email notifications of the newest features automatically. -The following figure is the simplified overview of Geochemistry π:-The following figure is the frontend-backend separation architecture of Geochemistry:
@@ -45,37 +48,45 @@ The following figure is the frontend-backend separation architecture of Geochemi ## Quick Installation One instruction to download on **command line**, such as Terminal on macOS, Power Shell on Windows. + ``` pip install geochemistrypi ``` + One instruction to download on **Jupyter Notebook** or **Google Colab**. + ``` !pip install geochemistrypi ``` + Check the latest version of our software: + ``` geochemistrypi --version ``` -**Note**: For more detail on installation, please refer to our online documentation in **Installation Manual** under the section of **FOR USER**. Over there, we highly recommend to use virtual environment (Conda) to avoid dependency version problems. +**Note**: For more detail on installation, please refer to our online documentation in **Installation Manual** under the section of **FOR USER**. Over there, we highly recommend to use virtual environment (Conda) to avoid dependency version problems. ## Quick Update One instruction to update the software to the latest version on **command line**, such as Terminal on macOS, Power Shell on Windows. + ``` pip install --upgrade geochemistrypi ``` + One instruction to download on **Jupyter Notebook** or **Google Colab**. + ``` !pip install --upgrade geochemistrypi ``` + Check the latest version of our software: + ``` geochemistrypi --version ``` - - ## Example **How to run:** After successfully downloading, run this instruction on **command line / Jupyter Notebook / Google Colab** whatever directory it is. @@ -83,25 +94,33 @@ geochemistrypi --version ### Case 1: Run with built-in data set for testing On command line: + ``` geochemistrypi data-mining ``` + On Jupyter Notebook / Google Colab: + ``` !geochemistrypi data-mining ``` + **Note**: There are four built-in data sets corresponding to four kinds of model pattern. ### Case 2: Run with your own data set On command line: + ``` geochemistrypi data-mining --data your_own_data_set.xlsx ``` + On Jupyter Notebook / Google Colab: + ``` !geochemistrypi data-mining --data your_own_data_set.xlsx ``` + **Note**: Currently, `.xlsx` and `.csv` files are supported. Please specify the path your data file exists. For Google Colab, don't forget to upload your dataset first. ### Case 3: Implement model inference on application data @@ -117,11 +136,11 @@ On Jupyter Notebook / Google Colab: ``` !geochemistrypi data-mining --training your_own_training_data.xlsx --inference your_own_inference_data.xlsx ``` + **Note**: Please make sure the column names (data schema) in both training data file and inference data file are the same. Because the operations you perform via our software on the training data will be record automatically and subsequently applied to the inference data in the same order. The training data in our pipeline will be divided into the train set and test set used for training the ML model and evaluating the model's performance. The score includes two types. The first type is the scores from the prediction on the test set while the second type is cv scores from the cross validation on the train set. - ### Case 4: Activate MLflow web interface On command line: @@ -135,6 +154,7 @@ On Jupyter Notebook / Google Colab: ``` !geochemistrypi data-mining --mlflow ``` + **Note**: Once you run our software, there are two folders (`geopi_output` and `geopi_tracking`) generated automatically. Make sure the directory where you execute using the above command should have the genereted file `geopi_tracking`. 
Copy the URL shown on the console into any browser to open the MLflow web interface. The URL is normally like this http://127.0.0.1:5000. Search MLflow online to see more operations and usages. @@ -146,8 +166,6 @@ For more details: Please refer to: + [Geochemistry π - Download and Run the Beta Version (International - Youtube)](https://www.youtube.com/watch?v=EeVaJ3H7_AU&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM&index=9) + [Geochemistry π - Download and Run the Beta Version (China - Bilibili)](https://www.bilibili.com/video/BV1UM4y1Q7Ju/?spm_id_from=333.999.0.0&vd_source=27944ab3b73a78970c1a52a5dcbb9140) - - ## Roadmap ### First Phase @@ -165,6 +183,7 @@ Its data section provides feature engineering based on **arithmatic operation**. Its models section provides both **supervised learning** and **unsupervised learning** methods from **Scikit-learn** framework, including four types of algorithms, regression, classification, clustering, and dimensional reduction. Integrated with **FLAML** and **Ray** framework, it allows the users to run AutoML easily, fastly and cost-effectively on the built-in supervised learning algorithms in our framework. The following figure is the hierarchical architecture of Geochemistry π: +
@@ -173,23 +192,22 @@ The following figure is the hierarchical architecture of Geochemistry π: Currently, we are building three access ways to provide more user-friendly service, including **web portal**, **CLI package** and **API**. It allows the user to perform **continuous training** and **model inference** by automating the ML pipeline and **machine learning lifecycle management** by unique storage mechanism in different access layers. -The following figure is the system architecture diagram:
-The following figure is the design pattern hierarchical architecture:
@@ -197,15 +215,16 @@ The following figure is the storage mechanism:
The whole package is under construction and the documentation is progressively evolving.
-
-
## Team Info
+
**Leader:**
+
+ Can He (Sany, National University of Singapore, Singapore)
Email: sanyhew1097618435@163.com
**Technical Group:**
-+ Jianming Zhao (Jamie, Jilin University, Changchun, China)
+
++ Jianming Zhao (Jamie, Zhejiang University, China)
+ Jianhao Sun (Jin, China University of Geosciences, Wuhan, China)
+ Kaixin Zheng (Hayne, Sun Yat-sen University, China)
+ Jianing Wang (National University of Singapore, Singapore)
@@ -215,6 +234,7 @@ The whole package is under construction and the documentation is progressively e
+ Chengtu Li (Trenki, Henan Polytechnic University, Beijing, China)
**Product Group**:
+
+ Yang Lyu (Daisy, Zhejiang University, China)
+ Wenyu Zhao (Molly, Zhejiang University, China)
+ Keran Li (Kirk, Chengdu University of Technology, China)
@@ -225,8 +245,6 @@ The whole package is under construction and the documentation is progressively e
+ Zhenglin Xu (Garry, Jilin University, China)
+ Junchi Liao (Roceda, University of Electronic Science and Technology of China, China)
-
-
## Join Us :)
**The recruitment of research interns is ongoing !!!**
@@ -234,6 +252,7 @@ The whole package is under construction and the documentation is progressively e
**Key Point: All things are done online, remote work (\*^▽^\*)**
**What can you learn?**
+
+ Learning the full cycle of data mining (Scikit-learn, Ray, MLflow) on tabular data, including the algorithms in regression, classification, clustering, and decomposition.
+ Learning to be a qualified Python developer, including any Python programming contents towards data mining, basic software engineering techniques like frontend (React, TypeScript, Ant Design scaffold) and backend (SQL & NoSQL database, RESTful API, FastAPI) development, and cooperation tools like Git.
@@ -245,6 +264,7 @@ The whole package is under construction and the documentation is progressively e
+ Bonus depending on your performance.
**Current Working Pattern:**
+
+ Online working and cooperation
+ Three weeks per working cycle -> One online meeting per working cycle
+ One cycle report (see below) per cycle - 5 mins to finish
@@ -259,11 +279,10 @@ Chinese Page: https://person.zju.edu.cn/zhangzhou#0
**Do you want to contribute to this open-source program?**
Contact with your CV: sanyhew1097618435@163.com
-
-
## In-house Materials
Materials are in both Chinese and English. Others unshown below are internal materials.
+
1. [Guideline Manual – Geochemistry π (International - Google drive)](https://docs.google.com/document/d/1LjwB5Lazk33E5vbtnFPJio_MyjYQxjEu/edit?usp=sharing&ouid=110717816678586054594&rtpof=true&sd=true)
2. [Guideline Manual – Geochemistry π (China - Tencent Docs)](https://docs.qq.com/doc/DQ21IZUdVQktqRWpm?&u=6868f96d4a384b309036e04e637e367a)
3. [Learning Steps for Newbies – Geochemistry π (International - Google drive)](https://docs.google.com/document/d/1GQO-SXwEx_8midr362pqfxNZtfUf-nA6/edit?usp=sharing&ouid=110717816678586054594&rtpof=true&sd=true)
@@ -277,15 +296,14 @@ Materials are in both Chinese and English. Others unshown below are internal mat
Technical record videos are on Bilibili and Youtube synchronously while other meeting videos are internal materials.
More Videos will be recorded soon.
+
1. [ZJU_Earth_Data Introduction (Geochemical Data, Python, Geochemistry π) - Prof. Zhang](https://www.bilibili.com/video/BV1Lf4y1w7EK?spm_id_from=333.999.0.0)
2. [How to Collaborate and Provide Bug Report on Geochemistry π Through GitHub - Can He (Sany)](https://www.youtube.com/watch?v=1DWoEsqsfvQ&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM&index=3)
3. [Geochemistry π - Download and Run the Beta Version](https://www.youtube.com/watch?v=EeVaJ3H7_AU&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM&index=9)
4. [How to Create and Use Virtual Environment on Geochemistry π - Can He (Sany)](https://www.youtube.com/watch?v=4KFi7OXxD-c&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM&index=4)
5. [How to use Github-Desktop in conflict resolution - Qiuhao Zhao (Brad)](https://www.youtube.com/watch?v=KT1g5JpuUVI&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM)
-6. [Virtual Environment & Packages On Windows - Jianming Zhao (Jamie)](https://www.youtube.com/watch?v=e4VqSBuNp_o&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM&index=2)
-7. [Git Workflow & Coordinating Synchronization - Jianming Zhao (Jamie)](https://www.bilibili.com/video/BV1Sa4y1f74k?spm_id_from=333.999.0.0&vd_source=9adcf2c5fdeffe1d11c89d441ef598ba)
-
-
+6. [Virtual Environment & Packages On Windows - Jianming Zhao (Jamie)](https://www.youtube.com/watch?v=e4VqSBuNp_o&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM&index=2)
+7. [Git Workflow & Coordinating Synchronization - Jianming Zhao (Jamie)](https://www.bilibili.com/video/BV1Sa4y1f74k?spm_id_from=333.999.0.0&vd_source=9adcf2c5fdeffe1d11c89d441ef598ba)
## Contributors
diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py
index 2b8d1fc6..8822ea74 100644
--- a/geochemistrypi/data_mining/constants.py
+++ b/geochemistrypi/data_mining/constants.py
@@ -64,7 +64,7 @@
# "Decision Tree",
# Histogram-based Gradient Boosting,
]
-CLUSTERING_MODELS = ["KMeans", "DBSCAN"]
+CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative"]
DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"]
# The model can deal with missing values
diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py
index 93bdf85b..1dfad172 100644
--- a/geochemistrypi/data_mining/model/clustering.py
+++ b/geochemistrypi/data_mining/model/clustering.py
@@ -6,12 +6,14 @@
import mlflow
import numpy as np
import pandas as pd
+from numpy.typing import ArrayLike
from rich import print
-from sklearn.cluster import DBSCAN, AffinityPropagation, KMeans
+from sklearn.cluster import DBSCAN, AffinityPropagation, AgglomerativeClustering, KMeans
from ..constants import MLFLOW_ARTIFACT_DATA_PATH, MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH
from ..utils.base import clear_output, save_data, save_fig, save_text
from ._base import WorkflowBase
+from .func.algo_clustering._agglomerative import agglomerative_manual_hyper_parameters
from .func.algo_clustering._common import plot_results, plot_silhouette_diagram, score
from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters, dbscan_result_plot
from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters, plot_silhouette_diagram_kmeans, scatter2d, scatter3d
@@ -404,7 +406,7 @@ def __init__(
The number of parallel jobs to run. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details.
References
- ----------------------------------------
+ ----------
Scikit-learn API: sklearn.cluster.DBSCAN
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
"""
@@ -461,6 +463,135 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
)
+class Agglomerative(ClusteringWorkflowBase):
+ """The automation workflow of using Agglomerative Clustering to make insightful products."""
+
+ name = "Agglomerative"
+ special_function = []
+
+ def __init__(
+ self,
+ n_clusters: int = 2,
+ *,
+ affinity: str = "euclidean",
+ memory: str = None,
+ connectivity: ArrayLike = None,
+ compute_full_tree: str = "auto",
+ linkage: str = "ward",
+ distance_threshold: float = None,
+ compute_distances: bool = False,
+ ) -> None:
+ """
+ Parameters
+ ----------
+ n_clusters : int or None, default=2
+ The number of clusters to find. It must be ``None`` if
+ ``distance_threshold`` is not ``None``.
+
+ affinity : str or callable, default='euclidean'
+ Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
+ "manhattan", "cosine", or "precomputed".
+ If linkage is "ward", only "euclidean" is accepted.
+ If "precomputed", a distance matrix (instead of a similarity matrix)
+ is needed as input for the fit method.
+
+ memory : str or object with the joblib.Memory interface, default=None
+ Used to cache the output of the computation of the tree.
+ By default, no caching is done. If a string is given, it is the
+ path to the caching directory.
+
+ connectivity : array-like or callable, default=None
+ Connectivity matrix. Defines for each sample the neighboring
+ samples following a given structure of the data.
+ This can be a connectivity matrix itself or a callable that transforms
+ the data into a connectivity matrix, such as derived from
+ `kneighbors_graph`. Default is ``None``, i.e., the
+ hierarchical clustering algorithm is unstructured.
+
+ compute_full_tree : 'auto' or bool, default='auto'
+ Stop early the construction of the tree at ``n_clusters``. This is
+ useful to decrease computation time if the number of clusters is not
+ small compared to the number of samples. This option is useful only
+ when specifying a connectivity matrix. Note also that when varying the
+ number of clusters and using caching, it may be advantageous to compute
+ the full tree. It must be ``True`` if ``distance_threshold`` is not
+ ``None``. By default `compute_full_tree` is "auto", which is equivalent
+ to `True` when `distance_threshold` is not `None` or that `n_clusters`
+ is inferior to the maximum between 100 or `0.02 * n_samples`.
+ Otherwise, "auto" is equivalent to `False`.
+
+ linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
+ Which linkage criterion to use. The linkage criterion determines which
+ distance to use between sets of observations. The algorithm will merge
+ the pairs of clusters that minimize this criterion.
+
+ - 'ward' minimizes the variance of the clusters being merged.
+ - 'average' uses the average of the distances of each observation of
+ the two sets.
+ - 'complete' or 'maximum' linkage uses the maximum distances between
+ all observations of the two sets.
+ - 'single' uses the minimum of the distances between all observations
+ of the two sets.
+
+ .. versionadded:: 0.20
+ Added the 'single' option
+
+ distance_threshold : float, default=None
+ The linkage distance threshold above which clusters will not be
+ merged. If not ``None``, ``n_clusters`` must be ``None`` and
+ ``compute_full_tree`` must be ``True``.
+
+ .. versionadded:: 0.21
+
+ compute_distances : bool, default=False
+ Computes distances between clusters even if `distance_threshold` is not
+ used. This can be used to make dendrogram visualization, but introduces
+ a computational and memory overhead.
+
+ .. versionadded:: 0.24
+
+ References
+ ----------
+ Scikit-learn API: sklearn.cluster.AgglomerativeClustering
+ https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
+ """
+
+ super().__init__()
+ self.n_clusters = n_clusters
+ self.affinity = affinity
+ self.distance_threshold = distance_threshold
+ self.memory = memory
+ self.connectivity = connectivity
+ self.compute_full_tree = compute_full_tree
+ self.linkage = linkage
+ self.compute_distances = compute_distances
+
+ self.model = AgglomerativeClustering(
+ n_clusters=self.n_clusters,
+ affinity=self.affinity,
+ memory=self.memory,
+ connectivity=self.connectivity,
+ compute_full_tree=self.compute_full_tree,
+ linkage=self.linkage,
+ distance_threshold=self.distance_threshold,
+ compute_distances=self.compute_distances,
+ )
+
+ self.naming = Agglomerative.name
+
+ @classmethod
+ def manual_hyper_parameters(cls) -> Dict:
+ """Manual hyper-parameters specification."""
+ print(f"-*-*- {cls.name} - Hyper-parameters Specification -*-*-")
+ hyper_parameters = agglomerative_manual_hyper_parameters()
+ clear_output()
+ return hyper_parameters
+
+ def special_components(self, **kwargs) -> None:
+ """Invoke all special application functions for this algorithm by the Scikit-learn framework."""
+ pass
+
+
class AffinityPropagationClustering(ClusteringWorkflowBase):
name = "AffinityPropagation"
@@ -516,11 +647,6 @@ class WardHierarchicalClustering(ClusteringWorkflowBase):
pass
-class AgglomerativeClustering(ClusteringWorkflowBase):
- name = "Agglomerative"
- pass
-
-
class OPTICSClustering(ClusteringWorkflowBase):
name = "OPTICS"
pass
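Reviewer's aside (not part of the diff): the `Agglomerative` workflow class added above is a thin wrapper around `sklearn.cluster.AgglomerativeClustering`, whose parameters its docstring documents. The sketch below shows that estimator used on its own, illustrating the `n_clusters`/`distance_threshold` exclusivity and the default `'ward'` linkage; the toy array `X` is invented for the example.

```
# Illustrative only -- minimal use of the scikit-learn estimator that the
# Agglomerative workflow class wraps. Toy data invented for the example.
import numpy as np
from sklearn.cluster import AgglomerativeClustering

X = np.array([[1.0, 2.0], [1.2, 1.9], [8.0, 8.1],
              [8.2, 7.9], [0.9, 2.1], [7.9, 8.3]])

# Fixed number of clusters with the default 'ward' linkage (euclidean only).
model = AgglomerativeClustering(n_clusters=2, linkage="ward")
labels = model.fit_predict(X)
print(labels)  # two well-separated groups, e.g. [0, 0, 1, 1, 0, 1] (label order may differ)

# Alternatively, cut the merge tree by distance instead of cluster count:
# n_clusters must then be None and the full tree is computed.
model_by_distance = AgglomerativeClustering(n_clusters=None, distance_threshold=5.0)
print(model_by_distance.fit_predict(X))
```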
diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py b/geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py
new file mode 100644
index 00000000..f847cdb3
--- /dev/null
+++ b/geochemistrypi/data_mining/model/func/algo_clustering/_agglomerative.py
@@ -0,0 +1,27 @@
+from typing import Dict
+
+from rich import print
+
+from ....constants import SECTION
+from ....data.data_readiness import num_input, str_input
+
+
+def agglomerative_manual_hyper_parameters() -> Dict:
+ """Manually set hyperparameters.
+
+ Returns
+ -------
+ hyper_parameters : dict
+ """
+    print("N Clusters: The number of clusters to find, i.e. how many groups the samples will be merged into.")
+    print("Please specify the number of clusters for agglomerative clustering. A good starting range could be between 2 and 10, such as '4'.")
+ n_clusters = num_input(SECTION[2], "N Clusters: ")
+    print("Linkage: The linkage criterion determines which distance to use between sets of observations.")
+ print("Please specify the linkage criterion. It is generally recommended to leave it set to 'ward'.")
+ linkages = ["ward", "complete", "average", "single"]
+ linkage = str_input(linkages, SECTION[2])
+ hyper_parameters = {
+ "n_clusters": n_clusters,
+ "linkage": linkage,
+ }
+ return hyper_parameters
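Reviewer's aside (not part of the diff): `agglomerative_manual_hyper_parameters` only prompts for `n_clusters` and `linkage`, so a sanity check between the prompts and model construction could look roughly like the sketch below. The function name, messages, and the `affinity` argument are hypothetical additions for illustration; the constraints themselves (valid linkage values, 'ward' requiring euclidean distances) come from the docstring added in clustering.py above.

```
# Illustrative only -- a hypothetical validation helper, not part of this patch.
def check_agglomerative_hyper_parameters(n_clusters: int, linkage: str, affinity: str = "euclidean") -> None:
    if linkage not in ("ward", "complete", "average", "single"):
        raise ValueError(f"Unknown linkage criterion: {linkage!r}")
    if linkage == "ward" and affinity != "euclidean":
        # Constraint documented by scikit-learn: 'ward' only supports euclidean distances.
        raise ValueError("'ward' linkage only accepts the 'euclidean' metric.")
    if n_clusters is not None and n_clusters < 2:
        raise ValueError("n_clusters should be at least 2 for a meaningful clustering.")

check_agglomerative_hyper_parameters(n_clusters=4, linkage="ward")
```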
diff --git a/geochemistrypi/data_mining/process/cluster.py b/geochemistrypi/data_mining/process/cluster.py
index e7a38ed5..03686770 100644
--- a/geochemistrypi/data_mining/process/cluster.py
+++ b/geochemistrypi/data_mining/process/cluster.py
@@ -4,7 +4,7 @@
import pandas as pd
-from ..model.clustering import ClusteringWorkflowBase, DBSCANClustering, KMeansClustering
+from ..model.clustering import Agglomerative, ClusteringWorkflowBase, DBSCANClustering, KMeansClustering
from ._base import ModelSelectionBase
@@ -48,6 +48,12 @@ def activate(
leaf_size=hyper_parameters["leaf_size"],
p=hyper_parameters["p"],
)
+ elif self.model_name == "Agglomerative":
+ hyper_parameters = Agglomerative.manual_hyper_parameters()
+ self.clt_workflow = Agglomerative(
+ n_clusters=hyper_parameters["n_clusters"],
+ linkage=hyper_parameters["linkage"],
+ )
elif self.model_name == "":
pass
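One more reviewer's aside before the revert patch below (not part of the diff): the `compute_distances` flag documented in the new docstring exists mainly to support dendrogram plots. A sketch that closely follows the scikit-learn documentation example for turning a fitted `AgglomerativeClustering` model into a SciPy linkage matrix is shown here; `plot_dendrogram` is a helper name chosen for the example and is not something this patch adds.

```
# Illustrative only -- dendrogram visualization enabled by compute_distances=True.
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering


def plot_dendrogram(model: AgglomerativeClustering, **kwargs) -> None:
    # Count the samples under each node of the merge tree.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    # Columns: merged children pairs, merge distances, subtree sizes.
    linkage_matrix = np.column_stack([model.children_, model.distances_, counts]).astype(float)
    dendrogram(linkage_matrix, **kwargs)


# distances_ is only populated when compute_distances=True (or when
# distance_threshold is set), which is why the workflow exposes that flag.
model = AgglomerativeClustering(n_clusters=3, compute_distances=True).fit(np.random.rand(30, 4))
plot_dendrogram(model, truncate_mode="level", p=3)
plt.show()
```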
From df372c552e6ce8c7474715df503d3c34c7205534 Mon Sep 17 00:00:00 2001
From: jmz <45778832+PotatoXi@users.noreply.github.com>
Date: Sat, 13 Jan 2024 19:58:44 +0800
Subject: [PATCH 4/4] Revert "feat: add agglomerative clustering algorithm"
This reverts commit ffcdfc7e99f03a630cd4bcb1a8a54177d91fb139, reversing
changes made to 3630f3a7bffad98b70e0e9239a889ab6216cf451.
---
README.md | 2 +-
geochemistrypi/cli.py | 8 +-
geochemistrypi/data_mining/cli_pipeline.py | 114 +++----
geochemistrypi/data_mining/constants.py | 2 +-
geochemistrypi/data_mining/data/inference.py | 4 +-
.../data_mining/data/preprocessing.py | 6 +-
.../data_mining/model/classification.py | 51 +--
.../data_mining/model/clustering.py | 246 ++++++++------
.../data_mining/model/decomposition.py | 15 +-
.../func/algo_classification/_extra_trees.py | 4 +-
.../_logistic_regression.py | 8 +-
.../_multi_layer_perceptron.py | 6 +-
.../model/func/algo_classification/_rf.py | 4 +-
.../model/func/algo_classification/_svc.py | 4 +-
.../model/func/algo_clustering/_common.py | 308 ++----------------
.../model/func/algo_clustering/_dbscan.py | 136 +++++---
.../model/func/algo_clustering/_kmeans.py | 161 +++++++++
.../model/func/algo_decomposition/_mds.py | 2 +-
.../model/func/algo_regression/_extra_tree.py | 4 +-
.../func/algo_regression/_lasso_regression.py | 4 +-
.../algo_regression/_linear_regression.py | 2 +-
.../_multi_layer_perceptron.py | 6 +-
.../algo_regression/_polynomial_regression.py | 4 +-
.../model/func/algo_regression/_rf.py | 4 +-
.../model/func/algo_regression/_svr.py | 4 +-
.../data_mining/model/regression.py | 53 +--
geochemistrypi/start_cli_pipeline.py | 2 +-
27 files changed, 545 insertions(+), 619 deletions(-)
diff --git a/README.md b/README.md
index 711db838..fdf81145 100644
--- a/README.md
+++ b/README.md
@@ -107,7 +107,7 @@ On Jupyter Notebook / Google Colab:
**Note**: There are four built-in data sets corresponding to four kinds of model pattern.
-### Case 2: Run with your own data set without model inference
+### Case 2: Run with your own data set
On command line:
diff --git a/geochemistrypi/cli.py b/geochemistrypi/cli.py
index 20bb44c7..17d2e6dc 100644
--- a/geochemistrypi/cli.py
+++ b/geochemistrypi/cli.py
@@ -40,7 +40,7 @@ def main(version: Optional[bool] = typer.Option(None, "--version", "-v", help="S
def data_mining(
data: str = typer.Option("", help="The path of the training data without model inference."),
training: str = typer.Option("", help="The path of the training data."),
- application: str = typer.Option("", help="The path of the inference data."),
+ inference: str = typer.Option("", help="The path of the inference data."),
mlflow: bool = typer.Option(False, help="Start the mlflow server."),
web: bool = False,
) -> None:
@@ -81,11 +81,11 @@ def start_mlflow():
if data:
cli_pipeline(data)
# If the training data and inference data are provided, start the CLI pipeline with continuous training and inference
- elif training and application:
- cli_pipeline(training, application)
+ elif training and inference:
+ cli_pipeline(training, inference)
# If no data is provided, use built-in data to start the CLI pipeline with continuous training and inference
else:
- cli_pipeline(training, application)
+ cli_pipeline(training, inference)
@app.command()
diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py
index 1859ce73..f7b7e336 100644
--- a/geochemistrypi/data_mining/cli_pipeline.py
+++ b/geochemistrypi/data_mining/cli_pipeline.py
@@ -45,7 +45,7 @@
from .utils.mlflow_utils import retrieve_previous_experiment_id
-def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = None) -> None:
+def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = None) -> None:
"""The command line interface software for Geochemistry π.
The business logic of this CLI software can be found in the figures in the README.md file.
It provides three MLOps core functionalities:
@@ -58,15 +58,11 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
training_data_path : str
The path of the training data.
- application_data_path : str, optional
- The path of the application data, by default None
+ inference_data_path : str, optional
+ The path of the inference data, by default None
"""
- # Local test: Uncomment the following line to utilize built-in datasets to test the pipeline. Don't forget to modify the path value to be consistent with your own location.
- training_data_path = "/Users/can/Documents/github/work/geo_ml/geochemistrypi/geochemistrypi/data_mining/data/dataset/Data_Classification.xlsx"
- application_data_path = "/Users/can/Documents/github/work/geo_ml/geochemistrypi/geochemistrypi/data_mining/data/dataset/Data_Classification.xlsx"
-
- # Local test: If the argument is False, hide all Python level warnings. Developers can turn it on by setting the argument to True.
+ # TODO: If the argument is False, hide all Python level warnings. Developers can turn it on by setting the argument to True.
show_warning(False)
os.makedirs(OUTPUT_PATH, exist_ok=True)
@@ -89,22 +85,22 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
print("[bold red]No Training Data File Provided![/bold red]")
print("[bold green]Built-in Data Loading.[/bold green]")
- # <-- User Application Data Loading -->
- with console.status("[bold green]Application Data Loading...[/bold green]", spinner="dots"):
+ # <-- User Inference Data Loading -->
+ with console.status("[bold green]Inference Data Loading...[/bold green]", spinner="dots"):
sleep(0.75)
is_built_in_inference_data = False
- if training_data_path and application_data_path:
+ if training_data_path and inference_data_path:
# If the user provides file name, then load the inference data from the file.
- inference_data = read_data(file_path=application_data_path, is_own_data=1)
- print("[bold green]Successfully Loading Own Application Data![bold green]")
- elif training_data_path and (not application_data_path):
+ inference_data = read_data(file_path=inference_data_path, is_own_data=1)
+ print("[bold green]Successfully Loading Own Inference Data![bold green]")
+ elif training_data_path and (not inference_data_path):
# If the user doesn't provide the inference data path, it means that the user doesn't want to run the model inference.
inference_data = None
- print("[bold red]No Application Data File Provided![/bold red]")
- elif (not training_data_path) and (not application_data_path):
+ print("[bold red]No Inference Data File Provided![/bold red]")
+ elif (not training_data_path) and (not inference_data_path):
is_built_in_inference_data = True
- print("[bold red]No Application Data File Provided![/bold red]")
- print("[bold green]Built-in Application Data Loading.[/bold green]")
+ print("[bold red]No Inference Data File Provided![/bold red]")
+ print("[bold green]Built-in Inference Data Loading.[/bold green]")
# <-- Dependency Checking -->
with console.status("[bold green]Dependency Checking...[/bold green]", spinner="dots"):
@@ -198,23 +194,23 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
show_data_columns(data.columns)
clear_output()
- # <--- Built-in Application Data Loading --->
- logger.debug("Built-in Application Data Loading")
+ # <--- Built-in Inference Data Loading --->
+ logger.debug("Built-in Inference Data Loading")
# If the user doesn't provide training data path and inference data path, then use the built-in inference data.
if is_built_in_inference_data:
- print("-*-*- Built-in Application Data Option-*-*-")
+ print("-*-*- Built-in Inference Data Option-*-*-")
num2option(TEST_DATA_OPTION)
built_in_inference_data_num = limit_num_input(TEST_DATA_OPTION, SECTION[0], num_input)
if built_in_inference_data_num == 1:
- application_data_path = "InferenceData_Regression.xlsx"
+ inference_data_path = "InferenceData_Regression.xlsx"
elif built_in_inference_data_num == 2:
- application_data_path = "InferenceData_Classification.xlsx"
+ inference_data_path = "InferenceData_Classification.xlsx"
elif built_in_inference_data_num == 3:
- application_data_path = "InferenceData_Clustering.xlsx"
+ inference_data_path = "InferenceData_Clustering.xlsx"
elif built_in_inference_data_num == 4:
- application_data_path = "InferenceData_Decomposition.xlsx"
- inference_data = read_data(file_path=application_data_path)
- print(f"Successfully loading the built-in inference data set '{application_data_path}'.")
+ inference_data_path = "InferenceData_Decomposition.xlsx"
+ inference_data = read_data(file_path=inference_data_path)
+ print(f"Successfully loading the built-in inference data set '{inference_data_path}'.")
show_data_columns(inference_data.columns)
clear_output()
@@ -264,14 +260,14 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
# 2. Don't drop the rows with missing values, before implementing the model inference, the inference data set should be imputed as well.
# Because dropping the rows with missing values use pandas.DataFrame.dropna() method, while imputing the missing values use sklearn.impute.SimpleImputer() method.
drop_rows_with_missing_value_flag = False
- # clear_output()
+ clear_output()
if missing_value_flag:
- clear_output()
# Ask the user whether to use imputation techniques to deal with the missing values.
- print("-*-*- Missing Values Process -*-*-")
+ print("-*-*- Missing Values Process-*-*-")
print("Do you want to deal with the missing values?")
num2option(OPTION)
is_process_missing_value = limit_num_input(OPTION, SECTION[1], num_input)
+ clear_output()
if is_process_missing_value == 1:
process_missing_value_flag = True
# If the user wants to deal with the missing values, then ask the user which strategy to use.
@@ -283,8 +279,6 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
if missing_value_strategy_num == 1:
# Drop the rows with missing values
data_selected_dropped = data_selected.dropna()
- # Reset the index of the data set after dropping the rows with missing values.
- data_selected_dropped = data_selected_dropped.reset_index(drop=True)
print("Successfully drop the rows with missing values.")
print("The Selected Data Set After Dropping:")
print(data_selected_dropped)
@@ -301,12 +295,10 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
# Don't deal with the missing values, which means neither drop the rows with missing values nor use imputation techniques.
imputed_flag = False
save_data(data_selected, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
- clear_output()
else:
# If the selected data set doesn't have missing values, then don't deal with the missing values.
imputed_flag = False
save_data(data_selected, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
- clear_output()
data_selected = data_selected_dropped if drop_rows_with_missing_value_flag else data_selected
# If the selected data set contains missing values and the user wants to deal with the missing values and choose not to drop the rows with missing values,
# then use imputation techniques to deal with the missing values.
@@ -376,10 +368,10 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
# <--- Data Segmentation --->
# divide X and y data set when it is supervised learning
- logger.debug("Data Divsion")
+ logger.debug("Data Split")
if mode_num == 1 or mode_num == 2:
# Supervised learning
- print("-*-*- Data Segmentation - X Set and Y Set -*-*-")
+ print("-*-*- Data Split - X Set and Y Set -*-*-")
print("Divide the processing data set into X (feature value) and Y (target value) respectively.")
# create X data set
print("Selected sub data set to create X data set:")
@@ -394,22 +386,6 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
save_data(X, "X Without Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()
- # Create Y data set
- print("-*-*- Data Segmentation - X Set and Y Set-*-*-")
- print("Selected sub data set to create Y data set:")
- show_data_columns(data_selected_imputed_fe.columns)
- print("The selected Y data set:")
- print("Notice: Normally, please choose only one column to be tag column Y, not multiple columns.")
- print("Notice: For classification model training, please choose the label column which has distinctive integers.")
- y = create_sub_data_set(data_selected_imputed_fe)
- print("Successfully create Y data set.")
- print("The Selected Data Set:")
- print(y)
- print("Basic Statistical Information: ")
- basic_statistic(y)
- save_data(y, "Y", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
- clear_output()
-
# <--- Feature Scaling --->
print("-*-*- Feature Scaling on X Set -*-*-")
num2option(OPTION)
@@ -430,8 +406,24 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
feature_scaling_config = {}
clear_output()
+ # Create Y data set
+ print("-*-*- Data Split - X Set and Y Set-*-*-")
+ print("Selected sub data set to create Y data set:")
+ show_data_columns(data_selected_imputed_fe.columns)
+ print("The selected Y data set:")
+ print("Notice: Normally, please choose only one column to be tag column Y, not multiple columns.")
+ print("Notice: For classification model training, please choose the label column which has distinctive integers.")
+ y = create_sub_data_set(data_selected_imputed_fe)
+ print("Successfully create Y data set.")
+ print("The Selected Data Set:")
+ print(y)
+ print("Basic Statistical Information: ")
+ basic_statistic(y)
+ save_data(y, "Y", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ clear_output()
+
# <--- Feature Selection --->
- print("-*-*- Feature Selection on X set -*-*-")
+ print("-*-*- Feature Selection -*-*-")
num2option(OPTION)
is_feature_selection = limit_num_input(OPTION, SECTION[1], num_input)
if is_feature_selection == 1:
@@ -543,7 +535,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
is_inference = False
# If the model is supervised learning, then allow the user to use model inference.
if mode_num == 1 or mode_num == 2:
- print("-*-*- Feature Engineering on Application Data -*-*-")
+ print("-*-*- Feature Engineering on Inference Data -*-*-")
is_inference = True
selected_columns = X_train.columns
if inference_data is not None:
@@ -553,15 +545,15 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
new_feature_builder = FeatureConstructor(inference_data)
inference_data_fe = new_feature_builder.batch_build(feature_engineering_config)
inference_data_fe_selected = inference_data_fe[selected_columns]
- save_data(inference_data, "Application Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
- save_data(inference_data_fe, "Application Data Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
- save_data(inference_data_fe_selected, "Application Data Feature-Engineering Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ save_data(inference_data, "Inference Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ save_data(inference_data_fe, "Inference Data Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ save_data(inference_data_fe_selected, "Inference Data Feature-Engineering Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
else:
print("You have not applied feature engineering to the training data.")
print("Hence, no feature engineering operation will be applied to the inference data.")
inference_data_fe_selected = inference_data[selected_columns]
- save_data(inference_data, "Application Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
- save_data(inference_data_fe_selected, "Application Data Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ save_data(inference_data, "Inference Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ save_data(inference_data_fe_selected, "Inference Data Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
else:
# If the user doesn't provide the inference data path, it means that the user doesn't want to run the model inference.
print("You did not enter inference data.")
@@ -604,7 +596,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
if drop_rows_with_missing_value_flag:
inference_data_fe_selected_dropped = inference_data_fe_selected.dropna()
model_inference(inference_data_fe_selected_dropped, is_inference, run, transformer_config, transform_pipeline)
- save_data(inference_data_fe_selected_dropped, "Application Data Feature-Engineering Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ save_data(inference_data_fe_selected_dropped, "Inference Data Feature-Engineering Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
else:
model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline)
clear_output()
@@ -641,7 +633,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
if drop_rows_with_missing_value_flag:
inference_data_fe_selected_dropped = inference_data_fe_selected.dropna()
model_inference(inference_data_fe_selected_dropped, is_inference, run, transformer_config, transform_pipeline)
- save_data(inference_data_fe_selected_dropped, "Application Data Feature-Engineering Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ save_data(inference_data_fe_selected_dropped, "Inference Data Feature-Engineering Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
else:
model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline)
clear_output()
diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py
index 6ccc7a23..8822ea74 100644
--- a/geochemistrypi/data_mining/constants.py
+++ b/geochemistrypi/data_mining/constants.py
@@ -100,6 +100,6 @@
CUSTOMIZE_LABEL_STRATEGY = ["Automatic Coding", "Custom Numeric Labels", "Custom Non-numeric Labels"]
-FEATURE_SELECTION_STRATEGY = ["Generic Univariate Select", "Select K Best"]
+FEATURE_SELECTION_STRATEGY = ["GenericUnivariateSelect", "SelectKBest"]
CALCULATION_METHOD_OPTION = ["Micro", "Macro", "Weighted"]
diff --git a/geochemistrypi/data_mining/data/inference.py b/geochemistrypi/data_mining/data/inference.py
index db78c434..0ebd6bce 100644
--- a/geochemistrypi/data_mining/data/inference.py
+++ b/geochemistrypi/data_mining/data/inference.py
@@ -129,7 +129,7 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: objec
"""
# If is_inference is True, then run the model inference.
if is_inference is True:
- print("Use the trained model to make predictions on the application data.")
+ print("Use the trained model to make predictions on the inference data.")
# If transformer_config is not {}, then transform the inference data with the transform pipeline.
if transformer_config:
inference_data_transformed = transform_pipeline.transform(inference_data)
@@ -139,4 +139,4 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: objec
inference_data_predicted_np = loaded_model.predict(inference_data_transformed)
inference_data_predicted = np2pd(inference_data_predicted_np, ["Predicted Value"])
GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
- save_data(inference_data_predicted, "Application Data Predicted", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ save_data(inference_data_predicted, "Inference Data Predicted", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
diff --git a/geochemistrypi/data_mining/data/preprocessing.py b/geochemistrypi/data_mining/data/preprocessing.py
index 95a6a7a3..4587c7c8 100644
--- a/geochemistrypi/data_mining/data/preprocessing.py
+++ b/geochemistrypi/data_mining/data/preprocessing.py
@@ -73,7 +73,7 @@ def feature_selector(X: pd.DataFrame, y: pd.DataFrame, feature_selection_task: i
X_selected : pd.DataFrame
The feature dataset after selecting.
"""
- print("-- Original Features --")
+ print("--Original Features-")
show_data_columns(X.columns)
features_num = len(X.columns)
@@ -85,9 +85,9 @@ def feature_selector(X: pd.DataFrame, y: pd.DataFrame, feature_selection_task: i
elif feature_selection_task == 2:
score_func = f_classif
- if method[method_idx] == "Generic Univariate Select":
+ if method[method_idx] == "GenericUnivariateSelect":
selector = GenericUnivariateSelect(score_func=score_func, mode="k_best", param=features_retain_num)
- elif method[method_idx] == "Select K Best":
+ elif method[method_idx] == "SelectKBest":
selector = SelectKBest(score_func=score_func, k=features_retain_num)
try:
diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py
index c2e3f280..910f34d3 100644
--- a/geochemistrypi/data_mining/model/classification.py
+++ b/geochemistrypi/data_mining/model/classification.py
@@ -533,10 +533,6 @@ def __init__(
self.decision_function_shape = decision_function_shape
self.break_ties = break_ties
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = SVC(
C=self.C,
kernel=self.kernel,
@@ -788,10 +784,6 @@ def __init__(
self.class_weight = class_weight
self.ccp_alpha = ccp_alpha
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = DecisionTreeClassifier(
criterion=self.criterion,
splitter=self.splitter,
@@ -924,7 +916,7 @@ def __init__(
bootstrap: bool = True,
oob_score: bool = False,
n_jobs: Optional[int] = -1,
- random_state: Optional[int] = None,
+ random_state: Optional[int] = 42,
verbose: int = 0,
warm_start: bool = False,
class_weight: Union[str, dict, list[dict], None] = None,
@@ -1126,16 +1118,13 @@ def __init__(
self.bootstrap = bootstrap
self.oob_score = oob_score
self.n_jobs = n_jobs
+ self.random_state = random_state
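+ # Keep the user-provided random seed on the instance instead of relying on the parent class 'WorkflowBase' default.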
self.verbose = verbose
self.warm_start = warm_start
self.class_weight = class_weight
self.ccp_alpha = ccp_alpha
self.max_samples = max_samples
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = RandomForestClassifier(
n_estimators=self.n_estimators,
criterion=self.criterion,
@@ -1460,6 +1449,7 @@ def __init__(
self.base_score = base_score
self.missing = missing
self.num_parallel_tree = num_parallel_tree
+ self.random_state = random_state
self.n_jobs = n_jobs
self.monotone_constraints = monotone_constraints
self.interaction_constraints = interaction_constraints
@@ -1470,14 +1460,9 @@ def __init__(
self.enable_categorical = enable_categorical
self.eval_metric = eval_metric
self.early_stopping_rounds = early_stopping_rounds
-
if kwargs:
self.kwargs = kwargs
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = xgboost.XGBClassifier(
n_estimators=self.n_estimators,
objective=self.objective,
@@ -1765,19 +1750,17 @@ def __init__(
self.fit_intercept = fit_intercept
self.intercept_scaling = intercept_scaling
self.class_weight = class_weight
self.solver = solver
self.max_iter = max_iter
self.multi_class = multi_class
self.n_jobs = n_jobs
+ self.random_state = random_state
self.verbose = verbose
self.warm_start = warm_start
self.class_weight = class_weight
self.l1_ratio = l1_ratio
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = LogisticRegression(
penalty=self.penalty,
dual=self.dual,
@@ -2063,6 +2046,7 @@ def __init__(
self.power_t = (power_t,)
self.max_iter = (max_iter,)
self.shuffle = (shuffle,)
+ self.random_state = (random_state,)
self.tol = (tol,)
self.verbose = (verbose,)
self.warm_start = (warm_start,)
@@ -2076,12 +2060,6 @@ def __init__(
self.n_iter_no_change = (n_iter_no_change,)
self.max_fun = (max_fun,)
- if random_state:
- self.random_state = (random_state,)
- else:
- self.random_state = (self.random_state,)
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = MLPClassifier(
hidden_layer_sizes=self.hidden_layer_sizes[0],
activation=self.activation[0],
@@ -2416,16 +2394,13 @@ def __init__(
self.bootstrap = bootstrap
self.oob_score = oob_score
self.n_jobs = n_jobs
+ self.random_state = random_state
self.verbose = verbose
self.warm_start = warm_start
self.class_weight = class_weight
self.ccp_alpha = ccp_alpha
self.max_samples = max_samples
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = ExtraTreesClassifier(
n_estimators=self.n_estimators,
criterion=self.criterion,
@@ -2740,6 +2715,7 @@ def __init__(
self.init = (init,)
self.subsample = (subsample,)
self.max_features = (max_features,)
+ self.random_state = (random_state,)
self.verbose = (verbose,)
self.max_leaf_nodes = (max_leaf_nodes,)
self.min_impurity_decrease = (min_impurity_decrease,)
@@ -2749,12 +2725,6 @@ def __init__(
self.tol = (tol,)
self.ccp_alpha = (ccp_alpha,)
- if random_state:
- self.random_state = (random_state,)
- else:
- self.random_state = (self.random_state,)
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = GradientBoostingClassifier(
loss=self.loss[0],
learning_rate=self.learning_rate[0],
@@ -3248,6 +3218,7 @@ def __init__(
self.verbose = verbose
self.epsilon = epsilon
self.n_jobs = n_jobs
+ self.random_state = random_state
self.learning_rate = learning_rate
self.eta0 = eta0
self.power_t = power_t
@@ -3258,10 +3229,6 @@ def __init__(
self.warm_start = warm_start
self.average = average
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = SGDClassifier(
loss=self.loss,
penalty=self.penalty,
diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py
index 14afbc05..1dfad172 100644
--- a/geochemistrypi/data_mining/model/clustering.py
+++ b/geochemistrypi/data_mining/model/clustering.py
@@ -14,9 +14,9 @@
from ..utils.base import clear_output, save_data, save_fig, save_text
from ._base import WorkflowBase
from .func.algo_clustering._agglomerative import agglomerative_manual_hyper_parameters
-from .func.algo_clustering._common import plot_silhouette_diagram, plot_silhouette_value_diagram, scatter2d, scatter3d, score
-from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters
-from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters
+from .func.algo_clustering._common import plot_results, plot_silhouette_diagram, score
+from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters, dbscan_result_plot
+from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters, plot_silhouette_diagram_kmeans, scatter2d, scatter3d
class ClusteringWorkflowBase(WorkflowBase):
@@ -66,44 +66,26 @@ def _score(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, store_
mlflow.log_metrics(scores)
@staticmethod
- def _scatter2d(data: pd.DataFrame, labels: pd.DataFrame, cluster_centers_: np.ndarray, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
- """Plot the two-dimensional diagram of the clustering result."""
- print("-----* Cluster Two-Dimensional Diagram *-----")
- scatter2d(data, labels, cluster_centers_, algorithm_name)
- save_fig(f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
- data_with_labels = pd.concat([data, labels], axis=1)
- save_data(data_with_labels, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
+ def _plot_results(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, cluster_centers_: pd.DataFrame, local_path: str, mlflow_path: str) -> None:
+ """Plot the cluster_results ."""
+ print("-----* results diagram *-----")
+ plot_results(data, labels, algorithm_name, cluster_centers_)
+ save_fig(f"results - {algorithm_name}", local_path, mlflow_path)
+ data = pd.concat([data, labels], axis=1)
+ save_data(data, f"results - {algorithm_name}", local_path, mlflow_path)
@staticmethod
- def _scatter3d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
- """Plot the three-dimensional diagram of the clustering result."""
- print("-----* Cluster Three-Dimensional Diagram *-----")
- scatter3d(data, labels, algorithm_name)
- save_fig(f"Cluster Three-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
- data_with_labels = pd.concat([data, labels], axis=1)
- save_data(data_with_labels, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
-
- @staticmethod
- def _plot_silhouette_diagram(data: pd.DataFrame, labels: pd.DataFrame, model: object, cluster_centers_: np.ndarray, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+ def _plot_silhouette_diagram(data: pd.DataFrame, labels: pd.DataFrame, cluster_centers_: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
"""Plot the silhouette diagram of the clustering result."""
print("-----* Silhouette Diagram *-----")
- plot_silhouette_diagram(data, labels, cluster_centers_, model, algorithm_name)
+ plot_silhouette_diagram(data, labels, algorithm_name)
save_fig(f"Silhouette Diagram - {algorithm_name}", local_path, mlflow_path)
data_with_labels = pd.concat([data, labels], axis=1)
save_data(data_with_labels, "Silhouette Diagram - Data With Labels", local_path, mlflow_path)
- if not isinstance(cluster_centers_, str):
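+ # Only persist the cluster centers when the algorithm actually provides them as a DataFrame.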
+ if isinstance(cluster_centers_, pd.DataFrame):
cluster_center_data = pd.DataFrame(cluster_centers_, columns=data.columns)
save_data(cluster_center_data, "Silhouette Diagram - Cluster Centers", local_path, mlflow_path)
- @staticmethod
- def _plot_silhouette_value_diagram(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
- """Plot the silhouette value diagram of the clustering result."""
- print("-----* Silhouette value Diagram *-----")
- plot_silhouette_value_diagram(data, labels, algorithm_name)
- save_fig(f"Silhouette value Diagram - {algorithm_name}", local_path, mlflow_path)
- data_with_labels = pd.concat([data, labels], axis=1)
- save_data(data_with_labels, "Silhouette value Diagram - Data With Labels", local_path, mlflow_path)
-
def common_components(self) -> None:
"""Invoke all common application functions for clustering algorithms."""
GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH")
@@ -114,71 +96,18 @@ def common_components(self) -> None:
algorithm_name=self.naming,
store_path=GEOPI_OUTPUT_METRICS_PATH,
)
- if self.X.shape[1] >= 3:
- # choose two of dimensions to draw
- two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(self.X, 2)
- self._scatter2d(
- data=two_dimen_data,
- labels=self.clustering_result["clustering result"],
- cluster_centers_=self.get_cluster_centers(),
- algorithm_name=self.naming,
- local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
- mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
- )
-
- # choose three of dimensions to draw
- three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(self.X, 3)
- self._scatter3d(
- data=three_dimen_data,
- labels=self.clustering_result["clustering result"],
- algorithm_name=self.naming,
- local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
- mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
- )
- elif self.X.shape[1] == 3:
- # choose two of dimensions to draw
- two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(self.X, 2)
- self._scatter2d(
- data=two_dimen_data,
- labels=self.clustering_result["clustering result"],
- cluster_centers_=self.get_cluster_centers(),
- algorithm_name=self.naming,
- local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
- mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
- )
-
- # no need to choose
- self._scatter3d(
- data=self.X,
- labels=self.clustering_result["clustering result"],
- algorithm_name=self.naming,
- local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
- mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
- )
- elif self.X.shape[1] == 2:
- self._scatter2d(
- data=self.X,
- labels=self.clustering_result["clustering result"],
- cluster_centers_=self.get_cluster_centers(),
- algorithm_name=self.naming,
- local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
- mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
- )
- else:
- pass
-
+ # self._plot_results(
+ # data=self.X,
+ # labels=self.clustering_result["clustering result"],
+ # cluster_centers_=self.get_cluster_centers(),
+ # algorithm_name=self.naming,
+ # local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+ # mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+ # )
self._plot_silhouette_diagram(
data=self.X,
labels=self.clustering_result["clustering result"],
cluster_centers_=self.get_cluster_centers(),
- model=self.model,
- algorithm_name=self.naming,
- local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
- mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
- )
- self._plot_silhouette_value_diagram(
- data=self.X,
- labels=self.clustering_result["clustering result"],
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -286,10 +215,6 @@ def __init__(
self.copy_x = copy_x
self.algorithm = algorithm
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = KMeans(
n_clusters=self.n_clusters,
init=self.init,
@@ -321,13 +246,111 @@ def manual_hyper_parameters(cls) -> Dict:
clear_output()
return hyper_parameters
+ @staticmethod
+ def _plot_silhouette_diagram_kmeans(
+ data: pd.DataFrame,
+ cluster_labels: pd.DataFrame,
+ cluster_centers_: np.ndarray,
+ n_clusters: int,
+ algorithm_name: str,
+ local_path: str,
+ mlflow_path: str,
+ ) -> None:
+ """Plot the silhouette diagram of the clustering result."""
+ print("-----* KMeans's Silhouette Diagram *-----")
+ plot_silhouette_diagram_kmeans(data, cluster_labels, cluster_centers_, n_clusters, algorithm_name)
+ save_fig(f"KMeans's Silhouette Diagram - {algorithm_name}", local_path, mlflow_path)
+ data_with_labels = pd.concat([data, cluster_labels], axis=1)
+ save_data(data_with_labels, "KMeans's Silhouette Diagram - Data With Labels", local_path, mlflow_path)
+ cluster_center_data = pd.DataFrame(cluster_centers_, columns=data.columns)
+ save_data(cluster_center_data, "KMeans's Silhouette Diagram - Cluster Centers", local_path, mlflow_path)
+
+ @staticmethod
+ def _scatter2d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+ """Plot the two-dimensional diagram of the clustering result."""
+ print("-----* Cluster Two-Dimensional Diagram *-----")
+ scatter2d(data, cluster_labels, algorithm_name)
+ save_fig(f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
+ data_with_labels = pd.concat([data, cluster_labels], axis=1)
+ save_data(data_with_labels, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
+
+ @staticmethod
+ def _scatter3d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+ """Plot the three-dimensional diagram of the clustering result."""
+ print("-----* Cluster Three-Dimensional Diagram *-----")
+ scatter3d(data, cluster_labels, algorithm_name)
+ save_fig(f"Cluster Three-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
+ data_with_labels = pd.concat([data, cluster_labels], axis=1)
+ save_data(data_with_labels, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
+
def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
"""Invoke all special application functions for this algorithms by Scikit-learn framework."""
GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH")
+ GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
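+ # Inertia is specific to KMeans, so it is reported here instead of in the common clustering metrics.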
self._get_inertia_scores(
algorithm_name=self.naming,
store_path=GEOPI_OUTPUT_METRICS_PATH,
)
+ self._plot_silhouette_diagram_kmeans(
+ data=self.X,
+ cluster_labels=self.clustering_result["clustering result"],
+ cluster_centers_=self.get_cluster_centers(),
+ n_clusters=self.n_clusters,
+ algorithm_name=self.naming,
+ local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+ mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+ )
+
+ # Draw graphs according to the dimensionality of X: project onto chosen 2D/3D subspaces when there are more than three features.
+ if self.X.shape[1] > 3:
+ # choose two of dimensions to draw
+ two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(self.X, 2)
+ self._scatter2d(
+ data=two_dimen_data,
+ cluster_labels=self.clustering_result["clustering result"],
+ algorithm_name=self.naming,
+ local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+ mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+ )
+
+ # choose three of dimensions to draw
+ three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(self.X, 3)
+ self._scatter3d(
+ data=three_dimen_data,
+ cluster_labels=self.clustering_result["clustering result"],
+ algorithm_name=self.naming,
+ local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+ mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+ )
+ elif self.X.shape[1] == 3:
+ # choose two of dimensions to draw
+ two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(self.X, 2)
+ self._scatter2d(
+ data=two_dimen_data,
+ cluster_labels=self.clustering_result["clustering result"],
+ algorithm_name=self.naming,
+ local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+ mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+ )
+
+ # no need to choose
+ self._scatter3d(
+ data=self.X,
+ cluster_labels=self.clustering_result["clustering result"],
+ algorithm_name=self.naming,
+ local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+ mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+ )
+ elif self.X.shape[1] == 2:
+ self._scatter2d(
+ data=self.X,
+ cluster_labels=self.clustering_result["clustering result"],
+ algorithm_name=self.naming,
+ local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+ mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+ )
+ else:
+ pass
class DBSCANClustering(ClusteringWorkflowBase):
@@ -419,8 +442,25 @@ def manual_hyper_parameters(cls) -> Dict:
clear_output()
return hyper_parameters
+ @staticmethod
+ def _clustering_result_plot(X: pd.DataFrame, trained_model: object, algorithm_name: str, image_config: dict, local_path: str, mlflow_path: str) -> None:
+ """Plot the clustering result in a 2D graph."""
+ print("-----* Cluster Two-Dimensional Diagram *-----")
+ dbscan_result_plot(X, trained_model, image_config, algorithm_name)
+ save_fig(f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
+ save_data(X, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path)
+
def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
"""Invoke all special application functions for this algorithms by Scikit-learn framework."""
+ GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
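+ # DBSCAN produces no cluster centers, so its special output is limited to the two-dimensional result plot.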
+ self._clustering_result_plot(
+ X=self.X,
+ trained_model=self.model,
+ algorithm_name=self.naming,
+ image_config=self.image_config,
+ local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+ mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+ )
class Agglomerative(ClusteringWorkflowBase):
@@ -576,20 +616,16 @@ def __init__(
self.verbose = verbose
self.preference = preference
self.affinity = affinity
-
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
+ self.random_state = random_state
self.model = AffinityPropagation(
damping=self.damping,
max_iter=self.max_iter,
convergence_iter=self.convergence_iter,
copy=self.copy,
- preference=self.preference,
- affinity=self.affinity,
- verbose=self.verbose,
- random_state=self.random_state,
+ preference=None,
+ affinity="euclidean",
+ verbose=False,
+ random_state=None,
)
self.naming = AffinityPropagationClustering.name
diff --git a/geochemistrypi/data_mining/model/decomposition.py b/geochemistrypi/data_mining/model/decomposition.py
index ab954230..d9490456 100644
--- a/geochemistrypi/data_mining/model/decomposition.py
+++ b/geochemistrypi/data_mining/model/decomposition.py
@@ -189,11 +189,8 @@ def __init__(
self.iterated_power = iterated_power
# self.n_oversamples = n_oversamples
# self.power_iteration_normalizer = power_iteration_normalizer
+ self.random_state = random_state
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = PCA(
n_components=self.n_components,
copy=self.copy,
@@ -491,15 +488,12 @@ def __init__(
self.metric_params = metric_params
self.init = init
self.verbose = verbose
+ self.random_state = random_state
self.method = method
self.angle = angle
self.n_jobs = n_jobs
self.square_distances = square_distances
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = TSNE(
n_components=self.n_components,
perplexity=self.perplexity,
@@ -624,13 +618,10 @@ def __init__(
self.verbose = verbose
self.eps = eps
self.n_jobs = n_jobs
+ self.random_state = random_state
self.dissimilarity = dissimilarity
# self.normalized_stress = normalized_stress
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = MDS(
n_components=self.n_components,
metric=self.metric,
diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_extra_trees.py b/geochemistrypi/data_mining/model/func/algo_classification/_extra_trees.py
index 75ae43f3..e4a0eb2b 100644
--- a/geochemistrypi/data_mining/model/func/algo_classification/_extra_trees.py
+++ b/geochemistrypi/data_mining/model/func/algo_classification/_extra_trees.py
@@ -34,7 +34,7 @@ def extra_trees_manual_hyper_parameters() -> Dict:
"Bootstrap: Whether bootstrap samples are used when building trees. Bootstrapping is a technique where a random subset of the data is sampled with replacement to create a new dataset"
" of the same size as the original. This new dataset is then used to construct a decision tree in the ensemble. If False, the whole dataset is used to build each tree."
)
- print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it as True.")
+ print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it set to True.")
bootstrap = bool_input(SECTION[2])
max_samples = None
if bootstrap:
@@ -45,7 +45,7 @@ def extra_trees_manual_hyper_parameters() -> Dict:
"oob_score: Whether to use out-of-bag samples to estimate the generalization accuracy. When the oob_score hyperparameter is set to True, Extra Trees will use a random subset of the data"
" to train each decision tree in the ensemble, and the remaining data that was not used for training (the out-of-bag samples) will be used to calculate the OOB score. "
)
- print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it as True.")
+ print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it set to True.")
oob_score = bool_input(SECTION[2])
hyper_parameters = {
"n_estimators": n_estimators,
diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_logistic_regression.py b/geochemistrypi/data_mining/model/func/algo_classification/_logistic_regression.py
index 5ff1f9d5..432ddbf7 100644
--- a/geochemistrypi/data_mining/model/func/algo_classification/_logistic_regression.py
+++ b/geochemistrypi/data_mining/model/func/algo_classification/_logistic_regression.py
@@ -17,7 +17,7 @@ def logistic_regression_manual_hyper_parameters() -> Dict:
hyper_parameters : dict
"""
print("Penalty: This hyperparameter specifies the norm used in the penalization.")
- print("Please specify the norm used in the penalization. It is generally recommended to leave it as 'l2'.")
+ print("Please specify the norm used in the penalization. It is generally recommended to leave it set to l2.")
penalties = ["l1", "l2", "elasticnet", "None"]
penalty = str_input(penalties, SECTION[2])
if penalty == "None":
@@ -28,12 +28,12 @@ def logistic_regression_manual_hyper_parameters() -> Dict:
l1_ratio = None
if penalty == "l1":
print("Solver: This hyperparameter specifies the algorithm to use in the optimization problem.")
- print("Please specify the algorithm to use in the optimization problem. It is generally recommended to leave it as 'liblinear'.")
+ print("Please specify the algorithm to use in the optimization problem. It is generally recommended to leave it set to liblinear.")
solvers = ["liblinear", "saga"]
solver = str_input(solvers, SECTION[2])
elif penalty == "l2" or penalty == "none":
print("Solver: This hyperparameter specifies the algorithm to use in the optimization problem.")
- print("Please specify the algorithm to use in the optimization problem. It is generally recommended to leave it as 'lbfgs'.")
+ print("Please specify the algorithm to use in the optimization problem. It is generally recommended to leave it set to lbfgs.")
solvers = ["newton-cg", "lbfgs", "sag", "saga"]
solver = str_input(solvers, SECTION[2])
elif penalty == "elasticnet":
@@ -48,7 +48,7 @@ def logistic_regression_manual_hyper_parameters() -> Dict:
"Class Weight: This hyperparameter specifies the weights associated with classes. It can be set to 'balanced'"
" to automatically adjust the weights inversely proportional to the class frequencies in the input data."
)
- print("Please specify the weights associated with classes. It is generally recommended to leave it as None.")
+ print("Please specify the weights associated with classes. It is generally recommended to leave it set to None.")
class_weights = ["None", "balanced"]
class_weight = str_input(class_weights, SECTION[2])
if class_weight == "None":
diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_multi_layer_perceptron.py b/geochemistrypi/data_mining/model/func/algo_classification/_multi_layer_perceptron.py
index 8d805288..e7769eb5 100644
--- a/geochemistrypi/data_mining/model/func/algo_classification/_multi_layer_perceptron.py
+++ b/geochemistrypi/data_mining/model/func/algo_classification/_multi_layer_perceptron.py
@@ -18,18 +18,18 @@ def multi_layer_perceptron_manual_hyper_parameters() -> Dict:
print("Please specify the size of hidden layer and the number of neurons in the each hidden layer.")
hidden_layer = tuple_input((50, 25, 5), SECTION[2], "@Hidden Layer Sizes: ")
print("Activation: Activation function for the hidden layer.")
- print("Please specify the activation function for the hidden layer. It is generally recommended to leave it as 'ReLU'.")
+ print("Please specify the activation function for the hidden layer. It is generally recommended to leave it set to ReLU.")
activations = ["identity", "logistic", "tanh", "relu"]
activation = str_input(activations, SECTION[2])
print("Solver: The solver for weight optimization.")
- print("Please specify the solver for weight optimization. It is generally recommended to leave it as 'Adam'.")
+ print("Please specify the solver for weight optimization. It is generally recommended to leave it set to Adam.")
solvers = ["lbfgs", "sgd", "adam"]
solver = str_input(solvers, SECTION[2])
print("Alpha: L2 penalty (regularization term) parameter.")
print("Please specify the L2 penalty (regularization term) parameter. A good starting range could be between 0.0001 and 10, such as 0.0001.")
alpha = float_input(0.0001, SECTION[2], "@Alpha: ")
print("Learning Rate: It controls the step-size in updating the weights.")
- print("Please specify the learning rate. It is generally recommended to leave it as 'Adaptive'.")
+ print("Please specify the learning rate. It is generally recommended to leave it set to Adaptive.")
learning_rates = ["constant", "invscaling", "adaptive"]
learning_rate = str_input(learning_rates, SECTION[2])
print("Max Iterations: Maximum number of iterations. The solver iterates until convergence (determined by 'tol') or this number of iterations.")
diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_rf.py b/geochemistrypi/data_mining/model/func/algo_classification/_rf.py
index 6de6a97d..6d73e305 100644
--- a/geochemistrypi/data_mining/model/func/algo_classification/_rf.py
+++ b/geochemistrypi/data_mining/model/func/algo_classification/_rf.py
@@ -35,7 +35,7 @@ def random_forest_manual_hyper_parameters() -> Dict:
"Bootstrap: Whether bootstrap samples are used when building trees. Bootstrapping is a technique where a random subset of the data is sampled with replacement"
" to create a new dataset of the same size as the original. This new dataset is then used to construct a decision tree in the ensemble. If False, the whole dataset is used to build each tree."
)
- print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it as True.")
+ print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it set to True.")
bootstrap = bool_input(SECTION[2])
max_samples = None
if bootstrap:
@@ -46,7 +46,7 @@ def random_forest_manual_hyper_parameters() -> Dict:
"oob_score: Whether to use out-of-bag samples to estimate the generalization accuracy. When the oob_score hyperparameter is set to True, Extra Trees will use a random subset of the data"
" to train each decision tree in the ensemble, and the remaining data that was not used for training (the out-of-bag samples) will be used to calculate the OOB score. "
)
- print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it as True.")
+ print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it set to True.")
oob_score = bool_input(SECTION[2])
hyper_parameters = {
"n_estimators": n_estimators,
diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_svc.py b/geochemistrypi/data_mining/model/func/algo_classification/_svc.py
index a58634a4..45cb16cf 100644
--- a/geochemistrypi/data_mining/model/func/algo_classification/_svc.py
+++ b/geochemistrypi/data_mining/model/func/algo_classification/_svc.py
@@ -15,7 +15,7 @@ def svc_manual_hyper_parameters() -> Dict:
hyper_parameters : dict
"""
print("Kernel: This hyperparameter specifies the kernel function to be used for mapping the input data to a higher-dimensional feature space.")
- print("Please specify the kernel type to be used in the algorithm. It is generally recommended to leave it as 'Radial Basis Function (RBF) Kernel'.")
+ print("Please specify the kernel type to be used in the algorithm. It is generally recommended to leave it set to Radial basis function (RBF) kernel.")
kernels = ["linear", "poly", "rbf", "sigmoid"]
kernel = str_input(kernels, SECTION[2])
degree = None
@@ -41,7 +41,7 @@ def svc_manual_hyper_parameters() -> Dict:
C = float_input(1, SECTION[2], "@C: ")
print("Shrinking: This hyperparameter specifies whether to use the shrinking heuristic.")
print("The shrinking heuristic is a technique that speeds up the training process by only considering the support vectors in the decision function.")
- print("Please specify whether to use the shrinking heuristic. It is generally recommended to leave it as True.")
+ print("Please specify whether to use the shrinking heuristic. It is generally recommended to leave it set to True.")
shrinking = bool_input(SECTION[2])
hyper_parameters = {"kernel": kernel, "C": C, "shrinking": shrinking}
if not degree:
diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_common.py b/geochemistrypi/data_mining/model/func/algo_clustering/_common.py
index 75848442..9a58d65f 100644
--- a/geochemistrypi/data_mining/model/func/algo_clustering/_common.py
+++ b/geochemistrypi/data_mining/model/func/algo_clustering/_common.py
@@ -1,34 +1,31 @@
# -*- coding: utf-8 -*-
-from itertools import cycle
from typing import Dict
-import matplotlib.cm as cm
import matplotlib.pyplot as plt
-import numpy as np
import pandas as pd
import seaborn as sns
from rich import print
from sklearn.metrics import calinski_harabasz_score, silhouette_samples, silhouette_score
-def score(data: pd.DataFrame, labels: pd.DataFrame) -> Dict:
+def score(X: pd.DataFrame, labels: pd.DataFrame) -> Dict:
"""Calculate the scores of the clustering model.
Parameters
----------
- data : pd.DataFrame (n_samples, n_components)
+ X : pd.DataFrame (n_samples, n_components)
The true values.
- labels : pd.DataFrame (n_samples, n_components)
- Labels of each point.
+ labels : pd.DataFrame (n_samples,)
+ Labels of each point.
Returns
-------
scores : dict
The scores of the clustering model.
"""
- silhouette = silhouette_score(data, labels)
- calinski_harabaz = calinski_harabasz_score(data, labels)
+ silhouette = silhouette_score(X, labels)
+ calinski_harabaz = calinski_harabasz_score(X, labels)
print("silhouette_score: ", silhouette)
print("calinski_harabasz_score:", calinski_harabaz)
scores = {
@@ -38,298 +35,49 @@ def score(data: pd.DataFrame, labels: pd.DataFrame) -> Dict:
return scores
-def scatter2d(data: pd.DataFrame, labels: pd.DataFrame, cluster_centers_: np.ndarray, algorithm_name: str) -> None:
- """
- Draw the result-2D diagram for analysis.
-
- Parameters
- ----------
- data : pd.DataFrame (n_samples, n_components)
- The features of the data.
-
- labels : pd.DataFrame (n_samples,)
- Labels of each point.
-
- cluster_centers_: np.ndarray (n_samples,)
- Coordinates of cluster centers. If the algorithm stops before fully converging (see tol and max_iter), these will not be consistent with labels_.
-
- algorithm_name : str
- the name of the algorithm
- """
- # markers = ["+", "v", ".", "d", "o", "s", "1", "D", "X", "^", "p", "<", "*", "H", "3", "P"]
- colors = [
- "#1f77b4",
- "#ff7f0e",
- "#2ca02c",
- "#d62728",
- "#9467bd",
- "#8c564b",
- "#e377c2",
- "#7f7f7f",
- "#bcbd22",
- "#17becf",
- "#33a02c",
- "#1f77b4",
- "#ff7f0e",
- "#2ca02c",
- "#d62728",
- "#9467bd",
- "#8c564b",
- "#e377c2",
- "#7f7f7f",
- "#bcbd22",
- ]
-
- # marker_cycle = cycle(markers)
- color_cycle = cycle(colors)
-
- fig = plt.figure()
- fig.set_size_inches(18, 10)
- plt.subplot(111)
- # Plot the data
- for i, label in enumerate(set(labels)):
- cluster_data = data[labels == label]
- color = next(color_cycle)
- # marker = next(marker_cycle)
- marker = "."
- plt.scatter(cluster_data.iloc[:, 0], cluster_data.iloc[:, 1], c=color, marker=marker)
-
- # Plot the cluster centers
- if not isinstance(cluster_centers_, str):
- # Draw white circles at cluster centers
- plt.scatter(cluster_centers_[:, 0], cluster_centers_[:, 1], c="white", marker="o", alpha=1, s=200, edgecolor="k")
-
- # Label the cluster centers
- for i, c in enumerate(cluster_centers_):
- plt.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")
-
- plt.xlabel(f"{data.columns[0]}")
- plt.ylabel(f"{data.columns[1]}")
- plt.title(f"Cluster Data Bi-plot - {algorithm_name}")
-
-
-def scatter3d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str) -> None:
- """
- Draw the result-3D diagram for analysis.
-
- Parameters
- ----------
- data : pd.DataFrame (n_samples, n_components)
- The features of the data.
-
- labels : pd.DataFrame (n_samples,)
- Labels of each point.
-
- algorithm_name : str
- the name of the algorithm
- """
- plt.figure()
- namelist = data.columns.values.tolist()
- fig = plt.figure(figsize=(12, 6), facecolor="w")
- plt.subplots_adjust(left=0.05, right=0.95, bottom=0.05, top=0.9)
-
- # Plot the data without cluster results
- ax = fig.add_subplot(121, projection="3d")
- ax.scatter(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2], alpha=0.3, c="#FF0000", marker=".")
- ax.set_xlabel(namelist[0])
- ax.set_ylabel(namelist[1])
- ax.set_zlabel(namelist[2])
- plt.grid(True)
-
- ax2 = fig.add_subplot(122, projection="3d")
- # markers = ["+", "v", ".", "d", "o", "s", "1", "D", "X", "^", "p", "<", "*", "H", "3", "P"]
- colors = [
- "#1f77b4",
- "#ff7f0e",
- "#2ca02c",
- "#d62728",
- "#9467bd",
- "#8c564b",
- "#e377c2",
- "#7f7f7f",
- "#bcbd22",
- "#17becf",
- "#33a02c",
- "#1f77b4",
- "#ff7f0e",
- "#2ca02c",
- "#d62728",
- "#9467bd",
- "#8c564b",
- "#e377c2",
- "#7f7f7f",
- "#bcbd22",
- ]
- # marker_cycle = cycle(markers)
- color_cycle = cycle(colors)
-
- # Plot the data with cluster results
- for i, label in enumerate(set(labels)):
- cluster_data = data[labels == label]
- color = next(color_cycle)
- # marker = next(marker_cycle)
- marker = "."
- ax2.scatter(cluster_data.iloc[:, 0], cluster_data.iloc[:, 1], cluster_data.iloc[:, 2], c=color, marker=marker, s=6, cmap=plt.cm.Paired, edgecolors="none")
-
- ax2.set_xlabel(namelist[0])
- ax2.set_ylabel(namelist[1])
- ax2.set_zlabel(namelist[2])
- plt.grid(True)
- ax.set_title(f"Base Data Tri-plot - {algorithm_name}")
- ax2.set_title(f"Cluster Data Tri-plot - {algorithm_name}")
-
-
-def plot_silhouette_diagram(data: pd.DataFrame, labels: pd.DataFrame, cluster_centers_: np.ndarray, model: object, algorithm_name: str) -> None:
- """
- Draw the silhouette diagram for analysis.
+def plot_results(X: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, cluster_centers_=None) -> None:
+ """Plot clustering results of the clustering model.
Parameters
----------
- data : pd.DataFrame (n_samples, n_components)
- The true values.
-
- labels : pd.DataFrame (n_samples,)
- Labels of each point.
-
- cluster_centers_: np.ndarray (n_samples,)
- Coordinates of cluster centers. If the algorithm stops before fully converging (see tol and max_iter), these will not be consistent with labels_.
+ X : pd.DataFrame (n_samples, n_components)
+ The true values.
- model : sklearn algorithm model
- The sklearn algorithm model trained with X.
+ labels : pd.DataFrame (n_samples,)
+ Labels of each point.
algorithm_name : str
- the name of the algorithm
-
- References
- ----------
- Silhouette analysis can be used to study the separation distance between the resulting clusters.
- The silhouette plot displays a measure of how close each point in one cluster is to other points in the
- neighboring clusters and thus provides a way to assess parameters like number of clusters visually.
- This measure has a range of [-1, 1].
+ The name of the algorithm model.
- https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
+ cluster_centers_ : np.ndarray (n_clusters, n_features), optional
+ Coordinates of the cluster centers, if the algorithm provides them.
"""
- if hasattr(model, "n_clusters"):
- n_clusters = model.n_clusters
- else:
- n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
-
- # Create a subplot with 1 row and 2 columns
- fig, (ax1, ax2) = plt.subplots(1, 2)
- fig.set_size_inches(18, 10)
-
- # The 1st subplot is the silhouette plot
- # The silhouette coefficient can range from -1, 1 but in this example all
- # lie within [-0.1, 1]
- ax1.set_xlim([-0.1, 1])
- # The (n_clusters+1)*10 is for inserting blank space between silhouette
- # plots of individual clusters, to demarcate them clearly.
- ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])
-
- # The silhouette_score gives the average value for all the samples.
- # This gives a perspective into the density and separation of the formed
- # clusters
- silhouette_avg = silhouette_score(data, labels)
- print(
- "For n_clusters =",
- n_clusters,
- "The average silhouette_score is :",
- silhouette_avg,
- )
-
- # Compute the silhouette scores for each sample
- sample_silhouette_values = silhouette_samples(data, labels)
-
- if n_clusters >= 20:
- Fontsize = 5
- y_long = 7
- else:
- Fontsize = None
- y_long = 10
-
- y_lower = 10
- for i in range(n_clusters):
- # Aggregate the silhouette scores for samples belonging to
- # cluster i, and sort them
- ith_cluster_silhouette_values = sample_silhouette_values[labels == i]
-
- ith_cluster_silhouette_values.sort()
-
- size_cluster_i = ith_cluster_silhouette_values.shape[0]
- y_upper = y_lower + size_cluster_i
-
- color = cm.nipy_spectral(float(i) / n_clusters)
- ax1.fill_betweenx(
- np.arange(y_lower, y_upper),
- 0,
- ith_cluster_silhouette_values,
- facecolor=color,
- edgecolor=color,
- alpha=0.7,
- )
-
- # Label the silhouette plots with their cluster numbers at the middle
- ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i), fontsize=Fontsize)
-
- # Compute the new y_lower for next plot
- y_lower = y_upper + y_long # 10 for the 0 samples
-
- ax1.set_title("The silhouette plot for the various clusters.")
- ax1.set_xlabel("The silhouette coefficient values")
- ax1.set_ylabel("Cluster label")
-
- # The vertical line for average silhouette score of all the values
- ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
-
- ax1.set_yticks([]) # Clear the yaxis labels / ticks
- ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
-
- # 2nd Plot showing the actual clusters formed
- colors = cm.nipy_spectral(labels.astype(float) / n_clusters)
- ax2.scatter(data.iloc[:, 0], data.iloc[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k")
-
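+ # Scatter the first two feature columns, coloured by cluster label.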
+ sns.scatterplot(x=X.iloc[:, 0], y=X.iloc[:, 1], hue=labels, palette="viridis", s=50, alpha=0.8)
if not isinstance(cluster_centers_, str):
- # Draw white circles at cluster centers
- ax2.scatter(
- cluster_centers_[:, 0],
- cluster_centers_[:, 1],
- marker="o",
- c="white",
- alpha=1,
- s=200,
- edgecolor="k",
- )
-
- # Label the cluster centers
- for i, c in enumerate(cluster_centers_):
- ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")
-
- ax2.set_title("The visualization of the clustered data.")
- ax2.set_xlabel("Feature space for the 1st feature")
- ax2.set_ylabel("Feature space for the 2nd feature")
- plt.suptitle(
- f"Silhouette analysis for clustering on sample data with n_clusters = %d - {algorithm_name}" % n_clusters,
- fontsize=14,
- fontweight="bold",
- )
+ plt.scatter(cluster_centers_[:, 0], cluster_centers_[:, 1], c="red", marker="X", s=200, label="Cluster Centers")
+ plt.title(f"results - {algorithm_name}")
+ plt.xlabel("Feature 1")
+ plt.ylabel("Feature 2")
+ plt.legend()
-def plot_silhouette_value_diagram(data, labels, algorithm_name: str):
+def plot_silhouette_diagram(X: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str) -> None:
"""Calculate the scores of the clustering model.
Parameters
----------
- data : pd.DataFrame (n_samples, n_components)
+ X : pd.DataFrame (n_samples, n_components)
The true values.
- labels : pd.DataFrame (n_samples, n_components)
- Labels of each point.
+ labels : pd.DataFrame (n_samples,)
+ Labels of each point.
algorithm_name : str
The name of the algorithm model.
"""
- silhouette_values = silhouette_samples(data, labels)
+ silhouette_values = silhouette_samples(X, labels)
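+ # Summarise the distribution of per-sample silhouette coefficients as a histogram with a KDE overlay.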
sns.histplot(silhouette_values, bins=30, kde=True)
- plt.title(f"Silhouette value Diagram - {algorithm_name}")
+ plt.title(f"Silhouette Diagram - {algorithm_name}")
plt.xlabel("Silhouette Coefficient")
plt.ylabel("Frequency")
+ plt.legend()
diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py b/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py
index 9918713b..951dca13 100644
--- a/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py
+++ b/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py
@@ -1,5 +1,8 @@
from typing import Dict
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
from rich import print
from ....constants import SECTION
@@ -16,51 +19,17 @@ def dbscan_manual_hyper_parameters() -> Dict:
print("Eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other.")
print("Please specify the maximum distance. A good starting range could be between 0.1 and 1.0, such as 0.5.")
eps = float_input(0.5, SECTION[2], "Eps: ")
-
print("Min Samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.")
print("Please specify the number of samples. A good starting range could be between 5 and 20, such as 5.")
min_samples = num_input(SECTION[2], "Min Samples: ")
-
+ print("Metric: The metric to use when calculating distance between instances in a feature array.")
+ print("Please specify the metric to use when calculating distance between instances in a feature array. It is generally recommended to leave it as 'euclidean'.")
+ metrics = ["euclidean", "manhattan", "chebyshev", "minkowski", "cosine", "correlation"]
+ metric = str_input(metrics, SECTION[2])
print("Algorithm: The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors.")
print("Please specify the algorithm. It is generally recommended to leave it as 'auto'.")
algorithms = ["auto", "ball_tree", "kd_tree", "brute"]
algorithm = str_input(algorithms, SECTION[2])
-
- # Reference:
- # https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
- # https://scikit-learn.org/stable/modules/neighbors.html
- print("Metric: The metric to use when calculating distance between instances in a feature array.")
- print("Please specify the metric to use when calculating distance between instances in a feature array. It is generally recommended to leave it as 'euclidean'.")
- if algorithm == "kd_tree":
- metrics = ["euclidean", "l2", "minkowski", "p", "manhattan", "cityblock", "l1", "chebyshev", "infinity"]
- elif algorithm == "ball_tree":
- metrics = [
- "euclidean",
- "l2",
- "minkowski",
- "p",
- "manhattan",
- "cityblock",
- "l1",
- "chebyshev",
- "infinity",
- "seuclidean",
- "mahalanobis",
- "hamming",
- "canberra",
- "braycurtis",
- "jaccard",
- "dice",
- "rogerstanimoto",
- "russellrao",
- "sokalmichener",
- "sokalsneath",
- "haversine",
- ]
- else:
- metrics = ["euclidean", "manhattan", "chebyshev", "minkowski", "cosine", "correlation"]
- metric = str_input(metrics, SECTION[2])
-
print("Leaf Size: Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree.")
print("Please specify the leaf size. A good starting range could be between 10 and 30, such as 30.")
leaf_size = num_input(SECTION[2], "Leaf Size: ")
@@ -78,3 +47,94 @@ def dbscan_manual_hyper_parameters() -> Dict:
"p": p,
}
return hyper_parameters
+
+
+def dbscan_result_plot(data: pd.DataFrame, trained_model: object, image_config: dict, algorithm_name: str) -> None:
+ """
+ Draw the clustering result diagram for analysis.
+
+ Parameters
+ ----------
+ data : pd.DataFrame (n_samples, n_components)
+ The feature data that was clustered.
+
+ trained_model : object
+ The trained DBSCAN model.
+
+ image_config : dict
+ Configuration of the output image, e.g. figure size, DPI and marker styles.
+
+ algorithm_name : str
+ The name of the algorithm.
+
+ References
+ ----------
+ The DBSCAN algorithm is deterministic, always generating the same clusters when given the same data in the same order.
+
+ https://scikit-learn.org/stable/modules/clustering.html#dbscan
+
+ """
+ db = trained_model.fit(data)
+ labels = trained_model.labels_
+ core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
+ core_samples_mask[db.core_sample_indices_] = True
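+ # Core samples and border points are drawn below with different marker sizes and transparency taken from image_config.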
+ n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
+ print("Estimated number of clusters: %d" % n_clusters_)
+ unique_labels = set(labels)
+
+ # create drawing canvas
+ fig, ax = plt.subplots(figsize=(image_config["width"], image_config["height"]), dpi=image_config["dpi"])
+
+ # draw the main content
+ colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
+ for k, col in zip(unique_labels, colors):
+ if k == -1:
+ col = [0, 0, 0, 1]
+ class_member_mask = labels == k
+ xy = data[class_member_mask & core_samples_mask]
+ ax.plot(
+ xy.iloc[:, 0],
+ xy.iloc[:, 1],
+ image_config["marker_angle"],
+ markerfacecolor=tuple(col),
+ markeredgecolor=image_config["edgecolor"],
+ markersize=image_config["markersize1"],
+ alpha=image_config["alpha1"],
+ )
+ xy = data[class_member_mask & ~core_samples_mask]
+ ax.plot(
+ xy.iloc[:, 0],
+ xy.iloc[:, 1],
+ image_config["marker_circle"],
+ markerfacecolor=tuple(col),
+ markeredgecolor=image_config["edgecolor"],
+ markersize=image_config["markersize2"],
+ alpha=image_config["alpha2"],
+ )
+
+ # automatically optimize picture layout structure
+ fig.tight_layout()
+ xmin, xmax = ax.get_xlim()
+ ymin, ymax = ax.get_ylim()
+ x_adjustment = (xmax - xmin) * 0.1
+ y_adjustment = (ymax - ymin) * 0.1
+ ax.axis([xmin - x_adjustment, xmax + x_adjustment, ymin - y_adjustment, ymax + y_adjustment])
+
+ # convert the font of the axes
+ plt.tick_params(labelsize=image_config["labelsize"]) # adjust the font size of the axis label
+ # plt.setp(ax.get_xticklabels(), rotation=image_config['xrotation'], ha=image_config['xha'],
+ # rotation_mode="anchor") # axis label rotation Angle
+ # plt.setp(ax.get_yticklabels(), rotation=image_config['rot'], ha=image_config['yha'],
+ # rotation_mode="anchor") # axis label rotation Angle
+ x1_label = ax.get_xticklabels() # adjust the axis label font
+ [x1_label_temp.set_fontname(image_config["axislabelfont"]) for x1_label_temp in x1_label]
+ y1_label = ax.get_yticklabels()
+ [y1_label_temp.set_fontname(image_config["axislabelfont"]) for y1_label_temp in y1_label]
+
+ ax.set_title(
+ label=algorithm_name,
+ fontdict={
+ "size": image_config["title_size"],
+ "color": image_config["title_color"],
+ "family": image_config["title_font"],
+ },
+ loc=image_config["title_location"],
+ pad=image_config["title_pad"],
+ )
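
Since dbscan_result_plot re-fits the model it is given and pulls all of its styling from image_config, a hedged usage sketch follows; the image_config values here are illustrative placeholders covering only the keys the function reads, not the project's real defaults:

    import pandas as pd
    from sklearn.cluster import DBSCAN
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=300, centers=3, random_state=42)
    data = pd.DataFrame(X, columns=["feature_1", "feature_2"])
    image_config = {
        "width": 8, "height": 6, "dpi": 100,
        "marker_angle": "^", "marker_circle": "o", "edgecolor": "k",
        "markersize1": 14, "markersize2": 6, "alpha1": 0.8, "alpha2": 0.5,
        "labelsize": 10, "axislabelfont": "DejaVu Sans",
        "title_size": 12, "title_color": "black", "title_font": "DejaVu Sans",
        "title_location": "center", "title_pad": 10,
    }
    dbscan_result_plot(data, DBSCAN(eps=0.5, min_samples=5), image_config, "DBSCAN")
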
diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py b/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py
index 82668a55..5e3b2654 100644
--- a/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py
+++ b/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py
@@ -1,6 +1,11 @@
from typing import Dict
+import matplotlib.cm as cm
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
from rich import print
+from sklearn.metrics import silhouette_samples, silhouette_score
from ....constants import SECTION
from ....data.data_readiness import float_input, num_input, str_input
@@ -33,3 +38,159 @@ def kmeans_manual_hyper_parameters() -> Dict:
algorithm = str_input(algorithms, SECTION[2])
hyper_parameters = {"n_clusters": n_clusters, "init": init, "max_iter": max_iters, "tol": tol, "algorithm": algorithm}
return hyper_parameters
+
+
+def plot_silhouette_diagram_kmeans(data: pd.DataFrame, cluster_labels: pd.DataFrame, cluster_centers_: np.ndarray, n_clusters: int, algorithm_name: str) -> None:
+ """
+ Draw the silhouette diagram for analysis.
+
+ Parameters
+ ----------
+ data: pd.DataFrame (n_samples, n_components)
+ Data for silhouette.
+
+ cluster_labels: pd.DataFrame (n_samples,)
+ Labels of each point.
+
+    cluster_centers_: np.ndarray (n_clusters, n_features)
+        Coordinates of cluster centers. If the algorithm stops before fully converging (see tol and max_iter), these will not be consistent with labels_.
+
+    n_clusters: int
+        The number of clusters to form, i.e. the number of centroids.
+
+    algorithm_name : str
+        The name of the algorithm.
+
+ References
+ ----------
+ Silhouette analysis can be used to study the separation distance between the resulting clusters.
+ The silhouette plot displays a measure of how close each point in one cluster is to other points in the
+ neighboring clusters and thus provides a way to assess parameters like number of clusters visually.
+ This measure has a range of [-1, 1].
+
+ https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
+ """
+ # Create a subplot with 1 row and 2 columns
+ fig, (ax1, ax2) = plt.subplots(1, 2)
+ fig.set_size_inches(18, 7)
+
+ # The 1st subplot is the silhouette plot
+    # The silhouette coefficient can range from -1 to 1, but in this example all
+ # lie within [-0.1, 1]
+ ax1.set_xlim([-0.1, 1])
+ # The (n_clusters+1)*10 is for inserting blank space between silhouette
+ # plots of individual clusters, to demarcate them clearly.
+ ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])
+
+ # The silhouette_score gives the average value for all the samples.
+ # This gives a perspective into the density and separation of the formed
+ # clusters
+ silhouette_avg = silhouette_score(data, cluster_labels)
+ print(
+ "For n_clusters =",
+ n_clusters,
+ "The average silhouette_score is :",
+ silhouette_avg,
+ )
+
+ # Compute the silhouette scores for each sample
+ sample_silhouette_values = silhouette_samples(data, cluster_labels)
+
+ y_lower = 10
+ for i in range(n_clusters):
+ # Aggregate the silhouette scores for samples belonging to
+ # cluster i, and sort them
+ ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
+
+ ith_cluster_silhouette_values.sort()
+
+ size_cluster_i = ith_cluster_silhouette_values.shape[0]
+ y_upper = y_lower + size_cluster_i
+
+ color = cm.nipy_spectral(float(i) / n_clusters)
+ ax1.fill_betweenx(
+ np.arange(y_lower, y_upper),
+ 0,
+ ith_cluster_silhouette_values,
+ facecolor=color,
+ edgecolor=color,
+ alpha=0.7,
+ )
+
+ # Label the silhouette plots with their cluster numbers at the middle
+ ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
+
+ # Compute the new y_lower for next plot
+ y_lower = y_upper + 10 # 10 for the 0 samples
+
+ ax1.set_title("The silhouette plot for the various clusters.")
+ ax1.set_xlabel("The silhouette coefficient values")
+ ax1.set_ylabel("Cluster label")
+
+ # The vertical line for average silhouette score of all the values
+ ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
+
+ ax1.set_yticks([]) # Clear the yaxis labels / ticks
+ ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
+
+ # 2nd Plot showing the actual clusters formed
+ colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
+ ax2.scatter(data.iloc[:, 0], data.iloc[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k")
+
+ # Labeling the clusters
+ centers = cluster_centers_
+ # Draw white circles at cluster centers
+ ax2.scatter(
+ centers[:, 0],
+ centers[:, 1],
+ marker="o",
+ c="white",
+ alpha=1,
+ s=200,
+ edgecolor="k",
+ )
+
+ for i, c in enumerate(centers):
+ ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")
+
+ ax2.set_title("The visualization of the clustered data.")
+ ax2.set_xlabel("Feature space for the 1st feature")
+ ax2.set_ylabel("Feature space for the 2nd feature")
+ plt.suptitle(
+ f"Silhouette analysis for KMeans clustering on sample data with n_clusters = %d - {algorithm_name}" % n_clusters,
+ fontsize=14,
+ fontweight="bold",
+ )
+
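
A rough usage sketch of the silhouette diagram above, on toy data (the real workflow passes the user-selected feature DataFrame and the fitted model's attributes):

    import pandas as pd
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=500, centers=4, random_state=42)
    data = pd.DataFrame(X, columns=["feature_1", "feature_2"])
    kmeans = KMeans(n_clusters=4, n_init=10, random_state=42).fit(data)

    # labels_ is an ndarray; it indexes and colours correctly in the function above
    plot_silhouette_diagram_kmeans(data, kmeans.labels_, kmeans.cluster_centers_, 4, "KMeans")
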
+
+def scatter2d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name: str) -> None:
+ plt.figure()
+ plt.subplot(111)
+ plt.scatter(data.iloc[:, 0], data.iloc[:, 1], c=cluster_labels)
+
+ plt.xlabel(f"{data.columns[0]}")
+ plt.ylabel(f"{data.columns[1]}")
+ plt.title(f"Cluster Data Bi-plot - {algorithm_name}")
+
+
+def scatter3d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name: str) -> None:
+    namelist = data.columns.values.tolist()
+    fig = plt.figure(figsize=(12, 6), facecolor="w")
+ plt.subplots_adjust(left=0.05, right=0.95, bottom=0.05, top=0.9)
+
+ ax = fig.add_subplot(121, projection="3d")
+ ax.scatter(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2], alpha=0.3, c="#FF0000", s=6)
+ ax.set_xlabel(namelist[0])
+ ax.set_ylabel(namelist[1])
+ ax.set_zlabel(namelist[2])
+ plt.grid(True)
+
+ ax2 = fig.add_subplot(122, projection="3d")
+ ax2.scatter(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2], c=cluster_labels, s=6, cmap=plt.cm.Paired, edgecolors="none")
+ ax2.set_xlabel(namelist[0])
+ ax2.set_ylabel(namelist[1])
+ ax2.set_zlabel(namelist[2])
+ plt.grid(True)
+ ax.set_title(f"Base Data Tri-plot - {algorithm_name}")
+ ax2.set_title(f"Cluster Data Tri-plot - {algorithm_name}")
diff --git a/geochemistrypi/data_mining/model/func/algo_decomposition/_mds.py b/geochemistrypi/data_mining/model/func/algo_decomposition/_mds.py
index 30ec1918..9a1abb9c 100644
--- a/geochemistrypi/data_mining/model/func/algo_decomposition/_mds.py
+++ b/geochemistrypi/data_mining/model/func/algo_decomposition/_mds.py
@@ -17,7 +17,7 @@ def mds_manual_hyper_parameters() -> Dict:
print("Please specify the number of components to retain. A good starting range could be between 2 and 10, such as 4.")
n_components = num_input(SECTION[2], "N Components: ")
print("Metric: This parameter specifies the metric to be used when calculating distance between instances in a feature array.")
- print("Please specify whether the metric is used when measuring the pairwise distances between data points in the input space. It is generally recommended to leave it as True.")
+ print("Please specify whether the metric is used when measuring the pairwise distances between data points in the input space. It is generally recommended to leave it set to True.")
metric = bool_input(SECTION[2])
print("N Init: This parameter specifies the number of times the SMACOF algorithm will be run with different initializations.")
print("Please specify the number of times. A good starting range could be between 1 and 10, such as 4.")
diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_extra_tree.py b/geochemistrypi/data_mining/model/func/algo_regression/_extra_tree.py
index 4476f892..d440b274 100644
--- a/geochemistrypi/data_mining/model/func/algo_regression/_extra_tree.py
+++ b/geochemistrypi/data_mining/model/func/algo_regression/_extra_tree.py
@@ -35,7 +35,7 @@ def extra_trees_manual_hyper_parameters() -> Dict:
"Bootstrap: Whether bootstrap samples are used when building trees. Bootstrapping is a technique where a random subset of the data is sampled with replacement"
" to create a new dataset of the same size as the original. This new dataset is then used to construct a decision tree in the ensemble. If False, the whole dataset is used to build each tree."
)
- print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it as True.")
+ print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it set to True.")
bootstrap = bool_input(SECTION[2])
max_samples = None
if bootstrap:
@@ -46,7 +46,7 @@ def extra_trees_manual_hyper_parameters() -> Dict:
"oob_score: Whether to use out-of-bag samples to estimate the generalization accuracy. When the oob_score hyperparameter is set to True, Extra Trees will use a random subset of the data"
" to train each decision tree in the ensemble, and the remaining data that was not used for training (the out-of-bag samples) will be used to calculate the OOB score. "
)
- print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it as True.")
+ print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it set to True.")
oob_score = bool_input(SECTION[2])
hyper_parameters = {
"n_estimators": n_estimators,
diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_lasso_regression.py b/geochemistrypi/data_mining/model/func/algo_regression/_lasso_regression.py
index 9b450f19..a8a1d293 100644
--- a/geochemistrypi/data_mining/model/func/algo_regression/_lasso_regression.py
+++ b/geochemistrypi/data_mining/model/func/algo_regression/_lasso_regression.py
@@ -18,7 +18,7 @@ def lasso_regression_manual_hyper_parameters() -> Dict:
print("Please indicate the coefficient of alpha. A good starting range could be between 0.001 and 2, such as 1.")
alpha = float_input(0.01, SECTION[2], "@Alpha: ")
print("Fit Intercept: This hyperparameter represents whether the model is evaluated with constant terms.")
- print("Please indicate whether there is a parameter entry. It is generally recommended to leave it as True.")
+ print("Please indicate whether there is a parameter entry. It is generally recommended to leave it set to True.")
fit_intercept = bool_input(SECTION[2])
print("Max Iter: This hyperparameter represents the maximum number of iterations for the solver to converge.")
print("Please indicate the maximum number of iterations. A good starting range could be between 1000 and 10000, such as 1000.")
@@ -27,7 +27,7 @@ def lasso_regression_manual_hyper_parameters() -> Dict:
print("Please indicate the tolerance. A good starting range could be between 0.0001 and 0.001, such as 0.0001.")
tol = float_input(0.0001, SECTION[2], "@Tolerance: ")
print("Selection: This hyperparameter represents the method of selecting the regularization coefficient.")
- print("Please indicate the method of selecting the regularization coefficient. It is generally recommended to leave it as 'cyclic'.")
+ print("Please indicate the method of selecting the regularization coefficient. It is generally recommended to leave it set to 'cyclic'.")
selections = ["cyclic", "random"]
selection = str_input(selections, SECTION[2])
hyper_parameters = {
diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_linear_regression.py b/geochemistrypi/data_mining/model/func/algo_regression/_linear_regression.py
index 2cd39529..ae806d03 100644
--- a/geochemistrypi/data_mining/model/func/algo_regression/_linear_regression.py
+++ b/geochemistrypi/data_mining/model/func/algo_regression/_linear_regression.py
@@ -19,7 +19,7 @@ def linear_regression_manual_hyper_parameters() -> Dict:
hyper_parameters : dict
"""
print("Fit Intercept: This hyperparameter specifies whether to calculate the intercept (also called the bias term) for this model.")
- print("Please specify whether to calculate the intercept for this model. It is generally recommended to leave it as True.")
+ print("Please specify whether to calculate the intercept for this model. It is generally recommended to leave it set to True.")
fit_intercept = bool_input(SECTION[2])
hyper_parameters = {"fit_intercept": fit_intercept}
return hyper_parameters
diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_multi_layer_perceptron.py b/geochemistrypi/data_mining/model/func/algo_regression/_multi_layer_perceptron.py
index 8d805288..e7769eb5 100644
--- a/geochemistrypi/data_mining/model/func/algo_regression/_multi_layer_perceptron.py
+++ b/geochemistrypi/data_mining/model/func/algo_regression/_multi_layer_perceptron.py
@@ -18,18 +18,18 @@ def multi_layer_perceptron_manual_hyper_parameters() -> Dict:
print("Please specify the size of hidden layer and the number of neurons in the each hidden layer.")
hidden_layer = tuple_input((50, 25, 5), SECTION[2], "@Hidden Layer Sizes: ")
print("Activation: Activation function for the hidden layer.")
- print("Please specify the activation function for the hidden layer. It is generally recommended to leave it as 'ReLU'.")
+ print("Please specify the activation function for the hidden layer. It is generally recommended to leave it set to ReLU.")
activations = ["identity", "logistic", "tanh", "relu"]
activation = str_input(activations, SECTION[2])
print("Solver: The solver for weight optimization.")
- print("Please specify the solver for weight optimization. It is generally recommended to leave it as 'Adam'.")
+ print("Please specify the solver for weight optimization. It is generally recommended to leave it set to Adam.")
solvers = ["lbfgs", "sgd", "adam"]
solver = str_input(solvers, SECTION[2])
print("Alpha: L2 penalty (regularization term) parameter.")
print("Please specify the L2 penalty (regularization term) parameter. A good starting range could be between 0.0001 and 10, such as 0.0001.")
alpha = float_input(0.0001, SECTION[2], "@Alpha: ")
print("Learning Rate: It controls the step-size in updating the weights.")
- print("Please specify the learning rate. It is generally recommended to leave it as 'Adaptive'.")
+ print("Please specify the learning rate. It is generally recommended to leave it set to Adaptive.")
learning_rates = ["constant", "invscaling", "adaptive"]
learning_rate = str_input(learning_rates, SECTION[2])
print("Max Iterations: Maximum number of iterations. The solver iterates until convergence (determined by 'tol') or this number of iterations.")
diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_polynomial_regression.py b/geochemistrypi/data_mining/model/func/algo_regression/_polynomial_regression.py
index 55a0f13c..63818221 100644
--- a/geochemistrypi/data_mining/model/func/algo_regression/_polynomial_regression.py
+++ b/geochemistrypi/data_mining/model/func/algo_regression/_polynomial_regression.py
@@ -18,10 +18,10 @@ def polynomial_regression_manual_hyper_parameters() -> Dict:
print("Please specify the degree of the polynomial features. A good starting range could be between 1 and 5, such as 2.")
degree = num_input(SECTION[2], "@Degree: ")
print("Interaction Only: This hyperparameter specifies whether to only include interaction features.")
- print("Please specify whether to only include interaction features. It is generally recommended to leave it as False.")
+ print("Please specify whether to only include interaction features. It is generally recommended to leave it set to False.")
interaction_only = bool_input(SECTION[2])
print("Include Bias: This hyperparameter specifies whether to include a bias (also called the intercept) term in the model.")
- print("Please specify whether to include a bias term in the model. It is generally recommended to leave it as True.")
+ print("Please specify whether to include a bias term in the model. It is generally recommended to leave it set to True.")
include_bias = bool_input(SECTION[2])
hyper_parameters = {"degree": degree, "interaction_only": interaction_only, "include_bias": include_bias}
return hyper_parameters
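
These three hyper-parameters are passed to scikit-learn's PolynomialFeatures before a linear model is fitted on the expanded features; a rough sketch of that pairing (not the workflow's exact wiring; X_train and y_train are placeholders):

    from sklearn.linear_model import LinearRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import PolynomialFeatures

    poly_reg = make_pipeline(
        PolynomialFeatures(degree=2, interaction_only=False, include_bias=True),
        LinearRegression(),
    )
    poly_reg.fit(X_train, y_train)
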
diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_rf.py b/geochemistrypi/data_mining/model/func/algo_regression/_rf.py
index 03bd7039..8cf08715 100644
--- a/geochemistrypi/data_mining/model/func/algo_regression/_rf.py
+++ b/geochemistrypi/data_mining/model/func/algo_regression/_rf.py
@@ -35,7 +35,7 @@ def random_forest_manual_hyper_parameters() -> Dict:
"Bootstrap: Whether bootstrap samples are used when building trees. Bootstrapping is a technique where a random subset of the data is sampled with replacement"
" to create a new dataset of the same size as the original. This new dataset is then used to construct a decision tree in the ensemble. If False, the whole dataset is used to build each tree."
)
- print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it as True.")
+ print("Please specify whether bootstrap samples are used when building trees. It is generally recommended to leave it set to True.")
bootstrap = bool_input(SECTION[2])
max_samples = None
if bootstrap:
@@ -46,7 +46,7 @@ def random_forest_manual_hyper_parameters() -> Dict:
"oob_score: Whether to use out-of-bag samples to estimate the generalization accuracy. When the oob_score hyperparameter is set to True, Extra Trees will use a random subset of the data"
" to train each decision tree in the ensemble, and the remaining data that was not used for training (the out-of-bag samples) will be used to calculate the OOB score. "
)
- print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it as True.")
+ print("Please specify whether to use out-of-bag samples to estimate the generalization accuracy. It is generally recommended to leave it set to True.")
oob_score = bool_input(SECTION[2])
hyper_parameters = {
"n_estimators": n_estimators,
diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_svr.py b/geochemistrypi/data_mining/model/func/algo_regression/_svr.py
index 0f120f5d..13bed965 100644
--- a/geochemistrypi/data_mining/model/func/algo_regression/_svr.py
+++ b/geochemistrypi/data_mining/model/func/algo_regression/_svr.py
@@ -15,7 +15,7 @@ def svr_manual_hyper_parameters() -> Dict:
hyper_parameters : dict
"""
print("Kernel: This hyperparameter specifies the kernel function to be used for mapping the input data to a higher-dimensional feature space.")
- print("Please specify the kernel type to be used in the algorithm. It is generally recommended to leave it as 'Radial Basis Function (RBF) Kernel'.")
+ print("Please specify the kernel type to be used in the algorithm. It is generally recommended to leave it set to Radial basis function (RBF) kernel.")
kernels = ["linear", "poly", "rbf", "sigmoid"]
kernel = str_input(kernels, SECTION[2])
degree = None
@@ -41,7 +41,7 @@ def svr_manual_hyper_parameters() -> Dict:
C = float_input(1, SECTION[2], "@C: ")
print("Shrinking: This hyperparameter specifies whether to use the shrinking heuristic.")
print("The shrinking heuristic is a technique that speeds up the training process by only considering the support vectors in the decision function.")
- print("Please specify whether to use the shrinking heuristic. It is generally recommended to leave it as True.")
+ print("Please specify whether to use the shrinking heuristic. It is generally recommended to leave it set to True.")
shrinking = bool_input(SECTION[2])
hyper_parameters = {"kernel": kernel, "C": C, "shrinking": shrinking}
if not degree:
diff --git a/geochemistrypi/data_mining/model/regression.py b/geochemistrypi/data_mining/model/regression.py
index 2c7d91f5..b17aacae 100644
--- a/geochemistrypi/data_mining/model/regression.py
+++ b/geochemistrypi/data_mining/model/regression.py
@@ -545,6 +545,7 @@ def __init__(
self.base_score = base_score
self.missing = missing
self.num_parallel_tree = num_parallel_tree
+ self.random_state = random_state
self.n_jobs = n_jobs
self.monotone_constraints = monotone_constraints
self.interaction_constraints = interaction_constraints
@@ -558,10 +559,6 @@ def __init__(
if kwargs:
self.kwargs = kwargs
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = xgboost.XGBRegressor(
n_estimators=self.n_estimators,
objective=self.objective,
@@ -821,16 +818,11 @@ def __init__(
self.min_samples_leaf = (min_samples_leaf,)
self.min_weight_fraction_leaf = (min_weight_fraction_leaf,)
self.max_features = (max_features,)
+ self.random_state = (random_state,)
self.max_leaf_nodes = (max_leaf_nodes,)
self.min_impurity_decrease = (min_impurity_decrease,)
- self.ccp_alpha = (ccp_alpha,)
-
- if random_state:
- self.random_state = (random_state,)
- else:
- self.random_state = (self.random_state,)
+ self.ccp_alpha = ccp_alpha
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = DecisionTreeRegressor(
criterion=self.criterion[0],
splitter=self.splitter[0],
@@ -842,7 +834,7 @@ def __init__(
random_state=self.random_state[0],
max_leaf_nodes=self.max_leaf_nodes[0],
min_impurity_decrease=self.min_impurity_decrease[0],
- ccp_alpha=self.ccp_alpha[0],
+ ccp_alpha=self.ccp_alpha,
)
self.naming = DecisionTreeRegression.name
self.customized = True
@@ -1143,15 +1135,12 @@ def __init__(
self.bootstrap = bootstrap
self.oob_score = oob_score
self.n_jobs = n_jobs
+ self.random_state = random_state
self.verbose = verbose
self.warm_start = warm_start
self.ccp_alpha = ccp_alpha
self.max_samples = max_samples
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = ExtraTreesRegressor(
n_estimators=self.n_estimators,
criterion=self.criterion,
@@ -1257,7 +1246,7 @@ def __init__(
bootstrap: bool = True,
oob_score: bool = False,
n_jobs: int = None,
- random_state: Optional[int] = None,
+ random_state: int = None,
verbose: int = 0,
warm_start: bool = False,
# class_weight=None,
@@ -1441,16 +1430,13 @@ def __init__(
self.bootstrap = bootstrap
self.oob_score = oob_score
self.n_jobs = n_jobs
+ self.random_state = random_state
self.verbose = verbose
self.warm_start = warm_start
# self.class_weight = class_weight
self.ccp_alpha = ccp_alpha
self.max_samples = max_samples
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = RandomForestRegressor(
n_estimators=self.n_estimators,
criterion=self.criterion,
@@ -1889,6 +1875,7 @@ def __init__(
self.learning_rate_init = learning_rate_init
self.max_iter = max_iter
self.shuffle = shuffle
+ self.random_state = random_state
self.tol = tol
self.verbose = verbose
self.warm_start = warm_start
@@ -1899,10 +1886,6 @@ def __init__(
self.epsilon = epsilon
self.n_iter_no_change = n_iter_no_change
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = MLPRegressor(
hidden_layer_sizes=self.hidden_layer_sizes,
activation=self.activation,
@@ -2564,6 +2547,7 @@ def __init__(
self.max_depth = max_depth
self.min_impurity_decrease = min_impurity_decrease
self.init = init
+ self.random_state = random_state
self.max_features = max_features
self.alpha = alpha
self.verbose = verbose
@@ -2574,10 +2558,6 @@ def __init__(
self.tol = tol
self.ccp_alpha = ccp_alpha
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = GradientBoostingRegressor(
loss=self.loss,
learning_rate=self.learning_rate,
@@ -2786,12 +2766,9 @@ def __init__(
self.tol = tol
self.warm_start = warm_start
self.positive = positive
+ self.random_state = random_state
self.selection = selection
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = Lasso(
alpha=self.alpha,
fit_intercept=self.fit_intercept,
@@ -3110,12 +3087,9 @@ def __init__(
self.tol = tol
self.warm_start = warm_start
self.positive = positive
+ self.random_state = random_state
self.selection = selection
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = ElasticNet(
alpha=self.alpha,
l1_ratio=self.l1_ratio,
@@ -3520,6 +3494,7 @@ def __init__(
self.shuffle = shuffle
self.verbose = verbose
self.epsilon = epsilon
+ self.random_state = random_state
self.learning_rate = learning_rate
self.eta0 = eta0
self.power_t = power_t
@@ -3529,10 +3504,6 @@ def __init__(
self.warm_start = warm_start
self.average = average
- if random_state:
- self.random_state = random_state
-
- # If 'random_state' is None, 'self.random_state' comes from the parent class 'WorkflowBase'
self.model = SGDRegressor(
loss=self.loss,
penalty=self.penalty,
diff --git a/geochemistrypi/start_cli_pipeline.py b/geochemistrypi/start_cli_pipeline.py
index b3834d90..3f6fbac5 100644
--- a/geochemistrypi/start_cli_pipeline.py
+++ b/geochemistrypi/start_cli_pipeline.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
from data_mining.cli_pipeline import cli_pipeline
-# Used for internal testing, run in debug mode in IDE to inspect the pipeline
+# Used for internal testing
cli_pipeline("", "")