Merge pull request #287 from ZJUEarthData/web

build: v0.4.0
ZJUEarthData · Dec 15, 2023 · 559869b
2 parents 48f0e9e + 2d7a697
commit 559869b
Show file tree

Hide file tree

Showing 13 changed files with 98 additions and 39 deletions.
diff --git a/geochemistrypi/_version.py b/geochemistrypi/_version.py
@@ -1 +1 @@
-__version__ = "0.3.0"
+__version__ = "0.4.0"
diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py
@@ -265,7 +265,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
         del data_selected
         clear_output()
     else:
-        # if the selected data set doesn't need imputation, which means there are no missing values.
+        # If the selected data set doesn't need imputation, which means there are no missing values.
         imputation_config = {}
         data_selected_imputed = data_selected
 

diff --git a/geochemistrypi/data_mining/data/dataset/InferenceData_Classification.xlsx b/geochemistrypi/data_mining/data/dataset/InferenceData_Classification.xlsx
diff --git a/geochemistrypi/data_mining/data/dataset/InferenceData_Regression.xlsx b/geochemistrypi/data_mining/data/dataset/InferenceData_Regression.xlsx
diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py
@@ -56,6 +56,7 @@ def __init__(self) -> None:
         # These two attributes are used for the customized models of FLAML framework
         self.customized = False
         self.customized_name = None
+        self.mode = "Classification"
 
     @dispatch(object, object)
     def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None:
@@ -2995,7 +2996,8 @@ class SGDClassification(LinearWorkflowMixin, ClassificationWorkflowBase):
     """The automation workflow of using Stochastic Gradient Descent - SGD algorithm to make insightful products."""
 
     name = "Stochastic Gradient Descent"
-    special_function = ["SGD Formula"]
+    # special_function = ["SGD Formula"]
+    special_function = []
 
     def __init__(
         self,
@@ -3315,25 +3317,25 @@ def manual_hyper_parameters(cls) -> Dict:
     @dispatch()
     def special_components(self, **kwargs) -> None:
         """Invoke all special application functions for this algorithms by Scikit-learn framework."""
-        GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
-        self._show_formula(
-            coef=[self.model.coef_],
-            intercept=self.model.intercept_,
-            features_name=SGDClassification.X_train.columns,
-            algorithm_name=self.naming,
-            local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
-            mlflow_path="root",
-        )
+        # GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
+        # self._show_formula(
+        #     coef=[self.model.coef_],
+        #     intercept=self.model.intercept_,
+        #     features_name=SGDClassification.X_train.columns,
+        #     algorithm_name=self.naming,
+        #     local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
+        #     mlflow_path="root",
+        # )
 
     @dispatch(bool)
     def special_components(self, is_automl: bool = False, **kwargs) -> None:
         """Invoke all special application functions for this algorithms by FLAML framework."""
-        GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
-        self._show_formula(
-            coef=self.auto_model.coef_,
-            intercept=self.auto_model.intercept_,
-            features_name=SGDClassification.X.columns,
-            algorithm_name=self.naming,
-            local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
-            mlflow_path="root",
-        )
+        # GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
+        # self._show_formula(
+        #     coef=self.auto_model.coef_,
+        #     intercept=self.auto_model.intercept_,
+        #     features_name=SGDClassification.X.columns,
+        #     algorithm_name=self.naming,
+        #     local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
+        #     mlflow_path="root",
+        # )
diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py
@@ -14,7 +14,7 @@
 from ._base import WorkflowBase
 from .func.algo_clustering._common import plot_results, plot_silhouette_diagram, score
 from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters, dbscan_result_plot
-from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters, scatter2d, scatter3d
+from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters, plot_silhouette_diagram_kmeans, scatter2d, scatter3d
 
 
 class ClusteringWorkflowBase(WorkflowBase):
@@ -25,6 +25,7 @@ class ClusteringWorkflowBase(WorkflowBase):
     def __init__(self):
         super().__init__()
         self.clustering_result = None
+        self.mode = "Clustering"
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None:
         """Fit the model according to the given training data."""
@@ -93,14 +94,14 @@ def common_components(self) -> None:
             algorithm_name=self.naming,
             store_path=GEOPI_OUTPUT_METRICS_PATH,
         )
-        self._plot_results(
-            data=self.X,
-            labels=self.clustering_result["clustering result"],
-            cluster_centers_=self.get_cluster_centers(),
-            algorithm_name=self.naming,
-            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
-            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
-        )
+        # self._plot_results(
+        #     data=self.X,
+        #     labels=self.clustering_result["clustering result"],
+        #     cluster_centers_=self.get_cluster_centers(),
+        #     algorithm_name=self.naming,
+        #     local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+        #     mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+        # )
         self._plot_silhouette_diagram(
             data=self.X,
             labels=self.clustering_result["clustering result"],
@@ -226,6 +227,15 @@ def __init__(
 
         self.naming = KMeansClustering.name
 
+    def _get_inertia_scores(self, algorithm_name: str, store_path: str) -> None:
+        """Get the scores of the clustering result."""
+        print("-----* KMeans Inertia Scores *-----")
+        print("Inertia Score: ", self.model.inertia_)
+        inertia_scores = {"Inertia Score": self.model.inertia_}
+        mlflow.log_metrics(inertia_scores)
+        inertia_scores_str = json.dumps(inertia_scores, indent=4)
+        save_text(inertia_scores_str, f"KMeans Inertia Scores - {algorithm_name}", store_path)
+
     @classmethod
     def manual_hyper_parameters(cls) -> Dict:
         """Manual hyper-parameters specification."""
@@ -234,6 +244,25 @@ def manual_hyper_parameters(cls) -> Dict:
         clear_output()
         return hyper_parameters
 
+    @staticmethod
+    def _plot_silhouette_diagram_kmeans(
+        data: pd.DataFrame,
+        cluster_labels: pd.DataFrame,
+        cluster_centers_: np.ndarray,
+        n_clusters: int,
+        algorithm_name: str,
+        local_path: str,
+        mlflow_path: str,
+    ) -> None:
+        """Plot the silhouette diagram of the clustering result."""
+        print("-----* KMeans's Silhouette Diagram *-----")
+        plot_silhouette_diagram_kmeans(data, cluster_labels, cluster_centers_, n_clusters, algorithm_name)
+        save_fig(f"KMeans's Silhouette Diagram - {algorithm_name}", local_path, mlflow_path)
+        data_with_labels = pd.concat([data, cluster_labels], axis=1)
+        save_data(data_with_labels, "KMeans's Silhouette Diagram - Data With Labels", local_path, mlflow_path)
+        cluster_center_data = pd.DataFrame(cluster_centers_, columns=data.columns)
+        save_data(cluster_center_data, "KMeans's Silhouette Diagram - Cluster Centers", local_path, mlflow_path)
+
     @staticmethod
     def _scatter2d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
         """Plot the two-dimensional diagram of the clustering result."""
@@ -254,7 +283,21 @@ def _scatter3d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name:
 
     def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
         """Invoke all special application functions for this algorithms by Scikit-learn framework."""
+        GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH")
         GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        self._get_inertia_scores(
+            algorithm_name=self.naming,
+            store_path=GEOPI_OUTPUT_METRICS_PATH,
+        )
+        self._plot_silhouette_diagram_kmeans(
+            data=self.X,
+            cluster_labels=self.clustering_result["clustering result"],
+            cluster_centers_=self.get_cluster_centers(),
+            n_clusters=self.n_clusters,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
+            mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
+        )
 
         # Draw graphs when the number of principal components > 3
         if self.X.shape[1] >= 3:

diff --git a/geochemistrypi/data_mining/model/decomposition.py b/geochemistrypi/data_mining/model/decomposition.py
@@ -26,6 +26,7 @@ def __init__(self) -> None:
 
         # the extra attributes that decomposition algorithm needs
         self.X_reduced = None
+        self.mode = "Decomposition"
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None:
         """Fit the model."""

diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_common.py b/geochemistrypi/data_mining/model/func/algo_classification/_common.py
@@ -6,8 +6,6 @@
 import mlflow
 import numpy as np
 import pandas as pd
-from data_mining.constants import CALCULATION_METHOD_OPTION, SECTION
-from data_mining.data.data_readiness import limit_num_input, num2option, num_input
 from imblearn.over_sampling import RandomOverSampler
 from imblearn.pipeline import Pipeline
 from imblearn.under_sampling import RandomUnderSampler
@@ -16,6 +14,9 @@
 from sklearn.model_selection import cross_validate
 from sklearn.preprocessing import LabelEncoder
 
+from ....constants import CALCULATION_METHOD_OPTION, SECTION
+from ....data.data_readiness import limit_num_input, num2option, num_input
+
 
 def score(y_true: pd.DataFrame, y_predict: pd.DataFrame) -> tuple[str, Dict]:
     """Calculate the scores of the classification model.

diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_sgd_classification.py b/geochemistrypi/data_mining/model/func/algo_classification/_sgd_classification.py
@@ -58,9 +58,14 @@ def sgd_classificaiton_manual_hyper_parameters() -> Dict:
     early_stopping = bool_input(SECTION[2])
 
     print("Validation Fraction: The proportion of training data to set aside as validation set for early stopping.")
-    print("A good starting value could be between 0.000001 and 1, such as 0.1. The default is 0.1.")
-    validation_fraction = float_input(0.1, SECTION[2], "@Validation Fraction: ")
-
+    print("It must be in range (0, 1). A good starting value could be between 0.000001 and 1, such as 0.1. The default is 0.1.")
+    is_valid = False
+    while not is_valid:
+        validation_fraction = float_input(0.1, SECTION[2], "@Validation Fraction: ")
+        if 0 < validation_fraction < 1:
+            is_valid = True
+        else:
+            print("The validation fraction must be in range (0, 1).")
     print("Number of Iterations With No Improvement: Number of iterations with no improvement to wait before stopping fitting.")
     print("A good starting value could be between 1 and maximum number of iterations, such as 5. The default is 5.")
     n_iter_no_change = num_input(SECTION[2], "@Iterations With No Improvement: ")

diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py b/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py
@@ -40,7 +40,7 @@ def kmeans_manual_hyper_parameters() -> Dict:
     return hyper_parameters
 
 
-def plot_silhouette_diagram(data: pd.DataFrame, cluster_labels: pd.DataFrame, cluster_centers_: np.ndarray, n_clusters: int, algorithm_name: str) -> None:
+def plot_silhouette_diagram_kmeans(data: pd.DataFrame, cluster_labels: pd.DataFrame, cluster_centers_: np.ndarray, n_clusters: int, algorithm_name: str) -> None:
     """
     Draw the silhouette diagram for analysis.
 

diff --git a/geochemistrypi/data_mining/model/func/algo_decomposition/_tsne.py b/geochemistrypi/data_mining/model/func/algo_decomposition/_tsne.py
@@ -24,8 +24,14 @@ def tsne_manual_hyper_parameters() -> Dict:
     print("Please specify the learning rate. A good starting range could be between 10 and 1000, such as 200.")
     learning_rate = float_input(200, SECTION[2], "Learning Rate: ")
     print("Number of Iterations: This parameter controls how many iterations the optimization will run for.")
-    print("Please specify the number of iterations. A good starting range could be between 250 and 1000, such as 500.")
-    n_iter = num_input(SECTION[2], "Number of Iterations: ")
+    print("Please specify the number of iterations. A good starting range could be between 250 and 1000, such as 500. The minimum is 250.")
+    is_valid = False
+    while not is_valid:
+        n_iter = num_input(SECTION[2], "Number of Iterations: ")
+        if n_iter >= 250:
+            is_valid = True
+        else:
+            print("Please enter a number greater than or equal to 250.")
     print("Early Exaggeration: This parameter controls how tight natural clusters in the original space are in the embedded space and how much space will be between them.")
     print("Please specify the early exaggeration. A good starting range could be between 5 and 50, such as 12.")
     early_exaggeration = float_input(12, SECTION[2], "Early Exaggeration: ")

diff --git a/geochemistrypi/data_mining/model/regression.py b/geochemistrypi/data_mining/model/regression.py
@@ -47,6 +47,7 @@ def __init__(self) -> None:
         # These two attributes are used for the customized models of FLAML framework
         self.customized = False
         self.customized_name = None
+        self.mode = "Regression"
 
     @dispatch(object, object)
     def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None:

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "geochemistrypi"
-version = "0.4.0.dev2"
+version = "0.4.0"
 authors = [
   { name="Can He", email="[email protected]" },
 ]