From 75ec881b640222fe4a593366f5a8a11d1ed79d2d Mon Sep 17 00:00:00 2001
From: sanyhe <sanyhew1097618435@163.com>
Date: Sun, 5 Nov 2023 21:11:25 +0800
Subject: [PATCH] feat: add feature scaling for unsupervised learning.

---
 README.md                                     | 12 +++-
 geochemistrypi/data_mining/cli_pipeline.py    | 57 +++++++++++++------
 geochemistrypi/data_mining/data/inference.py  |  9 +--
 .../data_mining/model/classification.py       |  8 +--
 .../model/func/algo_clustering/_dbscan.py     |  4 +-
 .../model/func/algo_clustering/_kmeans.py     |  4 +-
 .../model/func/algo_decomposition/_pca.py     |  2 +-
 .../data_mining/model/regression.py           | 12 ++--
 .../data_mining/process/classify.py           |  8 +--
 geochemistrypi/data_mining/process/regress.py |  8 +--
 10 files changed, 76 insertions(+), 48 deletions(-)
diff --git a/README.md b/README.md
index 95b36e9a..2cbb2e92 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,9 @@ The following figure is the simplified overview of Geochemistry π: <br>
 
 The following figure is the frontend-backend separation architecture of Geochemistry: <br>
 
-![Frontend-backend separation architecture of Geochemistry](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/3b27cbdb-ff50-4fa6-b1d1-4c75b253fdff)
+<div align="center">
+  <img src="https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/3b27cbdb-ff50-4fa6-b1d1-4c75b253fdff" alt="Frontend-backend separation architecture of Geochemistry" width="400" />
+</div>
 
 ## Quick Installation
 
@@ -149,7 +151,9 @@ The following figure is the system architecture diagram: <br>
 
 The following figure is the customized automated ML pipeline: <br>
 
-![Customized automated ML pipeline](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/07078b43-30bd-46cf-abad-2da509fae6aa)
+<div align="center">
+  <img src="https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/07078b43-30bd-46cf-abad-2da509fae6aa" alt="Customized automated ML pipeline" width="400" />
+</div>
 
 The following figure is the design pattern hierarchical architecture: <br>
 
@@ -158,7 +162,9 @@ The following figure is the design pattern hierarchical architecture: <br>
 
 The following figure is the storage mechanism: <br>
 
-![Storage Mechanism](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/401f3429-c44f-4b76-b085-7a9dcc987cde)
+<div align="center">
+  <img src="https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/401f3429-c44f-4b76-b085-7a9dcc987cde" alt="Storage Mechanism" width="500" />
+</div>
 
 The whole package is under construction and the documentation is progressively evolving.
 
diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py
index 3b213887..11df08da 100644
--- a/geochemistrypi/data_mining/cli_pipeline.py
+++ b/geochemistrypi/data_mining/cli_pipeline.py
@@ -67,7 +67,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
 
     # <-- User Data Loading -->
     with console.status("[bold green]Data Loading...[/bold green]", spinner="dots"):
-        sleep(1.5)
+        sleep(1)
     if training_data_path:
         # If the user provides file name, then load the data from the file.
         data = read_data(file_path=training_data_path, is_own_data=1)
@@ -167,7 +167,9 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
         print(f"Successfully loading the built-in training data set '{training_data_path}'.")
         show_data_columns(data.columns)
         clear_output()
-    # If the user doesn't provide the inference data path, then use the built-in data.
+    # If the user doesn't provide an inference data path and the training data is built-in,
+    # then use the built-in data set as the inference data. Otherwise, the inference data is None,
+    # which means that the user doesn't want to run the model inference.
     if (not inference_data_path) and is_built_in_data:
         print("-*-*- Inference Data -*-*-")
         if built_in_data_num == 1:
@@ -238,9 +240,6 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
             test="kruskal",
             confidence=0.05,
         )
-        # print("The statistics test method: Kruskal Wallis Test")
-        # monte_carlo_simulator(data_processed, data_processed_imputed, sample_size=50,
-        #                       iteration=100, test='kruskal', confidence=0.05)
         probability_plot(data_selected.columns, data_selected, data_selected_imputed)
         basic_info(data_selected_imputed)
         basic_statistic(data_selected_imputed)
@@ -272,6 +271,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
     # divide X and y data set when it is supervised learning
     logger.debug("Data Split")
     if mode_num == 1 or mode_num == 2:
+        # Supervised learning
         print("-*-*- Data Split - X Set and Y Set -*-*-")
         print("Divide the processing data set into X (feature value) and Y (target value) respectively.")
         # create X data set
@@ -356,11 +356,32 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
         del data_selected_imputed_fe
         clear_output()
     else:
-        # unsupervised learning
-        feature_scaling_config = {}
-        feature_selection_config = {}
+        # Unsupervised learning
+        # Create X data set without data split because it is unsupervised learning
         X = data_selected_imputed_fe
-        X_train = data_selected_imputed_fe
+        # <--- Feature Scaling --->
+        print("-*-*- Feature Scaling on X Set -*-*-")
+        num2option(OPTION)
+        is_feature_scaling = limit_num_input(OPTION, SECTION[1], num_input)
+        if is_feature_scaling == 1:
+            print("Which strategy do you want to apply?")
+            num2option(FEATURE_SCALING_STRATEGY)
+            feature_scaling_num = limit_num_input(FEATURE_SCALING_STRATEGY, SECTION[1], num_input)
+            feature_scaling_config, X_scaled_np = feature_scaler(X, FEATURE_SCALING_STRATEGY, feature_scaling_num - 1)
+            X = np2pd(X_scaled_np, X.columns)
+            del X_scaled_np
+            print("Data Set After Scaling:")
+            print(X)
+            print("Basic Statistical Information: ")
+            basic_statistic(X)
+            save_data(X, "X With Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+        else:
+            feature_scaling_config = {}
+        clear_output()
+
+        feature_selection_config = {}
+        # Create training data without data split because it is unsupervised learning
+        X_train = X
         y, X_test, y_train, y_test = None, None, None, None
 
     # <--- Model Selection --->
@@ -401,11 +422,11 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
     is_inference = False
     # If the model is supervised learning, then allow the user to use model inference.
     if mode_num == 1 or mode_num == 2:
-        print("-*-*- Feature Engineering on Inference Data  -*-*-")
+        print("-*-*- Feature Engineering on Inference Data -*-*-")
         is_inference = True
         selected_columns = X_train.columns
-        # If feature_engineering_config is not {}, then apply feature engineering with the same operation to the input data.
-        if feature_engineering_config:
+        # If feature_engineering_config is not empty and inference_data is not None, then apply the same feature engineering operations to the inference data.
+        if feature_engineering_config and (inference_data is not None):
             print("The same feature engineering operation will be applied to the inference data.")
             new_feature_builder = FeatureConstructor(inference_data)
             inference_data_fe = new_feature_builder.batch_build(feature_engineering_config)
@@ -418,6 +439,8 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
         save_data(inference_data_fe, "Inference Data Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
         save_data(inference_data_fe_selected, "Inference Data Feature-Engineering Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
         clear_output()
+    else:
+        inference_data_fe_selected = None
 
     # <--- Model Training --->
     logger.debug("Model Training")
@@ -439,8 +462,9 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
 
         # <--- Model Inference --->
         logger.debug("Model Inference")
-        model_inference(inference_data_fe_selected, is_inference, feature_engineering_config, run, transformer_config, transform_pipeline)
-        clear_output()
+        if inference_data_fe_selected is not None:
+            model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline)
+            clear_output()
     else:
         # Run all models
         for i in range(len(MODELS) - 1):
@@ -465,6 +489,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
 
                 # <--- Model Inference --->
                 logger.debug("Model Inference")
-                model_inference(inference_data_fe_selected, is_inference, feature_engineering_config, run, transformer_config, transform_pipeline)
-                clear_output()
+                if inference_data_fe_selected is not None:
+                    model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline)
+                    clear_output()
     mlflow.end_run()
diff --git a/geochemistrypi/data_mining/data/inference.py b/geochemistrypi/data_mining/data/inference.py
index 584df5f1..111bb0c7 100644
--- a/geochemistrypi/data_mining/data/inference.py
+++ b/geochemistrypi/data_mining/data/inference.py
@@ -108,7 +108,7 @@ def build_transform_pipeline(imputation_config: Dict, feature_scaling_config: Di
     return transformer_config, transform_pipeline
 
 
-def model_inference(inference_data: pd.DataFrame, is_inference: bool, feature_engineering_config: Dict, run: object, transformer_config: Dict, transform_pipeline: Optional[object] = None):
+def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: object, transformer_config: Dict, transform_pipeline: Optional[object] = None):
     """Run the model inference.
 
     Parameters
@@ -119,9 +119,6 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, feature_en
     is_inference : bool
         Whether to run the model inference.
 
-    feature_engineering_config : Dict
-        The feature engineering configuration.
-
     run : object
         The model selection object.
 
@@ -131,8 +128,8 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, feature_en
     transform_pipeline : Optional[object], optional
         The transform pipeline object. The default is None.
     """
-    # If inference_data is not None and is_inference is True, then run the model inference.
-    if (inference_data is not None) and (is_inference is True):
+    # If is_inference is True, then run the model inference.
+    if is_inference is True:
         print("-*-*- Model Inference -*-*-")
         print("Use the trained model to make predictions on the inference data.")
         # If transformer_config is not {}, then transform the inference data with the transform pipeline.
diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py
index 7f883600..27cb95c7 100644
--- a/geochemistrypi/data_mining/model/classification.py
+++ b/geochemistrypi/data_mining/model/classification.py
@@ -1201,7 +1201,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:
         )
 
 
-class XgboostClassification(TreeWorkflowMixin, ClassificationWorkflowBase):
+class XGBoostClassification(TreeWorkflowMixin, ClassificationWorkflowBase):
     """The automation workflow of using Xgboost algorithm to make insightful products."""
 
     name = "Xgboost"
@@ -1490,7 +1490,7 @@ def __init__(
             early_stopping_rounds=self.early_stopping_rounds,
         )
 
-        self.naming = XgboostClassification.name
+        self.naming = XGBoostClassification.name
 
     @property
     def settings(self) -> Dict:
@@ -1538,7 +1538,7 @@ def special_components(self, **kwargs) -> None:
         #     mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
         # )
         self._plot_feature_importance(
-            X_train=XgboostClassification.X_train,
+            X_train=XGBoostClassification.X_train,
             trained_model=self.model,
             image_config=self.image_config,
             algorithm_name=self.naming,
@@ -1551,7 +1551,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:
         """Invoke all special application functions for this algorithms by FLAML framework."""
         GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
         self._plot_feature_importance(
-            X_train=XgboostClassification.X_train,
+            X_train=XGBoostClassification.X_train,
             trained_model=self.auto_model,
             image_config=self.image_config,
             algorithm_name=self.naming,
diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py b/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py
index 8b557fae..951dca13 100644
--- a/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py
+++ b/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py
@@ -23,11 +23,11 @@ def dbscan_manual_hyper_parameters() -> Dict:
     print("Please specify the number of samples. A good starting range could be between 5 and 20, such as 5.")
     min_samples = num_input(SECTION[2], "Min Samples: ")
     print("Metric: The metric to use when calculating distance between instances in a feature array.")
-    print("Please specify the metric to use when calculating distance between instances in a feature array. It is generally recommended to leave it set to euclidean.")
+    print("Please specify the metric to use when calculating distance between instances in a feature array. It is generally recommended to leave it as 'euclidean'.")
     metrics = ["euclidean", "manhattan", "chebyshev", "minkowski", "cosine", "correlation"]
     metric = str_input(metrics, SECTION[2])
     print("Algorithm: The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors.")
-    print("Please specify the algorithm. It is generally recommended to leave it set to auto.")
+    print("Please specify the algorithm. It is generally recommended to leave it as 'auto'.")
     algorithms = ["auto", "ball_tree", "kd_tree", "brute"]
     algorithm = str_input(algorithms, SECTION[2])
     print("Leaf Size: Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree.")
diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py b/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py
index 60c203c1..e15c3219 100644
--- a/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py
+++ b/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py
@@ -22,7 +22,7 @@ def kmeans_manual_hyper_parameters() -> Dict:
     print("Please specify the number of clusters for KMeans. A good starting range could be between 2 and 10, such as 4.")
     n_clusters = num_input(SECTION[2], "N Clusters: ")
     print("Init: Method for initialization of centroids. The centroids represent the center points of the clusters in the dataset.")
-    print("Please specify the method for initialization of centroids. It is generally recommended to leave it set to k-means++.")
+    print("Please specify the method for initialization of centroids. It is generally recommended to leave it as 'k-means++'.")
     inits = ["k-means++", "random"]
     init = str_input(inits, SECTION[2])
     print("Max Iter: Maximum number of iterations of the k-means algorithm for a single run.")
@@ -32,7 +32,7 @@ def kmeans_manual_hyper_parameters() -> Dict:
     print("Please specify the relative tolerance with regards to inertia to declare convergence. A good starting range could be between 0.0001 and 0.001, such as 0.0005.")
     tol = float_input(0.0005, SECTION[2], "Tolerance: ")
     print("Algorithm: The algorithm to use for the computation.")
-    print("Please specify the algorithm to use for the computation. It is generally recommended to leave it set to auto.")
+    print("Please specify the algorithm to use for the computation. It is generally recommended to leave it as 'auto'.")
     print("Auto: selects 'elkan' for dense data and 'full' for sparse data. 'elkan' is generally faster on data with lower dimensionality, while 'full' is faster on data with higher dimensionality")
     algorithms = ["auto", "full", "elkan"]
     algorithm = str_input(algorithms, SECTION[2])
diff --git a/geochemistrypi/data_mining/model/func/algo_decomposition/_pca.py b/geochemistrypi/data_mining/model/func/algo_decomposition/_pca.py
index 4c2d7aec..13ee9741 100644
--- a/geochemistrypi/data_mining/model/func/algo_decomposition/_pca.py
+++ b/geochemistrypi/data_mining/model/func/algo_decomposition/_pca.py
@@ -21,7 +21,7 @@ def pca_manual_hyper_parameters() -> Dict:
     print("Please specify the number of components to retain. A good starting range could be between 2 and 10, such as 4.")
     n_components = num_input(SECTION[2], "N Components: ")
     print("SVD Solver: This parameter specifies the algorithm used to perform the singular value decomposition.")
-    print("Please specify the algorithm. It is generally recommended to leave it set to auto.")
+    print("Please specify the algorithm. It is generally recommended to leave it as 'auto'.")
     svd_solvers = ["auto", "full", "arpack", "randomized"]
     svd_solver = str_input(svd_solvers, SECTION[2])
     hyper_parameters = {"n_components": n_components, "svd_solver": svd_solver}
diff --git a/geochemistrypi/data_mining/model/regression.py b/geochemistrypi/data_mining/model/regression.py
index e726683e..59e06381 100644
--- a/geochemistrypi/data_mining/model/regression.py
+++ b/geochemistrypi/data_mining/model/regression.py
@@ -308,7 +308,7 @@ def special_components(self, **kwargs) -> None:
         )
 
 
-class XgboostRegression(TreeWorkflowMixin, RegressionWorkflowBase):
+class XGBoostRegression(TreeWorkflowMixin, RegressionWorkflowBase):
     """The automation workflow of using Xgboost algorithm to make insightful products."""
 
     name = "Xgboost"
@@ -591,7 +591,7 @@ def __init__(
             early_stopping_rounds=self.early_stopping_rounds,
         )
 
-        self.naming = XgboostRegression.name
+        self.naming = XGBoostRegression.name
 
     @property
     def settings(self) -> Dict:
@@ -625,7 +625,7 @@ def special_components(self, **kwargs) -> None:
         """Invoke all special application functions for this algorithms by Scikit-learn framework."""
         GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
         self._plot_feature_importance(
-            X_train=XgboostRegression.X_train,
+            X_train=XGBoostRegression.X_train,
             trained_model=self.model,
             image_config=self.image_config,
             algorithm_name=self.naming,
@@ -633,7 +633,7 @@ def special_components(self, **kwargs) -> None:
             mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
         )
         # self._histograms_feature_weights(
-        #     X=XgboostRegression.X,
+        #     X=XGBoostRegression.X,
         #     trained_model=self.model,
         #     image_config=self.image_config,
         #     algorithm_name=self.naming,
@@ -646,7 +646,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:
         """Invoke all special application functions for this algorithms by FLAML framework."""
         GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
         self._plot_feature_importance(
-            X_train=XgboostRegression.X_train,
+            X_train=XGBoostRegression.X_train,
             trained_model=self.auto_model,
             image_config=self.image_config,
             algorithm_name=self.naming,
@@ -654,7 +654,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:
             mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
         )
         # self._histograms_feature_weights(
-        #     X=XgboostRegression.X,
+        #     X=XGBoostRegression.X,
         #     trained_model=self.auto_model,
         #     image_config=self.image_config,
         #     algorithm_name=self.naming,
diff --git a/geochemistrypi/data_mining/process/classify.py b/geochemistrypi/data_mining/process/classify.py
index 9686544b..7239b26f 100644
--- a/geochemistrypi/data_mining/process/classify.py
+++ b/geochemistrypi/data_mining/process/classify.py
@@ -15,7 +15,7 @@
     MLPClassification,
     RandomForestClassification,
     SVMClassification,
-    XgboostClassification,
+    XGBoostClassification,
 )
 from ._base import ModelSelectionBase
 
@@ -80,8 +80,8 @@ def activate(
                 max_samples=hyper_parameters["max_samples"],
             )
         elif self.model_name == "Xgboost":
-            hyper_parameters = XgboostClassification.manual_hyper_parameters()
-            self.clf_workflow = XgboostClassification(
+            hyper_parameters = XGBoostClassification.manual_hyper_parameters()
+            self.clf_workflow = XGBoostClassification(
                 n_estimators=hyper_parameters["n_estimators"],
                 learning_rate=hyper_parameters["learning_rate"],
                 max_depth=hyper_parameters["max_depth"],
@@ -196,7 +196,7 @@ def activate(
         elif self.model_name == "Random Forest":
             self.clf_workflow = RandomForestClassification()
         elif self.model_name == "Xgboost":
-            self.clf_workflow = XgboostClassification()
+            self.clf_workflow = XGBoostClassification()
         elif self.model_name == "Logistic Regression":
             self.clf_workflow = LogisticRegressionClassification()
         elif self.model_name == "Multi-layer Perceptron":
diff --git a/geochemistrypi/data_mining/process/regress.py b/geochemistrypi/data_mining/process/regress.py
index efa34375..10083972 100644
--- a/geochemistrypi/data_mining/process/regress.py
+++ b/geochemistrypi/data_mining/process/regress.py
@@ -21,7 +21,7 @@
     RegressionWorkflowBase,
     SGDRegression,
     SVMRegression,
-    XgboostRegression,
+    XGBoostRegression,
 )
 from ._base import ModelSelectionBase
 
@@ -60,8 +60,8 @@ def activate(
             self.transformer_config.update(poly_config)
             self.reg_workflow.data_upload(X_train=X_train, X_test=X_test)
         elif self.model_name == "Xgboost":
-            hyper_parameters = XgboostRegression.manual_hyper_parameters()
-            self.reg_workflow = XgboostRegression(
+            hyper_parameters = XGBoostRegression.manual_hyper_parameters()
+            self.reg_workflow = XGBoostRegression(
                 n_estimators=hyper_parameters["n_estimators"],
                 learning_rate=hyper_parameters["learning_rate"],
                 max_depth=hyper_parameters["max_depth"],
@@ -229,7 +229,7 @@ def activate(
             self.transformer_config.update(poly_config)
             self.reg_workflow.data_upload(X_train=X_train, X_test=X_test)
         elif self.model_name == "Xgboost":
-            self.reg_workflow = XgboostRegression()
+            self.reg_workflow = XGBoostRegression()
         elif self.model_name == "Decision Tree":
             self.reg_workflow = DecisionTreeRegression()
         elif self.model_name == "Extra-Trees":