From 75ec881b640222fe4a593366f5a8a11d1ed79d2d Mon Sep 17 00:00:00 2001 From: sanyhe Date: Sun, 5 Nov 2023 21:11:25 +0800 Subject: [PATCH] feat: add feature scaling for unsupervised learning. --- README.md | 12 +++- geochemistrypi/data_mining/cli_pipeline.py | 57 +++++++++++++------ geochemistrypi/data_mining/data/inference.py | 9 +-- .../data_mining/model/classification.py | 8 +-- .../model/func/algo_clustering/_dbscan.py | 4 +- .../model/func/algo_clustering/_kmeans.py | 4 +- .../model/func/algo_decomposition/_pca.py | 2 +- .../data_mining/model/regression.py | 12 ++-- .../data_mining/process/classify.py | 8 +-- geochemistrypi/data_mining/process/regress.py | 8 +-- 10 files changed, 76 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 95b36e9a..2cbb2e92 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,9 @@ The following figure is the simplified overview of Geochemistry π:
The following figure is the frontend-backend separation architecture of Geochemistry:
-![Frontend-backend separation architecture of Geochemistry](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/3b27cbdb-ff50-4fa6-b1d1-4c75b253fdff) +
+ Frontend-backend separation architecture of Geochemistry +
## Quick Installation @@ -149,7 +151,9 @@ The following figure is the system architecture diagram:
The following figure is the customized automated ML pipeline:
-![Customized automated ML pipeline](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/07078b43-30bd-46cf-abad-2da509fae6aa) +
+ Customized automated ML pipeline +
The following figure is the design pattern hierarchical architecture:
@@ -158,7 +162,9 @@ The following figure is the design pattern hierarchical architecture:
The following figure is the storage mechanism:
-![Storage Mechanism](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/401f3429-c44f-4b76-b085-7a9dcc987cde) +
+ Storage Mechanism +
The whole package is under construction and the documentation is progressively evolving. diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py index 3b213887..11df08da 100644 --- a/geochemistrypi/data_mining/cli_pipeline.py +++ b/geochemistrypi/data_mining/cli_pipeline.py @@ -67,7 +67,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N # <-- User Data Loading --> with console.status("[bold green]Data Loading...[/bold green]", spinner="dots"): - sleep(1.5) + sleep(1) if training_data_path: # If the user provides file name, then load the data from the file. data = read_data(file_path=training_data_path, is_own_data=1) @@ -167,7 +167,9 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N print(f"Successfully loading the built-in training data set '{training_data_path}'.") show_data_columns(data.columns) clear_output() - # If the user doesn't provide the inference data path, then use the built-in data. + # If the user doesn't provide the inference data path and the training data is built-in data, + # then use the built-in data as inference data. Otherwise, the inference data is None. + # It means that the user doesn't want to run the model inference. if (not inference_data_path) and is_built_in_data: print("-*-*- Inference Data -*-*-") if built_in_data_num == 1: @@ -238,9 +240,6 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N test="kruskal", confidence=0.05, ) - # print("The statistics test method: Kruskal Wallis Test") - # monte_carlo_simulator(data_processed, data_processed_imputed, sample_size=50, - # iteration=100, test='kruskal', confidence=0.05) probability_plot(data_selected.columns, data_selected, data_selected_imputed) basic_info(data_selected_imputed) basic_statistic(data_selected_imputed) @@ -272,6 +271,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N # divide X and y data set when it is supervised learning logger.debug("Data Split") if mode_num == 1 or mode_num == 2: + # Supervised learning print("-*-*- Data Split - X Set and Y Set -*-*-") print("Divide the processing data set into X (feature value) and Y (target value) respectively.") # create X data set @@ -356,11 +356,32 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N del data_selected_imputed_fe clear_output() else: - # unsupervised learning - feature_scaling_config = {} - feature_selection_config = {} + # Unsupervised learning + # Create X data set without data split because it is unsupervised learning X = data_selected_imputed_fe - X_train = data_selected_imputed_fe + # <--- Feature Scaling ---> + print("-*-*- Feature Scaling on X Set -*-*-") + num2option(OPTION) + is_feature_scaling = limit_num_input(OPTION, SECTION[1], num_input) + if is_feature_scaling == 1: + print("Which strategy do you want to apply?") + num2option(FEATURE_SCALING_STRATEGY) + feature_scaling_num = limit_num_input(FEATURE_SCALING_STRATEGY, SECTION[1], num_input) + feature_scaling_config, X_scaled_np = feature_scaler(X, FEATURE_SCALING_STRATEGY, feature_scaling_num - 1) + X = np2pd(X_scaled_np, X.columns) + del X_scaled_np + print("Data Set After Scaling:") + print(X) + print("Basic Statistical Information: ") + basic_statistic(X) + save_data(X, "X With Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + else: + feature_scaling_config = {} + clear_output() + + feature_selection_config = {} + # Create training data without data split because it is unsupervised learning + X_train = X y, X_test, y_train, y_test = None, None, None, None # <--- Model Selection ---> @@ -401,11 +422,11 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N is_inference = False # If the model is supervised learning, then allow the user to use model inference. if mode_num == 1 or mode_num == 2: - print("-*-*- Feature Engineering on Inference Data -*-*-") + print("-*-*- Feature Engineering on Inference Data -*-*-") is_inference = True selected_columns = X_train.columns - # If feature_engineering_config is not {}, then apply feature engineering with the same operation to the input data. - if feature_engineering_config: + # If feature_engineering_config is not {} and inference_data is not None, then apply feature engineering with the same operation to the input data. + if feature_engineering_config and (inference_data is not None): print("The same feature engineering operation will be applied to the inference data.") new_feature_builder = FeatureConstructor(inference_data) inference_data_fe = new_feature_builder.batch_build(feature_engineering_config) @@ -418,6 +439,8 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N save_data(inference_data_fe, "Inference Data Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) save_data(inference_data_fe_selected, "Inference Data Feature-Engineering Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) clear_output() + else: + inference_data_fe_selected = None # <--- Model Training ---> logger.debug("Model Training") @@ -439,8 +462,9 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N # <--- Model Inference ---> logger.debug("Model Inference") - model_inference(inference_data_fe_selected, is_inference, feature_engineering_config, run, transformer_config, transform_pipeline) - clear_output() + if inference_data_fe_selected is not None: + model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline) + clear_output() else: # Run all models for i in range(len(MODELS) - 1): @@ -465,6 +489,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N # <--- Model Inference ---> logger.debug("Model Inference") - model_inference(inference_data_fe_selected, is_inference, feature_engineering_config, run, transformer_config, transform_pipeline) - clear_output() + if inference_data_fe_selected is not None: + model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline) + clear_output() mlflow.end_run() diff --git a/geochemistrypi/data_mining/data/inference.py b/geochemistrypi/data_mining/data/inference.py index 584df5f1..111bb0c7 100644 --- a/geochemistrypi/data_mining/data/inference.py +++ b/geochemistrypi/data_mining/data/inference.py @@ -108,7 +108,7 @@ def build_transform_pipeline(imputation_config: Dict, feature_scaling_config: Di return transformer_config, transform_pipeline -def model_inference(inference_data: pd.DataFrame, is_inference: bool, feature_engineering_config: Dict, run: object, transformer_config: Dict, transform_pipeline: Optional[object] = None): +def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: object, transformer_config: Dict, transform_pipeline: Optional[object] = None): """Run the model inference. Parameters @@ -119,9 +119,6 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, feature_en is_inference : bool Whether to run the model inference. - feature_engineering_config : Dict - The feature engineering configuration. - run : object The model selection object. @@ -131,8 +128,8 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, feature_en transform_pipeline : Optional[object], optional The transform pipeline object. The default is None. """ - # If inference_data is not None and is_inference is True, then run the model inference. - if (inference_data is not None) and (is_inference is True): + # If is_inference is True, then run the model inference. + if is_inference is True: print("-*-*- Model Inference -*-*-") print("Use the trained model to make predictions on the inference data.") # If transformer_config is not {}, then transform the inference data with the transform pipeline. diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py index 7f883600..27cb95c7 100644 --- a/geochemistrypi/data_mining/model/classification.py +++ b/geochemistrypi/data_mining/model/classification.py @@ -1201,7 +1201,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None: ) -class XgboostClassification(TreeWorkflowMixin, ClassificationWorkflowBase): +class XGBoostClassification(TreeWorkflowMixin, ClassificationWorkflowBase): """The automation workflow of using Xgboost algorithm to make insightful products.""" name = "Xgboost" @@ -1490,7 +1490,7 @@ def __init__( early_stopping_rounds=self.early_stopping_rounds, ) - self.naming = XgboostClassification.name + self.naming = XGBoostClassification.name @property def settings(self) -> Dict: @@ -1538,7 +1538,7 @@ def special_components(self, **kwargs) -> None: # mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, # ) self._plot_feature_importance( - X_train=XgboostClassification.X_train, + X_train=XGBoostClassification.X_train, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -1551,7 +1551,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None: """Invoke all special application functions for this algorithms by FLAML framework.""" GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( - X_train=XgboostClassification.X_train, + X_train=XGBoostClassification.X_train, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py b/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py index 8b557fae..951dca13 100644 --- a/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py +++ b/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py @@ -23,11 +23,11 @@ def dbscan_manual_hyper_parameters() -> Dict: print("Please specify the number of samples. A good starting range could be between 5 and 20, such as 5.") min_samples = num_input(SECTION[2], "Min Samples: ") print("Metric: The metric to use when calculating distance between instances in a feature array.") - print("Please specify the metric to use when calculating distance between instances in a feature array. It is generally recommended to leave it set to euclidean.") + print("Please specify the metric to use when calculating distance between instances in a feature array. It is generally recommended to leave it as 'euclidean'.") metrics = ["euclidean", "manhattan", "chebyshev", "minkowski", "cosine", "correlation"] metric = str_input(metrics, SECTION[2]) print("Algorithm: The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors.") - print("Please specify the algorithm. It is generally recommended to leave it set to auto.") + print("Please specify the algorithm. It is generally recommended to leave it as 'auto'.") algorithms = ["auto", "ball_tree", "kd_tree", "brute"] algorithm = str_input(algorithms, SECTION[2]) print("Leaf Size: Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree.") diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py b/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py index 60c203c1..e15c3219 100644 --- a/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py +++ b/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py @@ -22,7 +22,7 @@ def kmeans_manual_hyper_parameters() -> Dict: print("Please specify the number of clusters for KMeans. A good starting range could be between 2 and 10, such as 4.") n_clusters = num_input(SECTION[2], "N Clusters: ") print("Init: Method for initialization of centroids. The centroids represent the center points of the clusters in the dataset.") - print("Please specify the method for initialization of centroids. It is generally recommended to leave it set to k-means++.") + print("Please specify the method for initialization of centroids. It is generally recommended to leave it as 'k-means++'.") inits = ["k-means++", "random"] init = str_input(inits, SECTION[2]) print("Max Iter: Maximum number of iterations of the k-means algorithm for a single run.") @@ -32,7 +32,7 @@ def kmeans_manual_hyper_parameters() -> Dict: print("Please specify the relative tolerance with regards to inertia to declare convergence. A good starting range could be between 0.0001 and 0.001, such as 0.0005.") tol = float_input(0.0005, SECTION[2], "Tolerance: ") print("Algorithm: The algorithm to use for the computation.") - print("Please specify the algorithm to use for the computation. It is generally recommended to leave it set to auto.") + print("Please specify the algorithm to use for the computation. It is generally recommended to leave it as 'auto'.") print("Auto: selects 'elkan' for dense data and 'full' for sparse data. 'elkan' is generally faster on data with lower dimensionality, while 'full' is faster on data with higher dimensionality") algorithms = ["auto", "full", "elkan"] algorithm = str_input(algorithms, SECTION[2]) diff --git a/geochemistrypi/data_mining/model/func/algo_decomposition/_pca.py b/geochemistrypi/data_mining/model/func/algo_decomposition/_pca.py index 4c2d7aec..13ee9741 100644 --- a/geochemistrypi/data_mining/model/func/algo_decomposition/_pca.py +++ b/geochemistrypi/data_mining/model/func/algo_decomposition/_pca.py @@ -21,7 +21,7 @@ def pca_manual_hyper_parameters() -> Dict: print("Please specify the number of components to retain. A good starting range could be between 2 and 10, such as 4.") n_components = num_input(SECTION[2], "N Components: ") print("SVD Solver: This parameter specifies the algorithm used to perform the singular value decomposition.") - print("Please specify the algorithm. It is generally recommended to leave it set to auto.") + print("Please specify the algorithm. It is generally recommended to leave it as 'auto'.") svd_solvers = ["auto", "full", "arpack", "randomized"] svd_solver = str_input(svd_solvers, SECTION[2]) hyper_parameters = {"n_components": n_components, "svd_solver": svd_solver} diff --git a/geochemistrypi/data_mining/model/regression.py b/geochemistrypi/data_mining/model/regression.py index e726683e..59e06381 100644 --- a/geochemistrypi/data_mining/model/regression.py +++ b/geochemistrypi/data_mining/model/regression.py @@ -308,7 +308,7 @@ def special_components(self, **kwargs) -> None: ) -class XgboostRegression(TreeWorkflowMixin, RegressionWorkflowBase): +class XGBoostRegression(TreeWorkflowMixin, RegressionWorkflowBase): """The automation workflow of using Xgboost algorithm to make insightful products.""" name = "Xgboost" @@ -591,7 +591,7 @@ def __init__( early_stopping_rounds=self.early_stopping_rounds, ) - self.naming = XgboostRegression.name + self.naming = XGBoostRegression.name @property def settings(self) -> Dict: @@ -625,7 +625,7 @@ def special_components(self, **kwargs) -> None: """Invoke all special application functions for this algorithms by Scikit-learn framework.""" GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( - X_train=XgboostRegression.X_train, + X_train=XGBoostRegression.X_train, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -633,7 +633,7 @@ def special_components(self, **kwargs) -> None: mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, ) # self._histograms_feature_weights( - # X=XgboostRegression.X, + # X=XGBoostRegression.X, # trained_model=self.model, # image_config=self.image_config, # algorithm_name=self.naming, @@ -646,7 +646,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None: """Invoke all special application functions for this algorithms by FLAML framework.""" GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( - X_train=XgboostRegression.X_train, + X_train=XGBoostRegression.X_train, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, @@ -654,7 +654,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None: mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, ) # self._histograms_feature_weights( - # X=XgboostRegression.X, + # X=XGBoostRegression.X, # trained_model=self.auto_model, # image_config=self.image_config, # algorithm_name=self.naming, diff --git a/geochemistrypi/data_mining/process/classify.py b/geochemistrypi/data_mining/process/classify.py index 9686544b..7239b26f 100644 --- a/geochemistrypi/data_mining/process/classify.py +++ b/geochemistrypi/data_mining/process/classify.py @@ -15,7 +15,7 @@ MLPClassification, RandomForestClassification, SVMClassification, - XgboostClassification, + XGBoostClassification, ) from ._base import ModelSelectionBase @@ -80,8 +80,8 @@ def activate( max_samples=hyper_parameters["max_samples"], ) elif self.model_name == "Xgboost": - hyper_parameters = XgboostClassification.manual_hyper_parameters() - self.clf_workflow = XgboostClassification( + hyper_parameters = XGBoostClassification.manual_hyper_parameters() + self.clf_workflow = XGBoostClassification( n_estimators=hyper_parameters["n_estimators"], learning_rate=hyper_parameters["learning_rate"], max_depth=hyper_parameters["max_depth"], @@ -196,7 +196,7 @@ def activate( elif self.model_name == "Random Forest": self.clf_workflow = RandomForestClassification() elif self.model_name == "Xgboost": - self.clf_workflow = XgboostClassification() + self.clf_workflow = XGBoostClassification() elif self.model_name == "Logistic Regression": self.clf_workflow = LogisticRegressionClassification() elif self.model_name == "Multi-layer Perceptron": diff --git a/geochemistrypi/data_mining/process/regress.py b/geochemistrypi/data_mining/process/regress.py index efa34375..10083972 100644 --- a/geochemistrypi/data_mining/process/regress.py +++ b/geochemistrypi/data_mining/process/regress.py @@ -21,7 +21,7 @@ RegressionWorkflowBase, SGDRegression, SVMRegression, - XgboostRegression, + XGBoostRegression, ) from ._base import ModelSelectionBase @@ -60,8 +60,8 @@ def activate( self.transformer_config.update(poly_config) self.reg_workflow.data_upload(X_train=X_train, X_test=X_test) elif self.model_name == "Xgboost": - hyper_parameters = XgboostRegression.manual_hyper_parameters() - self.reg_workflow = XgboostRegression( + hyper_parameters = XGBoostRegression.manual_hyper_parameters() + self.reg_workflow = XGBoostRegression( n_estimators=hyper_parameters["n_estimators"], learning_rate=hyper_parameters["learning_rate"], max_depth=hyper_parameters["max_depth"], @@ -229,7 +229,7 @@ def activate( self.transformer_config.update(poly_config) self.reg_workflow.data_upload(X_train=X_train, X_test=X_test) elif self.model_name == "Xgboost": - self.reg_workflow = XgboostRegression() + self.reg_workflow = XGBoostRegression() elif self.model_name == "Decision Tree": self.reg_workflow = DecisionTreeRegression() elif self.model_name == "Extra-Trees":