Merge pull request #289 from ZJUEarthData/web
feat: distinguish which models can deal with missing values.
SanyHe authored Dec 20, 2023
2 parents 0c3b9de + 5d9cccf commit 8858768
Showing 8 changed files with 93 additions and 36 deletions.
64 changes: 48 additions & 16 deletions geochemistrypi/data_mining/cli_pipeline.py
@@ -10,7 +10,9 @@

from .constants import (
CLASSIFICATION_MODELS,
CLASSIFICATION_MODELS_WITH_MISSING_VALUES,
CLUSTERING_MODELS,
CLUSTERING_MODELS_WITH_MISSING_VALUES,
DECOMPOSITION_MODELS,
FEATURE_SCALING_STRATEGY,
FEATURE_SELECTION_STRATEGY,
@@ -21,6 +23,7 @@
OPTION,
OUTPUT_PATH,
REGRESSION_MODELS,
REGRESSION_MODELS_WITH_MISSING_VALUES,
SECTION,
TEST_DATA_OPTION,
WORKING_PATH,
@@ -32,7 +35,7 @@
from .data.preprocessing import feature_scaler, feature_selector
from .data.statistic import monte_carlo_simulator
from .plot.map_plot import process_world_map
from .plot.statistic_plot import basic_statistic, correlation_plot, distribution_plot, is_imputed, is_null_value, log_distribution_plot, probability_plot, ratio_null_vs_filled
from .plot.statistic_plot import basic_statistic, check_missing_value, correlation_plot, distribution_plot, is_null_value, log_distribution_plot, probability_plot, ratio_null_vs_filled
from .process.classify import ClassificationModelSelection
from .process.cluster import ClusteringModelSelection
from .process.decompose import DecompositionModelSelection
@@ -157,7 +160,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
raise e
experiment = mlflow.get_experiment(experiment_id=new_experiment_id)
# print("Artifact Location: {}".format(experiment.artifact_location))
run_name = Prompt.ask("✨ Run Name", default="Xgboost Algorithm - Test 1")
run_name = Prompt.ask("✨ Run Name", default="XGBoost Algorithm - Test 1")
# run_tag = Prompt.ask("✨ Run Tag Version", default="R - v1.0.0")
# run_description = Prompt.ask("✨ Run Description", default="Use xgboost for GeoPi classification.")
# mlflow.start_run(run_name=run_name, experiment_id=experiment.experiment_id, tags={"version": run_tag, "description": run_description})
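For context on the prompt above: the run name feeds MLflow's tracking API, as the commented-out lines suggest. A minimal sketch of opening a named, tagged run (generic MLflow usage, not the project's exact call):

import mlflow

# Create an experiment and open a named, tagged run under it.
experiment_id = mlflow.create_experiment("GeoPi - Test")
with mlflow.start_run(run_name="XGBoost Algorithm - Test 1", experiment_id=experiment_id, tags={"version": "R - v1.0.0"}):
    mlflow.log_param("algorithm", "XGBoost")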
@@ -219,7 +222,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
print("The Selected Data Set:")
print(data_selected)
clear_output()
print("Basic Statistical Information: ")
print("-*-*- Basic Statistical Information -*-*-")
basic_info(data_selected)
basic_statistic(data_selected)
correlation_plot(data_selected.columns, data_selected)
@@ -232,11 +235,26 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N

# <--- Imputation --->
logger.debug("Imputation")
print("-*-*- Imputation -*-*-")
print("-*-*- Missing Value Check -*-*-")
is_null_value(data_selected)
ratio_null_vs_filled(data_selected)
imputed_flag = is_imputed(data_selected)
missing_value_flag = check_missing_value(data_selected)
clear_output()
if missing_value_flag:
# Ask the user whether to use imputation techniques to deal with the missing values.
print("-*-*- Imputation Option -*-*-")
num2option(OPTION)
imputation_num = limit_num_input(OPTION, SECTION[1], num_input)
if imputation_num == 1:
imputed_flag = True
else:
imputed_flag = False
clear_output()
else:
# Allow the user not to use imputation techniques to deal with the missing values.
# Subsequently, in the mode selection, only the regression, classification and clustering modes are available.
# In the corresponding model selection, only the models that support missing values are available.
imputed_flag = False
if imputed_flag:
print("-*-*- Strategy for Missing Values -*-*-")
num2option(IMPUTING_STRATEGY)
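Taken together, the new block gates imputation on an actual missing-value check instead of assuming one is needed. A minimal, self-contained sketch of that flow (a plain input() stands in for the project's num2option/limit_num_input helpers):

import pandas as pd

# Toy data set with gaps, standing in for data_selected.
data_selected = pd.DataFrame({"SiO2": [45.0, 47.2, None], "Al2O3": [10.1, None, 12.3]})

missing_value_flag = bool(data_selected.isnull().values.any())  # what check_missing_value reports
if missing_value_flag:
    # 1 = impute, 2 = keep the missing values (simplified prompt).
    choice = input("1) Yes, impute  2) No, keep missing values: ").strip() or "1"
    imputed_flag = choice == "1"
else:
    imputed_flag = False
print(f"missing_value_flag={missing_value_flag}, imputed_flag={imputed_flag}")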
@@ -281,8 +299,16 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
# <--- Mode Selection --->
logger.debug("Mode Selection")
print("-*-*- Mode Selection -*-*-")
num2option(MODE_OPTION)
mode_num = limit_num_input(MODE_OPTION, SECTION[2], num_input)
# If the selected data set contains missing values and has not been imputed, only allow the user to choose the regression, classification and clustering modes.
# Otherwise, also allow the user to choose the decomposition mode.
if missing_value_flag and not imputed_flag:
# Delete the decomposition mode because it doesn't support missing values.
MODE_OPTION.remove("Dimensional Reduction")
num2option(MODE_OPTION)
mode_num = limit_num_input(MODE_OPTION, SECTION[2], num_input)
else:
num2option(MODE_OPTION)
mode_num = limit_num_input(MODE_OPTION, SECTION[2], num_input)
clear_output()

# <--- Data Segmentation --->
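One side effect of the new branch worth flagging: list.remove() mutates the imported MODE_OPTION constant in place, so it stays shortened for the rest of the process. A non-mutating sketch (the mode names other than "Dimensional Reduction" are assumptions):

MODE_OPTION = ["Regression", "Classification", "Clustering", "Dimensional Reduction"]
missing_value_flag, imputed_flag = True, False  # example state

# Filter into a copy instead of shrinking the module-level constant.
if missing_value_flag and not imputed_flag:
    available_modes = [mode for mode in MODE_OPTION if mode != "Dimensional Reduction"]
else:
    available_modes = MODE_OPTION
print(available_modes)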
@@ -359,7 +385,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N

# create training data and testing data
print("-*-*- Data Split - Train Set and Test Set -*-*-")
print("Notice: Normally, set 20% of the dataset aside as test set, such as 0.2")
print("Notice: Normally, set 20% of the dataset aside as test set, such as 0.2.")
test_ratio = float_input(default=0.2, prefix=SECTION[1], slogan="@Test Ratio: ")
train_test_data = data_split(X, y, test_ratio)
for key, value in train_test_data.items():
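data_split(X, y, test_ratio) itself is outside this diff; presumably it wraps scikit-learn's splitter and returns a dict, as the loop over train_test_data.items() suggests. A minimal equivalent under that assumption (the dict keys are illustrative):

from sklearn.model_selection import train_test_split

def data_split(X, y, test_ratio: float = 0.2) -> dict:
    # Hold out test_ratio of the rows as the test set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=42)
    return {"X Train": X_train, "X Test": X_test, "Y Train": y_train, "Y Test": y_test}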
@@ -404,14 +430,20 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N

# <--- Model Selection --->
logger.debug("Model Selection")
print("-*-*- Model Selection -*-*-:")
Modes2Models = {1: REGRESSION_MODELS, 2: CLASSIFICATION_MODELS, 3: CLUSTERING_MODELS, 4: DECOMPOSITION_MODELS}
Modes2Initiators = {
1: RegressionModelSelection,
2: ClassificationModelSelection,
3: ClusteringModelSelection,
4: DecompositionModelSelection,
}
print("-*-*- Model Selection -*-*-")
# If the selected data set contains missing values and has not been imputed, only offer the regression, classification and clustering models that support missing values.
# Otherwise, offer the full model list for every mode, including the decomposition models.
if missing_value_flag and not imputed_flag:
Modes2Models = {1: REGRESSION_MODELS_WITH_MISSING_VALUES, 2: CLASSIFICATION_MODELS_WITH_MISSING_VALUES, 3: CLUSTERING_MODELS_WITH_MISSING_VALUES}
Modes2Initiators = {1: RegressionModelSelection, 2: ClassificationModelSelection, 3: ClusteringModelSelection}
else:
Modes2Models = {1: REGRESSION_MODELS, 2: CLASSIFICATION_MODELS, 3: CLUSTERING_MODELS, 4: DECOMPOSITION_MODELS}
Modes2Initiators = {
1: RegressionModelSelection,
2: ClassificationModelSelection,
3: ClusteringModelSelection,
4: DecompositionModelSelection,
}
MODELS = Modes2Models[mode_num]
num2option(MODELS)
# Add the option of all models
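The two dictionaries implement a number-keyed dispatch: the chosen mode number selects both a model list and a selection class. A stripped-down sketch of the pattern (the constructor signature, taking the chosen model name, is an assumption for illustration):

class RegressionModelSelection:
    def __init__(self, model_name: str) -> None:
        self.model_name = model_name

    def activate(self) -> None:
        print(f"Running the regression workflow with {self.model_name}")

Modes2Models = {1: ["XGBoost"]}
Modes2Initiators = {1: RegressionModelSelection}

mode_num, model_num = 1, 1  # e.g. regression mode, first model
model_name = Modes2Models[mode_num][model_num - 1]
Modes2Initiators[mode_num](model_name).activate()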
29 changes: 27 additions & 2 deletions geochemistrypi/data_mining/constants.py
@@ -39,27 +39,52 @@
"Random Forest",
"Extra-Trees",
"Gradient Boosting",
"Xgboost",
"XGBoost",
"Multi-layer Perceptron",
"Lasso Regression",
"Elastic Net",
"SGD Regression",
# "Bagging Regression",
# "Decision Tree",
# "Histogram-based Gradient Boosting",
]
CLASSIFICATION_MODELS = [
"Logistic Regression",
"Support Vector Machine",
"Decision Tree",
"Random Forest",
"Extra-Trees",
"Xgboost",
"XGBoost",
"Multi-layer Perceptron",
"Gradient Boosting",
"K-Nearest Neighbors",
"Stochastic Gradient Descent",
# "Bagging Classification",
# "Decision Tree",
# "Histogram-based Gradient Boosting",
]
CLUSTERING_MODELS = ["KMeans", "DBSCAN"]
DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"]

# Models that can deal with missing values
# Reference: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
REGRESSION_MODELS_WITH_MISSING_VALUES = [
"XGBoost",
# "Bagging Regression",
# "Decision Tree",
# "Histogram-based Gradient Boosting",
]
CLASSIFICATION_MODELS_WITH_MISSING_VALUES = [
"XGBoost",
# "Bagging Classification",
# "Decision Tree",
# "Histogram-based Gradient Boosting",
]
CLUSTERING_MODELS_WITH_MISSING_VALUES = [
# "HDBSCAN"
]


# Special AutoML models
NON_AUTOML_MODELS = ["Linear Regression", "Polynomial Regression"]
RAY_FLAML = ["Multi-layer Perceptron"]
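The linked scikit-learn page lists estimators whose fit and predict accept NaN; XGBoost handles missing values natively as well, routing NaNs to a learned default branch at each tree split, which is why it is the one uncommented entry in the regression and classification lists above. A quick self-contained check via the scikit-learn wrapper:

import numpy as np
from xgboost import XGBRegressor

# Synthetic feature matrix with gaps -- no imputation applied.
X = np.array([[1.0, np.nan], [2.0, 0.5], [np.nan, 1.5], [4.0, 2.0]])
y = np.array([1.0, 2.0, 3.0, 4.0])

model = XGBRegressor(n_estimators=10)
model.fit(X, y)  # trains despite the NaNs
print(model.predict(X))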
10 changes: 5 additions & 5 deletions geochemistrypi/data_mining/model/classification.py
@@ -1211,9 +1211,9 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:


class XGBoostClassification(TreeWorkflowMixin, ClassificationWorkflowBase):
"""The automation workflow of using Xgboost algorithm to make insightful products."""
"""The automation workflow of using XGBoost algorithm to make insightful products."""

name = "Xgboost"
name = "XGBoost"
special_function = ["Feature Importance Diagram"]

# https: // xgboost.readthedocs.io / en / stable / python / python_api.html # module-xgboost.sklearn
@@ -1419,10 +1419,10 @@ def __init__(
References
----------
[1] Xgboost Python API Reference - Scikit-Learn API
[1] XGBoost Python API Reference - Scikit-Learn API
https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
[2] Xgboost API for the scikit-learn wrapper:
[2] XGBoost API for the scikit-learn wrapper:
https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py
"""
super().__init__()
@@ -1526,7 +1526,7 @@ def manual_hyper_parameters(cls) -> Dict:
# def _plot_tree(trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
# # TODO: solve the "failed to execute WindowsPath('dot')" problem; make sure the Graphviz executables are on your system's PATH.
# # Drawing diagrams of the first decision tree of xgboost
# print("-----* Xgboost's Single Tree Diagram *-----")
# print("-----* XGBoost's Single Tree Diagram *-----")
# xgboost.plot_tree(trained_model)
# # node_params = {
# # 'shape': 'box',
8 changes: 4 additions & 4 deletions geochemistrypi/data_mining/model/regression.py
@@ -310,9 +310,9 @@ def special_components(self, **kwargs) -> None:


class XGBoostRegression(TreeWorkflowMixin, RegressionWorkflowBase):
"""The automation workflow of using Xgboost algorithm to make insightful products."""
"""The automation workflow of using XGBoost algorithm to make insightful products."""

name = "Xgboost"
name = "XGBoost"
special_function = ["Feature Importance Diagram"]

# In fact, it's used for type hint in the original xgboost package.
@@ -516,10 +516,10 @@ def __init__(
References
----------
[1] Xgboost Python API Reference - Scikit-Learn API
[1] XGBoost Python API Reference - Scikit-Learn API
https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
[2] Xgboost API for the scikit-learn wrapper:
[2] XGBoost API for the scikit-learn wrapper:
https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py
"""

4 changes: 2 additions & 2 deletions geochemistrypi/data_mining/plot/statistic_plot.py
@@ -37,7 +37,7 @@ def is_null_value(data: pd.DataFrame) -> None:
print("--" * 10)


def is_imputed(data: pd.DataFrame) -> bool:
def check_missing_value(data: pd.DataFrame) -> bool:
"""Check whether the data set has null value or not.
Parameters
@@ -54,7 +54,7 @@ def check_missing_value(data: pd.DataFrame) -> bool:
if flag:
print("Note: you'd better use imputation techniques to deal with the missing values.")
else:
print("Note: you don't need to deal with the missing values, we'll just pass this step!")
print("Note: The provided data set is complete without missing values, we'll just pass this step!")
return flag
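Only the rename and the complete-data message change here; the flag computation sits in the collapsed lines, and presumably reduces to the usual pandas check:

import pandas as pd

df = pd.DataFrame({"SiO2": [45.0, 47.2, None], "Al2O3": [10.1, None, 12.3]})
print(df.isnull().sum())               # missing-value count per column
flag = bool(df.isnull().values.any())  # the boolean the function returns
print(flag)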


4 changes: 2 additions & 2 deletions geochemistrypi/data_mining/process/classify.py
@@ -80,7 +80,7 @@ def activate(
oob_score=hyper_parameters["oob_score"],
max_samples=hyper_parameters["max_samples"],
)
elif self.model_name == "Xgboost":
elif self.model_name == "XGBoost":
hyper_parameters = XGBoostClassification.manual_hyper_parameters()
self.clf_workflow = XGBoostClassification(
n_estimators=hyper_parameters["n_estimators"],
@@ -214,7 +214,7 @@ def activate(
self.clf_workflow = DecisionTreeClassification()
elif self.model_name == "Random Forest":
self.clf_workflow = RandomForestClassification()
elif self.model_name == "Xgboost":
elif self.model_name == "XGBoost":
self.clf_workflow = XGBoostClassification()
elif self.model_name == "Logistic Regression":
self.clf_workflow = LogisticRegressionClassification()
4 changes: 2 additions & 2 deletions geochemistrypi/data_mining/process/regress.py
@@ -59,7 +59,7 @@ def activate(
poly_config, X_train, X_test = self.reg_workflow.poly(X_train, X_test)
self.transformer_config.update(poly_config)
self.reg_workflow.data_upload(X_train=X_train, X_test=X_test)
elif self.model_name == "Xgboost":
elif self.model_name == "XGBoost":
hyper_parameters = XGBoostRegression.manual_hyper_parameters()
self.reg_workflow = XGBoostRegression(
n_estimators=hyper_parameters["n_estimators"],
@@ -228,7 +228,7 @@ def activate(
poly_config, X_train, X_test = self.reg_workflow.poly(X_train, X_test)
self.transformer_config.update(poly_config)
self.reg_workflow.data_upload(X_train=X_train, X_test=X_test)
elif self.model_name == "Xgboost":
elif self.model_name == "XGBoost":
self.reg_workflow = XGBoostRegression()
elif self.model_name == "Decision Tree":
self.reg_workflow = DecisionTreeRegression()
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -33,9 +33,9 @@ dependencies = [
"openpyxl==3.0.10",
"pandas==1.5.2",
"joblib==1.2.0",
"flaml==1.0.14", # required to run Xgboost + FLMAL
"numpy==1.23.5", # required to run Xgboost + FLMAL
"xgboost==1.6.2", # required to run Xgboost + FLAML and be compatible with M2 chip on Mac
"flaml==1.0.14", # required to run XGBoost + FLMAL
"numpy==1.23.5", # required to run XGBoost + FLMAL
"xgboost==1.6.2", # required to run XGBoost + FLAML and be compatible with M2 chip on Mac
"threadpoolctl==3.1.0", # required to draw 3d plot for KMeans
"matplotlib==3.5.2", # required to draw 3d plot for KMeans
"fastapi", # backend framework
