Merge pull request #289 from ZJUEarthData/web
feat: distinguish which models can deal with missing values.
SanyHe authored Dec 20, 2023
2 parents 0c3b9de + 5d9cccf commit 8858768
Showing 8 changed files with 93 additions and 36 deletions.
64 changes: 48 additions & 16 deletions geochemistrypi/data_mining/cli_pipeline.py
@@ -10,7 +10,9 @@

from .constants import (
CLASSIFICATION_MODELS,
CLASSIFICATION_MODELS_WITH_MISSING_VALUES,
CLUSTERING_MODELS,
CLUSTERING_MODELS_WITH_MISSING_VALUES,
DECOMPOSITION_MODELS,
FEATURE_SCALING_STRATEGY,
FEATURE_SELECTION_STRATEGY,
@@ -21,6 +23,7 @@
OPTION,
OUTPUT_PATH,
REGRESSION_MODELS,
REGRESSION_MODELS_WITH_MISSING_VALUES,
SECTION,
TEST_DATA_OPTION,
WORKING_PATH,
@@ -32,7 +35,7 @@
from .data.preprocessing import feature_scaler, feature_selector
from .data.statistic import monte_carlo_simulator
from .plot.map_plot import process_world_map
from .plot.statistic_plot import basic_statistic, correlation_plot, distribution_plot, is_imputed, is_null_value, log_distribution_plot, probability_plot, ratio_null_vs_filled
from .plot.statistic_plot import basic_statistic, check_missing_value, correlation_plot, distribution_plot, is_null_value, log_distribution_plot, probability_plot, ratio_null_vs_filled
from .process.classify import ClassificationModelSelection
from .process.cluster import ClusteringModelSelection
from .process.decompose import DecompositionModelSelection
@@ -157,7 +160,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
raise e
experiment = mlflow.get_experiment(experiment_id=new_experiment_id)
# print("Artifact Location: {}".format(experiment.artifact_location))
run_name = Prompt.ask("✨ Run Name", default="Xgboost Algorithm - Test 1")
run_name = Prompt.ask("✨ Run Name", default="XGBoost Algorithm - Test 1")
# run_tag = Prompt.ask("✨ Run Tag Version", default="R - v1.0.0")
# run_description = Prompt.ask("✨ Run Description", default="Use xgboost for GeoPi classification.")
# mlflow.start_run(run_name=run_name, experiment_id=experiment.experiment_id, tags={"version": run_tag, "description": run_description})
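For context on the prompt above: the run name feeds MLflow's tracking API, as the commented-out lines suggest. A minimal sketch of opening a named, tagged run (generic MLflow usage, not the project's exact call):

import mlflow

# Create an experiment and open a named, tagged run under it.
experiment_id = mlflow.create_experiment("GeoPi - Test")
with mlflow.start_run(run_name="XGBoost Algorithm - Test 1", experiment_id=experiment_id, tags={"version": "R - v1.0.0"}):
    mlflow.log_param("algorithm", "XGBoost")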
@@ -219,7 +222,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
print("The Selected Data Set:")
print(data_selected)
clear_output()
print("Basic Statistical Information: ")
print("-*-*- Basic Statistical Information -*-*-")
basic_info(data_selected)
basic_statistic(data_selected)
correlation_plot(data_selected.columns, data_selected)
@@ -232,11 +235,26 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N

# <--- Imputation --->
logger.debug("Imputation")
print("-*-*- Imputation -*-*-")
print("-*-*- Missing Value Check -*-*-")
is_null_value(data_selected)
ratio_null_vs_filled(data_selected)
imputed_flag = is_imputed(data_selected)
missing_value_flag = check_missing_value(data_selected)
clear_output()
if missing_value_flag:
# Ask the user whether to use imputation techniques to deal with the missing values.
print("-*-*- Imputation Option -*-*-")
num2option(OPTION)
imputation_num = limit_num_input(OPTION, SECTION[1], num_input)
if imputation_num == 1:
imputed_flag = True
else:
imputed_flag = False
clear_output()
else:
# Allow the user not to use imputation techniques to deal with the missing values.
# Subsequently, in the mode selection, only the regression, classification and clustering modes are available.
# In the corresponding model selection, only the models that support missing values are available.
imputed_flag = False
if imputed_flag:
print("-*-*- Strategy for Missing Values -*-*-")
num2option(IMPUTING_STRATEGY)
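Taken together, the new block gates imputation on an actual missing-value check instead of assuming one is needed. A minimal, self-contained sketch of that flow (a plain input() stands in for the project's num2option/limit_num_input helpers):

import pandas as pd

# Toy data set with gaps, standing in for data_selected.
data_selected = pd.DataFrame({"SiO2": [45.0, 47.2, None], "Al2O3": [10.1, None, 12.3]})

missing_value_flag = bool(data_selected.isnull().values.any())  # what check_missing_value reports
if missing_value_flag:
    # 1 = impute, 2 = keep the missing values (simplified prompt).
    choice = input("1) Yes, impute  2) No, keep missing values: ").strip() or "1"
    imputed_flag = choice == "1"
else:
    imputed_flag = False
print(f"missing_value_flag={missing_value_flag}, imputed_flag={imputed_flag}")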
@@ -281,8 +299,16 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
# <--- Mode Selection --->
logger.debug("Mode Selection")
print("-*-*- Mode Selection -*-*-")
num2option(MODE_OPTION)
mode_num = limit_num_input(MODE_OPTION, SECTION[2], num_input)
# If the selected data set contains missing values and has not been imputed, only allow the user to choose the regression, classification and clustering modes.
# Otherwise, also allow the user to choose the decomposition mode.
if missing_value_flag and not imputed_flag:
# Delete the decomposition mode because it doesn't support missing values.
MODE_OPTION.remove("Dimensional Reduction")
num2option(MODE_OPTION)
mode_num = limit_num_input(MODE_OPTION, SECTION[2], num_input)
else:
num2option(MODE_OPTION)
mode_num = limit_num_input(MODE_OPTION, SECTION[2], num_input)
clear_output()

# <--- Data Segmentation --->
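One side effect of the new branch worth flagging: list.remove() mutates the imported MODE_OPTION constant in place, so it stays shortened for the rest of the process. A non-mutating sketch (the mode names other than "Dimensional Reduction" are assumptions):

MODE_OPTION = ["Regression", "Classification", "Clustering", "Dimensional Reduction"]
missing_value_flag, imputed_flag = True, False  # example state

# Filter into a copy instead of shrinking the module-level constant.
if missing_value_flag and not imputed_flag:
    available_modes = [mode for mode in MODE_OPTION if mode != "Dimensional Reduction"]
else:
    available_modes = MODE_OPTION
print(available_modes)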
@@ -359,7 +385,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N

# create training data and testing data
print("-*-*- Data Split - Train Set and Test Set -*-*-")
print("Notice: Normally, set 20% of the dataset aside as test set, such as 0.2")
print("Notice: Normally, set 20% of the dataset aside as test set, such as 0.2.")
test_ratio = float_input(default=0.2, prefix=SECTION[1], slogan="@Test Ratio: ")
train_test_data = data_split(X, y, test_ratio)
for key, value in train_test_data.items():
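data_split(X, y, test_ratio) itself is outside this diff; presumably it wraps scikit-learn's splitter and returns a dict, as the loop over train_test_data.items() suggests. A minimal equivalent under that assumption (the dict keys are illustrative):

from sklearn.model_selection import train_test_split

def data_split(X, y, test_ratio: float = 0.2) -> dict:
    # Hold out test_ratio of the rows as the test set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=42)
    return {"X Train": X_train, "X Test": X_test, "Y Train": y_train, "Y Test": y_test}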
@@ -404,14 +430,20 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N

# <--- Model Selection --->
logger.debug("Model Selection")
print("-*-*- Model Selection -*-*-:")
Modes2Models = {1: REGRESSION_MODELS, 2: CLASSIFICATION_MODELS, 3: CLUSTERING_MODELS, 4: DECOMPOSITION_MODELS}
Modes2Initiators = {
1: RegressionModelSelection,
2: ClassificationModelSelection,
3: ClusteringModelSelection,
4: DecompositionModelSelection,
}
print("-*-*- Model Selection -*-*-")
# If the selected data set contains missing values and has not been imputed, only offer the regression, classification and clustering models that support missing values.
# Otherwise, offer the full model list for every mode, including the decomposition models.
if missing_value_flag and not imputed_flag:
Modes2Models = {1: REGRESSION_MODELS_WITH_MISSING_VALUES, 2: CLASSIFICATION_MODELS_WITH_MISSING_VALUES, 3: CLUSTERING_MODELS_WITH_MISSING_VALUES}
Modes2Initiators = {1: RegressionModelSelection, 2: ClassificationModelSelection, 3: ClusteringModelSelection}
else:
Modes2Models = {1: REGRESSION_MODELS, 2: CLASSIFICATION_MODELS, 3: CLUSTERING_MODELS, 4: DECOMPOSITION_MODELS}
Modes2Initiators = {
1: RegressionModelSelection,
2: ClassificationModelSelection,
3: ClusteringModelSelection,
4: DecompositionModelSelection,
}
MODELS = Modes2Models[mode_num]
num2option(MODELS)
# Add the option of all models
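The two dictionaries implement a number-keyed dispatch: the chosen mode number selects both a model list and a selection class. A stripped-down sketch of the pattern (the constructor signature, taking the chosen model name, is an assumption for illustration):

class RegressionModelSelection:
    def __init__(self, model_name: str) -> None:
        self.model_name = model_name

    def activate(self) -> None:
        print(f"Running the regression workflow with {self.model_name}")

Modes2Models = {1: ["XGBoost"]}
Modes2Initiators = {1: RegressionModelSelection}

mode_num, model_num = 1, 1  # e.g. regression mode, first model
model_name = Modes2Models[mode_num][model_num - 1]
Modes2Initiators[mode_num](model_name).activate()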
29 changes: 27 additions & 2 deletions geochemistrypi/data_mining/constants.py
@@ -39,27 +39,52 @@
"Random Forest",
"Extra-Trees",
"Gradient Boosting",
"Xgboost",
"XGBoost",
"Multi-layer Perceptron",
"Lasso Regression",
"Elastic Net",
"SGD Regression",
# "Bagging Regression",
# "Decision Tree",
# "Histogram-based Gradient Boosting",
]
CLASSIFICATION_MODELS = [
"Logistic Regression",
"Support Vector Machine",
"Decision Tree",
"Random Forest",
"Extra-Trees",
"Xgboost",
"XGBoost",
"Multi-layer Perceptron",
"Gradient Boosting",
"K-Nearest Neighbors",
"Stochastic Gradient Descent",
# "Bagging Classification",
# "Decision Tree",
# "Histogram-based Gradient Boosting",
]
CLUSTERING_MODELS = ["KMeans", "DBSCAN"]
DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"]

# Models that can deal with missing values
# Reference: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
REGRESSION_MODELS_WITH_MISSING_VALUES = [
"XGBoost",
# "Bagging Regression",
# "Decision Tree",
# "Histogram-based Gradient Boosting",
]
CLASSIFICATION_MODELS_WITH_MISSING_VALUES = [
"XGBoost",
# "Bagging Classification",
# "Decision Tree",
# "Histogram-based Gradient Boosting",
]
CLUSTERING_MODELS_WITH_MISSING_VALUES = [
# "HDBSCAN"
]


# Special AutoML models
NON_AUTOML_MODELS = ["Linear Regression", "Polynomial Regression"]
RAY_FLAML = ["Multi-layer Perceptron"]
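The linked scikit-learn page lists estimators whose fit and predict accept NaN; XGBoost handles missing values natively as well, routing NaNs to a learned default branch at each tree split, which is why it is the one uncommented entry in the regression and classification lists above. A quick self-contained check via the scikit-learn wrapper:

import numpy as np
from xgboost import XGBRegressor

# Synthetic feature matrix with gaps -- no imputation applied.
X = np.array([[1.0, np.nan], [2.0, 0.5], [np.nan, 1.5], [4.0, 2.0]])
y = np.array([1.0, 2.0, 3.0, 4.0])

model = XGBRegressor(n_estimators=10)
model.fit(X, y)  # trains despite the NaNs
print(model.predict(X))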
10 changes: 5 additions & 5 deletions geochemistrypi/data_mining/model/classification.py
@@ -1211,9 +1211,9 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:


class XGBoostClassification(TreeWorkflowMixin, ClassificationWorkflowBase):
"""The automation workflow of using Xgboost algorithm to make insightful products."""
"""The automation workflow of using XGBoost algorithm to make insightful products."""

name = "Xgboost"
name = "XGBoost"
special_function = ["Feature Importance Diagram"]

# https: // xgboost.readthedocs.io / en / stable / python / python_api.html # module-xgboost.sklearn
@@ -1419,10 +1419,10 @@ def __init__(
References
----------
[1] Xgboost Python API Reference - Scikit-Learn API
[1] XGBoost Python API Reference - Scikit-Learn API
https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
[2] Xgboost API for the scikit-learn wrapper:
[2] XGBoost API for the scikit-learn wrapper:
https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py
"""
super().__init__()
@@ -1526,7 +1526,7 @@ def manual_hyper_parameters(cls) -> Dict:
# def _plot_tree(trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
# # TODO: solve the "failed to execute WindowsPath('dot')" problem; make sure the Graphviz executables are on your system's PATH.
# # Drawing diagrams of the first decision tree of xgboost
# print("-----* Xgboost's Single Tree Diagram *-----")
# print("-----* XGBoost's Single Tree Diagram *-----")
# xgboost.plot_tree(trained_model)
# # node_params = {
# # 'shape': 'box',
8 changes: 4 additions & 4 deletions geochemistrypi/data_mining/model/regression.py
@@ -310,9 +310,9 @@ def special_components(self, **kwargs) -> None:


class XGBoostRegression(TreeWorkflowMixin, RegressionWorkflowBase):
"""The automation workflow of using Xgboost algorithm to make insightful products."""
"""The automation workflow of using XGBoost algorithm to make insightful products."""

name = "Xgboost"
name = "XGBoost"
special_function = ["Feature Importance Diagram"]

# In fact, it's used for type hint in the original xgboost package.
@@ -516,10 +516,10 @@ def __init__(
References
----------
[1] Xgboost Python API Reference - Scikit-Learn API
[1] XGBoost Python API Reference - Scikit-Learn API
https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
[2] Xgboost API for the scikit-learn wrapper:
[2] XGBoost API for the scikit-learn wrapper:
https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py
"""

4 changes: 2 additions & 2 deletions geochemistrypi/data_mining/plot/statistic_plot.py
@@ -37,7 +37,7 @@ def is_null_value(data: pd.DataFrame) -> None:
print("--" * 10)


def is_imputed(data: pd.DataFrame) -> bool:
def check_missing_value(data: pd.DataFrame) -> bool:
"""Check whether the data set has null value or not.
Parameters
@@ -54,7 +54,7 @@ def check_missing_value(data: pd.DataFrame) -> bool:
if flag:
print("Note: you'd better use imputation techniques to deal with the missing values.")
else:
print("Note: you don't need to deal with the missing values, we'll just pass this step!")
print("Note: The provided data set is complete without missing values, we'll just pass this step!")
return flag
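Only the rename and the complete-data message change here; the flag computation sits in the collapsed lines, and presumably reduces to the usual pandas check:

import pandas as pd

df = pd.DataFrame({"SiO2": [45.0, 47.2, None], "Al2O3": [10.1, None, 12.3]})
print(df.isnull().sum())               # missing-value count per column
flag = bool(df.isnull().values.any())  # the boolean the function returns
print(flag)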


4 changes: 2 additions & 2 deletions geochemistrypi/data_mining/process/classify.py
@@ -80,7 +80,7 @@ def activate(
oob_score=hyper_parameters["oob_score"],
max_samples=hyper_parameters["max_samples"],
)
elif self.model_name == "Xgboost":
elif self.model_name == "XGBoost":
hyper_parameters = XGBoostClassification.manual_hyper_parameters()
self.clf_workflow = XGBoostClassification(
n_estimators=hyper_parameters["n_estimators"],
@@ -214,7 +214,7 @@ def activate(
self.clf_workflow = DecisionTreeClassification()
elif self.model_name == "Random Forest":
self.clf_workflow = RandomForestClassification()
elif self.model_name == "Xgboost":
elif self.model_name == "XGBoost":
self.clf_workflow = XGBoostClassification()
elif self.model_name == "Logistic Regression":
self.clf_workflow = LogisticRegressionClassification()
4 changes: 2 additions & 2 deletions geochemistrypi/data_mining/process/regress.py
@@ -59,7 +59,7 @@ def activate(
poly_config, X_train, X_test = self.reg_workflow.poly(X_train, X_test)
self.transformer_config.update(poly_config)
self.reg_workflow.data_upload(X_train=X_train, X_test=X_test)
elif self.model_name == "Xgboost":
elif self.model_name == "XGBoost":
hyper_parameters = XGBoostRegression.manual_hyper_parameters()
self.reg_workflow = XGBoostRegression(
n_estimators=hyper_parameters["n_estimators"],
@@ -228,7 +228,7 @@ def activate(
poly_config, X_train, X_test = self.reg_workflow.poly(X_train, X_test)
self.transformer_config.update(poly_config)
self.reg_workflow.data_upload(X_train=X_train, X_test=X_test)
elif self.model_name == "Xgboost":
elif self.model_name == "XGBoost":
self.reg_workflow = XGBoostRegression()
elif self.model_name == "Decision Tree":
self.reg_workflow = DecisionTreeRegression()
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -33,9 +33,9 @@ dependencies = [
"openpyxl==3.0.10",
"pandas==1.5.2",
"joblib==1.2.0",
"flaml==1.0.14", # required to run Xgboost + FLMAL
"numpy==1.23.5", # required to run Xgboost + FLMAL
"xgboost==1.6.2", # required to run Xgboost + FLAML and be compatible with M2 chip on Mac
"flaml==1.0.14", # required to run XGBoost + FLMAL
"numpy==1.23.5", # required to run XGBoost + FLMAL
"xgboost==1.6.2", # required to run XGBoost + FLAML and be compatible with M2 chip on Mac
"threadpoolctl==3.1.0", # required to draw 3d plot for KMeans
"matplotlib==3.5.2", # required to draw 3d plot for KMeans
"fastapi", # backend framework
