diff --git a/README.md b/README.md
index 95b36e9a..2cbb2e92 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,9 @@ The following figure is the simplified overview of Geochemistry π:
The following figure is the frontend-backend separation architecture of Geochemistry:
-![Frontend-backend separation architecture of Geochemistry](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/3b27cbdb-ff50-4fa6-b1d1-4c75b253fdff)
+
+
+
## Quick Installation
@@ -149,7 +151,9 @@ The following figure is the system architecture diagram:
The following figure is the customized automated ML pipeline:
-![Customized automated ML pipeline](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/07078b43-30bd-46cf-abad-2da509fae6aa)
+
+
+
The following figure is the design pattern hierarchical architecture:
@@ -158,7 +162,9 @@ The following figure is the design pattern hierarchical architecture:
The following figure is the storage mechanism:
-![Storage Mechanism](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/401f3429-c44f-4b76-b085-7a9dcc987cde)
+
+
+
The whole package is under construction and the documentation is progressively evolving.
diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py
index 3b213887..11df08da 100644
--- a/geochemistrypi/data_mining/cli_pipeline.py
+++ b/geochemistrypi/data_mining/cli_pipeline.py
@@ -67,7 +67,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
# <-- User Data Loading -->
with console.status("[bold green]Data Loading...[/bold green]", spinner="dots"):
- sleep(1.5)
+ sleep(1)
if training_data_path:
# If the user provides file name, then load the data from the file.
data = read_data(file_path=training_data_path, is_own_data=1)
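
The spinner in this hunk comes from Rich's `Console.status` context manager; the only change is trimming the cosmetic delay. A minimal standalone sketch of the same pattern (the `console` object mirrors the pipeline's):

```python
from time import sleep

from rich.console import Console

console = Console()
# Render an animated "dots" spinner while the block runs.
with console.status("[bold green]Data Loading...[/bold green]", spinner="dots"):
    sleep(1)  # placeholder for the actual data read
```
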
@@ -167,7 +167,9 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
print(f"Successfully loading the built-in training data set '{training_data_path}'.")
show_data_columns(data.columns)
clear_output()
- # If the user doesn't provide the inference data path, then use the built-in data.
+        # If the user doesn't provide the inference data path and the training data is built-in,
+        # then use the built-in data as the inference data. Otherwise, the inference data stays None,
+        # which means the user doesn't want to run model inference.
if (not inference_data_path) and is_built_in_data:
print("-*-*- Inference Data -*-*-")
if built_in_data_num == 1:
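
The rewritten comment describes a three-way rule for choosing the inference data. A minimal sketch of that rule as a standalone function; the wrapper name is hypothetical:

```python
def resolve_inference_data(inference_data_path, is_built_in_data, built_in_data):
    """Pick the inference data set according to the rule above (hypothetical helper)."""
    if inference_data_path:
        # The user supplied an explicit inference file.
        return read_data(file_path=inference_data_path, is_own_data=1)
    if is_built_in_data:
        # Training used a built-in data set, so reuse it for inference.
        return built_in_data
    # Neither a path nor built-in data: model inference is skipped.
    return None
```
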
@@ -238,9 +240,6 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
test="kruskal",
confidence=0.05,
)
- # print("The statistics test method: Kruskal Wallis Test")
- # monte_carlo_simulator(data_processed, data_processed_imputed, sample_size=50,
- # iteration=100, test='kruskal', confidence=0.05)
probability_plot(data_selected.columns, data_selected, data_selected_imputed)
basic_info(data_selected_imputed)
basic_statistic(data_selected_imputed)
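
The retained `monte_carlo_simulator` call is configured with `test="kruskal"`, i.e. a Kruskal-Wallis test; a minimal sketch of that test with SciPy (the samples are illustrative):

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
sample_a = rng.normal(loc=0.0, size=50)
sample_b = rng.normal(loc=0.5, size=50)

# Null hypothesis: both samples are drawn from the same distribution.
statistic, p_value = stats.kruskal(sample_a, sample_b)
print(f"H = {statistic:.3f}, p = {p_value:.3f}")  # reject the null when p < 0.05
```
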
@@ -272,6 +271,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
# divide X and y data set when it is supervised learning
logger.debug("Data Split")
if mode_num == 1 or mode_num == 2:
+ # Supervised learning
print("-*-*- Data Split - X Set and Y Set -*-*-")
print("Divide the processing data set into X (feature value) and Y (target value) respectively.")
# create X data set
@@ -356,11 +356,32 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
del data_selected_imputed_fe
clear_output()
else:
- # unsupervised learning
- feature_scaling_config = {}
- feature_selection_config = {}
+ # Unsupervised learning
+        # Create the X data set without a data split because it is unsupervised learning
X = data_selected_imputed_fe
- X_train = data_selected_imputed_fe
+ # <--- Feature Scaling --->
+ print("-*-*- Feature Scaling on X Set -*-*-")
+ num2option(OPTION)
+ is_feature_scaling = limit_num_input(OPTION, SECTION[1], num_input)
+ if is_feature_scaling == 1:
+ print("Which strategy do you want to apply?")
+ num2option(FEATURE_SCALING_STRATEGY)
+ feature_scaling_num = limit_num_input(FEATURE_SCALING_STRATEGY, SECTION[1], num_input)
+ feature_scaling_config, X_scaled_np = feature_scaler(X, FEATURE_SCALING_STRATEGY, feature_scaling_num - 1)
+ X = np2pd(X_scaled_np, X.columns)
+ del X_scaled_np
+ print("Data Set After Scaling:")
+ print(X)
+ print("Basic Statistical Information: ")
+ basic_statistic(X)
+ save_data(X, "X With Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ else:
+ feature_scaling_config = {}
+ clear_output()
+
+ feature_selection_config = {}
+        # Use the whole X set as the training data since unsupervised learning needs no data split
+ X_train = X
y, X_test, y_train, y_test = None, None, None, None
# <--- Model Selection --->
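
The new scaling branch mirrors the supervised path: scale, convert back to a DataFrame, report, and save. A minimal sketch of what `feature_scaler` plus `np2pd` presumably amount to, using scikit-learn (the strategy names are illustrative, not necessarily the entries of `FEATURE_SCALING_STRATEGY`):

```python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

SCALERS = {"Min-max Scaling": MinMaxScaler, "Standardization": StandardScaler}

def scale_features(X: pd.DataFrame, strategy: str):
    """Scale X and return (config, scaled DataFrame with the original columns)."""
    scaler = SCALERS[strategy]()
    X_scaled_np = scaler.fit_transform(X)  # NumPy array
    X_scaled = pd.DataFrame(X_scaled_np, columns=X.columns, index=X.index)  # np2pd equivalent
    config = {type(scaler).__name__: scaler.get_params()}
    return config, X_scaled
```
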
@@ -401,11 +422,11 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
is_inference = False
# If the model is supervised learning, then allow the user to use model inference.
if mode_num == 1 or mode_num == 2:
- print("-*-*- Feature Engineering on Inference Data -*-*-")
+ print("-*-*- Feature Engineering on Inference Data -*-*-")
is_inference = True
selected_columns = X_train.columns
- # If feature_engineering_config is not {}, then apply feature engineering with the same operation to the input data.
- if feature_engineering_config:
+            # If feature_engineering_config is not {} and inference_data is not None, then apply the same feature engineering operations to the inference data.
+ if feature_engineering_config and (inference_data is not None):
print("The same feature engineering operation will be applied to the inference data.")
new_feature_builder = FeatureConstructor(inference_data)
inference_data_fe = new_feature_builder.batch_build(feature_engineering_config)
@@ -418,6 +439,8 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
save_data(inference_data_fe, "Inference Data Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(inference_data_fe_selected, "Inference Data Feature-Engineering Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()
+ else:
+ inference_data_fe_selected = None
# <--- Model Training --->
logger.debug("Model Training")
@@ -439,8 +462,9 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
# <--- Model Inference --->
logger.debug("Model Inference")
- model_inference(inference_data_fe_selected, is_inference, feature_engineering_config, run, transformer_config, transform_pipeline)
- clear_output()
+ if inference_data_fe_selected is not None:
+ model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline)
+ clear_output()
else:
# Run all models
for i in range(len(MODELS) - 1):
@@ -465,6 +489,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
# <--- Model Inference --->
logger.debug("Model Inference")
- model_inference(inference_data_fe_selected, is_inference, feature_engineering_config, run, transformer_config, transform_pipeline)
- clear_output()
+ if inference_data_fe_selected is not None:
+ model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline)
+ clear_output()
mlflow.end_run()
diff --git a/geochemistrypi/data_mining/data/inference.py b/geochemistrypi/data_mining/data/inference.py
index 584df5f1..111bb0c7 100644
--- a/geochemistrypi/data_mining/data/inference.py
+++ b/geochemistrypi/data_mining/data/inference.py
@@ -108,7 +108,7 @@ def build_transform_pipeline(imputation_config: Dict, feature_scaling_config: Di
return transformer_config, transform_pipeline
-def model_inference(inference_data: pd.DataFrame, is_inference: bool, feature_engineering_config: Dict, run: object, transformer_config: Dict, transform_pipeline: Optional[object] = None):
+def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: object, transformer_config: Dict, transform_pipeline: Optional[object] = None):
"""Run the model inference.
Parameters
@@ -119,9 +119,6 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, feature_en
is_inference : bool
Whether to run the model inference.
- feature_engineering_config : Dict
- The feature engineering configuration.
-
run : object
The model selection object.
@@ -131,8 +128,8 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, feature_en
transform_pipeline : Optional[object], optional
The transform pipeline object. The default is None.
"""
- # If inference_data is not None and is_inference is True, then run the model inference.
- if (inference_data is not None) and (is_inference is True):
+ # If is_inference is True, then run the model inference.
+ if is_inference is True:
print("-*-*- Model Inference -*-*-")
print("Use the trained model to make predictions on the inference data.")
# If transformer_config is not {}, then transform the inference data with the transform pipeline.
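
Dropping the unused `feature_engineering_config` parameter and moving the None check to the caller tightens the contract: `model_inference` now only consults the `is_inference` flag. A condensed sketch of the new calling convention, with the surrounding pipeline objects assumed to exist:

```python
# Caller side (cli_pipeline.py): guard first, then clear the screen.
if inference_data_fe_selected is not None:
    model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline)
    clear_output()

# Callee side (inference.py): the data frame is assumed valid here.
def model_inference(inference_data, is_inference, run, transformer_config, transform_pipeline=None):
    if is_inference:
        ...  # transform with transform_pipeline when transformer_config is non-empty, then predict
```
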
diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py
index 7f883600..27cb95c7 100644
--- a/geochemistrypi/data_mining/model/classification.py
+++ b/geochemistrypi/data_mining/model/classification.py
@@ -1201,7 +1201,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:
)
-class XgboostClassification(TreeWorkflowMixin, ClassificationWorkflowBase):
+class XGBoostClassification(TreeWorkflowMixin, ClassificationWorkflowBase):
"""The automation workflow of using Xgboost algorithm to make insightful products."""
name = "Xgboost"
@@ -1490,7 +1490,7 @@ def __init__(
early_stopping_rounds=self.early_stopping_rounds,
)
- self.naming = XgboostClassification.name
+ self.naming = XGBoostClassification.name
@property
def settings(self) -> Dict:
@@ -1538,7 +1538,7 @@ def special_components(self, **kwargs) -> None:
# mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
# )
self._plot_feature_importance(
- X_train=XgboostClassification.X_train,
+ X_train=XGBoostClassification.X_train,
trained_model=self.model,
image_config=self.image_config,
algorithm_name=self.naming,
@@ -1551,7 +1551,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:
"""Invoke all special application functions for this algorithms by FLAML framework."""
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
self._plot_feature_importance(
- X_train=XgboostClassification.X_train,
+ X_train=XGBoostClassification.X_train,
trained_model=self.auto_model,
image_config=self.image_config,
algorithm_name=self.naming,
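
The rename to `XGBoostClassification` matches the library's own capitalization, while `name = "Xgboost"` keeps the user-facing menu string stable. For context, a minimal sketch of the kind of plot `_plot_feature_importance` wraps, using the real `xgboost` plotting helper on toy data:

```python
import matplotlib.pyplot as plt
import xgboost
from sklearn.datasets import load_iris

X_train, y_train = load_iris(return_X_y=True, as_frame=True)
model = xgboost.XGBClassifier(n_estimators=50, learning_rate=0.1, max_depth=3)
model.fit(X_train, y_train)

xgboost.plot_importance(model)  # horizontal bar chart of per-feature scores
plt.show()
```
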
diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py b/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py
index 8b557fae..951dca13 100644
--- a/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py
+++ b/geochemistrypi/data_mining/model/func/algo_clustering/_dbscan.py
@@ -23,11 +23,11 @@ def dbscan_manual_hyper_parameters() -> Dict:
print("Please specify the number of samples. A good starting range could be between 5 and 20, such as 5.")
min_samples = num_input(SECTION[2], "Min Samples: ")
print("Metric: The metric to use when calculating distance between instances in a feature array.")
- print("Please specify the metric to use when calculating distance between instances in a feature array. It is generally recommended to leave it set to euclidean.")
+ print("Please specify the metric to use when calculating distance between instances in a feature array. It is generally recommended to leave it as 'euclidean'.")
metrics = ["euclidean", "manhattan", "chebyshev", "minkowski", "cosine", "correlation"]
metric = str_input(metrics, SECTION[2])
print("Algorithm: The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors.")
- print("Please specify the algorithm. It is generally recommended to leave it set to auto.")
+ print("Please specify the algorithm. It is generally recommended to leave it as 'auto'.")
algorithms = ["auto", "ball_tree", "kd_tree", "brute"]
algorithm = str_input(algorithms, SECTION[2])
print("Leaf Size: Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree.")
diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py b/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py
index 60c203c1..e15c3219 100644
--- a/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py
+++ b/geochemistrypi/data_mining/model/func/algo_clustering/_kmeans.py
@@ -22,7 +22,7 @@ def kmeans_manual_hyper_parameters() -> Dict:
print("Please specify the number of clusters for KMeans. A good starting range could be between 2 and 10, such as 4.")
n_clusters = num_input(SECTION[2], "N Clusters: ")
print("Init: Method for initialization of centroids. The centroids represent the center points of the clusters in the dataset.")
- print("Please specify the method for initialization of centroids. It is generally recommended to leave it set to k-means++.")
+ print("Please specify the method for initialization of centroids. It is generally recommended to leave it as 'k-means++'.")
inits = ["k-means++", "random"]
init = str_input(inits, SECTION[2])
print("Max Iter: Maximum number of iterations of the k-means algorithm for a single run.")
@@ -32,7 +32,7 @@ def kmeans_manual_hyper_parameters() -> Dict:
print("Please specify the relative tolerance with regards to inertia to declare convergence. A good starting range could be between 0.0001 and 0.001, such as 0.0005.")
tol = float_input(0.0005, SECTION[2], "Tolerance: ")
print("Algorithm: The algorithm to use for the computation.")
- print("Please specify the algorithm to use for the computation. It is generally recommended to leave it set to auto.")
+ print("Please specify the algorithm to use for the computation. It is generally recommended to leave it as 'auto'.")
print("Auto: selects 'elkan' for dense data and 'full' for sparse data. 'elkan' is generally faster on data with lower dimensionality, while 'full' is faster on data with higher dimensionality")
algorithms = ["auto", "full", "elkan"]
algorithm = str_input(algorithms, SECTION[2])
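
Likewise, a minimal sketch of the KMeans estimator built from these answers. One caveat: scikit-learn 1.1 deprecated `algorithm="auto"`/`"full"` in favor of `"lloyd"`/`"elkan"`, so the option list above matches older releases:

```python
import numpy as np
from sklearn.cluster import KMeans

X = np.random.default_rng(0).normal(size=(100, 2))  # stand-in for the scaled feature set
# Suggested starting values from the prompts: 4 clusters, k-means++ init.
model = KMeans(n_clusters=4, init="k-means++", max_iter=300, tol=0.0005)
labels = model.fit_predict(X)
```
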
diff --git a/geochemistrypi/data_mining/model/func/algo_decomposition/_pca.py b/geochemistrypi/data_mining/model/func/algo_decomposition/_pca.py
index 4c2d7aec..13ee9741 100644
--- a/geochemistrypi/data_mining/model/func/algo_decomposition/_pca.py
+++ b/geochemistrypi/data_mining/model/func/algo_decomposition/_pca.py
@@ -21,7 +21,7 @@ def pca_manual_hyper_parameters() -> Dict:
print("Please specify the number of components to retain. A good starting range could be between 2 and 10, such as 4.")
n_components = num_input(SECTION[2], "N Components: ")
print("SVD Solver: This parameter specifies the algorithm used to perform the singular value decomposition.")
- print("Please specify the algorithm. It is generally recommended to leave it set to auto.")
+ print("Please specify the algorithm. It is generally recommended to leave it as 'auto'.")
svd_solvers = ["auto", "full", "arpack", "randomized"]
svd_solver = str_input(svd_solvers, SECTION[2])
hyper_parameters = {"n_components": n_components, "svd_solver": svd_solver}
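
And the matching PCA sketch; summing `explained_variance_ratio_` is a quick check that the retained components capture enough variance:

```python
import numpy as np
from sklearn.decomposition import PCA

X = np.random.default_rng(0).normal(size=(100, 8))  # stand-in for the feature matrix
pca = PCA(n_components=4, svd_solver="auto")
X_reduced = pca.fit_transform(X)
print(pca.explained_variance_ratio_.sum())  # share of variance kept by 4 components
```
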
diff --git a/geochemistrypi/data_mining/model/regression.py b/geochemistrypi/data_mining/model/regression.py
index e726683e..59e06381 100644
--- a/geochemistrypi/data_mining/model/regression.py
+++ b/geochemistrypi/data_mining/model/regression.py
@@ -308,7 +308,7 @@ def special_components(self, **kwargs) -> None:
)
-class XgboostRegression(TreeWorkflowMixin, RegressionWorkflowBase):
+class XGBoostRegression(TreeWorkflowMixin, RegressionWorkflowBase):
"""The automation workflow of using Xgboost algorithm to make insightful products."""
name = "Xgboost"
@@ -591,7 +591,7 @@ def __init__(
early_stopping_rounds=self.early_stopping_rounds,
)
- self.naming = XgboostRegression.name
+ self.naming = XGBoostRegression.name
@property
def settings(self) -> Dict:
@@ -625,7 +625,7 @@ def special_components(self, **kwargs) -> None:
"""Invoke all special application functions for this algorithms by Scikit-learn framework."""
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
self._plot_feature_importance(
- X_train=XgboostRegression.X_train,
+ X_train=XGBoostRegression.X_train,
trained_model=self.model,
image_config=self.image_config,
algorithm_name=self.naming,
@@ -633,7 +633,7 @@ def special_components(self, **kwargs) -> None:
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)
# self._histograms_feature_weights(
- # X=XgboostRegression.X,
+ # X=XGBoostRegression.X,
# trained_model=self.model,
# image_config=self.image_config,
# algorithm_name=self.naming,
@@ -646,7 +646,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:
"""Invoke all special application functions for this algorithms by FLAML framework."""
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
self._plot_feature_importance(
- X_train=XgboostRegression.X_train,
+ X_train=XGBoostRegression.X_train,
trained_model=self.auto_model,
image_config=self.image_config,
algorithm_name=self.naming,
@@ -654,7 +654,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)
# self._histograms_feature_weights(
- # X=XgboostRegression.X,
+ # X=XGBoostRegression.X,
# trained_model=self.auto_model,
# image_config=self.image_config,
# algorithm_name=self.naming,
diff --git a/geochemistrypi/data_mining/process/classify.py b/geochemistrypi/data_mining/process/classify.py
index 9686544b..7239b26f 100644
--- a/geochemistrypi/data_mining/process/classify.py
+++ b/geochemistrypi/data_mining/process/classify.py
@@ -15,7 +15,7 @@
MLPClassification,
RandomForestClassification,
SVMClassification,
- XgboostClassification,
+ XGBoostClassification,
)
from ._base import ModelSelectionBase
@@ -80,8 +80,8 @@ def activate(
max_samples=hyper_parameters["max_samples"],
)
elif self.model_name == "Xgboost":
- hyper_parameters = XgboostClassification.manual_hyper_parameters()
- self.clf_workflow = XgboostClassification(
+ hyper_parameters = XGBoostClassification.manual_hyper_parameters()
+ self.clf_workflow = XGBoostClassification(
n_estimators=hyper_parameters["n_estimators"],
learning_rate=hyper_parameters["learning_rate"],
max_depth=hyper_parameters["max_depth"],
@@ -196,7 +196,7 @@ def activate(
elif self.model_name == "Random Forest":
self.clf_workflow = RandomForestClassification()
elif self.model_name == "Xgboost":
- self.clf_workflow = XgboostClassification()
+ self.clf_workflow = XGBoostClassification()
elif self.model_name == "Logistic Regression":
self.clf_workflow = LogisticRegressionClassification()
elif self.model_name == "Multi-layer Perceptron":
diff --git a/geochemistrypi/data_mining/process/regress.py b/geochemistrypi/data_mining/process/regress.py
index efa34375..10083972 100644
--- a/geochemistrypi/data_mining/process/regress.py
+++ b/geochemistrypi/data_mining/process/regress.py
@@ -21,7 +21,7 @@
RegressionWorkflowBase,
SGDRegression,
SVMRegression,
- XgboostRegression,
+ XGBoostRegression,
)
from ._base import ModelSelectionBase
@@ -60,8 +60,8 @@ def activate(
self.transformer_config.update(poly_config)
self.reg_workflow.data_upload(X_train=X_train, X_test=X_test)
elif self.model_name == "Xgboost":
- hyper_parameters = XgboostRegression.manual_hyper_parameters()
- self.reg_workflow = XgboostRegression(
+ hyper_parameters = XGBoostRegression.manual_hyper_parameters()
+ self.reg_workflow = XGBoostRegression(
n_estimators=hyper_parameters["n_estimators"],
learning_rate=hyper_parameters["learning_rate"],
max_depth=hyper_parameters["max_depth"],
@@ -229,7 +229,7 @@ def activate(
self.transformer_config.update(poly_config)
self.reg_workflow.data_upload(X_train=X_train, X_test=X_test)
elif self.model_name == "Xgboost":
- self.reg_workflow = XgboostRegression()
+ self.reg_workflow = XGBoostRegression()
elif self.model_name == "Decision Tree":
self.reg_workflow = DecisionTreeRegression()
elif self.model_name == "Extra-Trees":
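
Both process modules select the workflow class by matching the user-facing string `"Xgboost"`, so only the imported identifier changes here. A table-driven variant of the same dispatch, sketched as a possible simplification rather than the project's actual code:

```python
# Hypothetical mapping; the project currently uses an elif chain.
from geochemistrypi.data_mining.model.regression import (
    DecisionTreeRegression,
    ExtraTreesRegression,
    XGBoostRegression,
)

WORKFLOWS = {
    "Xgboost": XGBoostRegression,
    "Decision Tree": DecisionTreeRegression,
    "Extra-Trees": ExtraTreesRegression,
}

def build_workflow(model_name: str):
    try:
        return WORKFLOWS[model_name]()
    except KeyError:
        raise ValueError(f"Unknown model: {model_name!r}") from None
```
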