Skip to content

Commit

Permalink
Merge pull request #276 from ZJUEarthData/web
Browse files Browse the repository at this point in the history
feat: add feature scaling for unsupervised learning.
  • Loading branch information
SanyHe authored Nov 5, 2023
2 parents 8db31ad + 75ec881 commit 56159c5
Show file tree
Hide file tree
Showing 10 changed files with 76 additions and 48 deletions.
12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ The following figure is the simplified overview of Geochemistry π: <br>

The following figure is the frontend-backend separation architecture of Geochemistry π: <br>

![Frontend-backend separation architecture of Geochemistry](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/3b27cbdb-ff50-4fa6-b1d1-4c75b253fdff)
<div style="text-align:center;">
<img src="https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/3b27cbdb-ff50-4fa6-b1d1-4c75b253fdff" alt="Frontend-backend separation architecture of Geochemistry" width="400" />
</div>

## Quick Installation

Expand Down Expand Up @@ -149,7 +151,9 @@ The following figure is the system architecture diagram: <br>

The following figure is the customized automated ML pipeline: <br>

![Customized automated ML pipeline](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/07078b43-30bd-46cf-abad-2da509fae6aa)
<div style="text-align:center;">
<img src="https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/07078b43-30bd-46cf-abad-2da509fae6aa" alt="Customized automated ML pipeline" width="400" />
</div>

The following figure is the design pattern hierarchical architecture: <br>

Expand All @@ -158,7 +162,9 @@ The following figure is the design pattern hierarchical architecture: <br>

The following figure is the storage mechanism: <br>

![Storage Mechanism](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/401f3429-c44f-4b76-b085-7a9dcc987cde)
<div style="text-align:center;">
<img src="https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/401f3429-c44f-4b76-b085-7a9dcc987cde" alt="Storage Mechanism" width="500" />
</div>

The whole package is under construction and the documentation is progressively evolving.

Expand Down
57 changes: 41 additions & 16 deletions geochemistrypi/data_mining/cli_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N

# <-- User Data Loading -->
with console.status("[bold green]Data Loading...[/bold green]", spinner="dots"):
sleep(1.5)
sleep(1)
if training_data_path:
# If the user provides file name, then load the data from the file.
data = read_data(file_path=training_data_path, is_own_data=1)
Expand Down Expand Up @@ -167,7 +167,9 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
print(f"Successfully loading the built-in training data set '{training_data_path}'.")
show_data_columns(data.columns)
clear_output()
# If the user doesn't provide the inference data path, then use the built-in data.
# If the user doesn't provide the inference data path and the training data is built-in data,
# then use the built-in data as inference data. Otherwise, the inference data is None.
# A None value means that the user doesn't want to run the model inference.
if (not inference_data_path) and is_built_in_data:
print("-*-*- Inference Data -*-*-")
if built_in_data_num == 1:
Expand Down Expand Up @@ -238,9 +240,6 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
test="kruskal",
confidence=0.05,
)
# print("The statistics test method: Kruskal Wallis Test")
# monte_carlo_simulator(data_processed, data_processed_imputed, sample_size=50,
# iteration=100, test='kruskal', confidence=0.05)
probability_plot(data_selected.columns, data_selected, data_selected_imputed)
basic_info(data_selected_imputed)
basic_statistic(data_selected_imputed)
Expand Down Expand Up @@ -272,6 +271,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
# divide X and y data set when it is supervised learning
logger.debug("Data Split")
if mode_num == 1 or mode_num == 2:
# Supervised learning
print("-*-*- Data Split - X Set and Y Set -*-*-")
print("Divide the processing data set into X (feature value) and Y (target value) respectively.")
# create X data set
Expand Down Expand Up @@ -356,11 +356,32 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
del data_selected_imputed_fe
clear_output()
else:
# unsupervised learning
feature_scaling_config = {}
feature_selection_config = {}
# Unsupervised learning
# Create X data set without data split because it is unsupervised learning
X = data_selected_imputed_fe
X_train = data_selected_imputed_fe
# <--- Feature Scaling --->
print("-*-*- Feature Scaling on X Set -*-*-")
num2option(OPTION)
is_feature_scaling = limit_num_input(OPTION, SECTION[1], num_input)
if is_feature_scaling == 1:
print("Which strategy do you want to apply?")
num2option(FEATURE_SCALING_STRATEGY)
feature_scaling_num = limit_num_input(FEATURE_SCALING_STRATEGY, SECTION[1], num_input)
feature_scaling_config, X_scaled_np = feature_scaler(X, FEATURE_SCALING_STRATEGY, feature_scaling_num - 1)
X = np2pd(X_scaled_np, X.columns)
del X_scaled_np
print("Data Set After Scaling:")
print(X)
print("Basic Statistical Information: ")
basic_statistic(X)
save_data(X, "X With Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
else:
feature_scaling_config = {}
clear_output()

feature_selection_config = {}
# Create training data without data split because it is unsupervised learning
X_train = X
y, X_test, y_train, y_test = None, None, None, None

# <--- Model Selection --->
Expand Down Expand Up @@ -401,11 +422,11 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
is_inference = False
# If the model is supervised learning, then allow the user to use model inference.
if mode_num == 1 or mode_num == 2:
print("-*-*- Feature Engineering on Inference Data -*-*-")
print("-*-*- Feature Engineering on Inference Data -*-*-")
is_inference = True
selected_columns = X_train.columns
# If feature_engineering_config is not {}, then apply feature engineering with the same operation to the input data.
if feature_engineering_config:
# If feature_engineering_config is not {} and inference_data is not None, then apply the same feature engineering operations to the inference data.
if feature_engineering_config and (inference_data is not None):
print("The same feature engineering operation will be applied to the inference data.")
new_feature_builder = FeatureConstructor(inference_data)
inference_data_fe = new_feature_builder.batch_build(feature_engineering_config)
Expand All @@ -418,6 +439,8 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
save_data(inference_data_fe, "Inference Data Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(inference_data_fe_selected, "Inference Data Feature-Engineering Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()
else:
inference_data_fe_selected = None

# <--- Model Training --->
logger.debug("Model Training")
Expand All @@ -439,8 +462,9 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N

# <--- Model Inference --->
logger.debug("Model Inference")
model_inference(inference_data_fe_selected, is_inference, feature_engineering_config, run, transformer_config, transform_pipeline)
clear_output()
if inference_data_fe_selected is not None:
model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline)
clear_output()
else:
# Run all models
for i in range(len(MODELS) - 1):
Expand All @@ -465,6 +489,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N

# <--- Model Inference --->
logger.debug("Model Inference")
model_inference(inference_data_fe_selected, is_inference, feature_engineering_config, run, transformer_config, transform_pipeline)
clear_output()
if inference_data_fe_selected is not None:
model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline)
clear_output()
mlflow.end_run()
9 changes: 3 additions & 6 deletions geochemistrypi/data_mining/data/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def build_transform_pipeline(imputation_config: Dict, feature_scaling_config: Di
return transformer_config, transform_pipeline


def model_inference(inference_data: pd.DataFrame, is_inference: bool, feature_engineering_config: Dict, run: object, transformer_config: Dict, transform_pipeline: Optional[object] = None):
def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: object, transformer_config: Dict, transform_pipeline: Optional[object] = None):
"""Run the model inference.
Parameters
Expand All @@ -119,9 +119,6 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, feature_en
is_inference : bool
Whether to run the model inference.
feature_engineering_config : Dict
The feature engineering configuration.
run : object
The model selection object.
Expand All @@ -131,8 +128,8 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, feature_en
transform_pipeline : Optional[object], optional
The transform pipeline object. The default is None.
"""
# If inference_data is not None and is_inference is True, then run the model inference.
if (inference_data is not None) and (is_inference is True):
# If is_inference is True, then run the model inference.
if is_inference is True:
print("-*-*- Model Inference -*-*-")
print("Use the trained model to make predictions on the inference data.")
# If transformer_config is not {}, then transform the inference data with the transform pipeline.
Expand Down
8 changes: 4 additions & 4 deletions geochemistrypi/data_mining/model/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -1201,7 +1201,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:
)


class XgboostClassification(TreeWorkflowMixin, ClassificationWorkflowBase):
class XGBoostClassification(TreeWorkflowMixin, ClassificationWorkflowBase):
"""The automation workflow of using Xgboost algorithm to make insightful products."""

name = "Xgboost"
Expand Down Expand Up @@ -1490,7 +1490,7 @@ def __init__(
early_stopping_rounds=self.early_stopping_rounds,
)

self.naming = XgboostClassification.name
self.naming = XGBoostClassification.name

@property
def settings(self) -> Dict:
Expand Down Expand Up @@ -1538,7 +1538,7 @@ def special_components(self, **kwargs) -> None:
# mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
# )
self._plot_feature_importance(
X_train=XgboostClassification.X_train,
X_train=XGBoostClassification.X_train,
trained_model=self.model,
image_config=self.image_config,
algorithm_name=self.naming,
Expand All @@ -1551,7 +1551,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:
"""Invoke all special application functions for this algorithms by FLAML framework."""
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
self._plot_feature_importance(
X_train=XgboostClassification.X_train,
X_train=XGBoostClassification.X_train,
trained_model=self.auto_model,
image_config=self.image_config,
algorithm_name=self.naming,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ def dbscan_manual_hyper_parameters() -> Dict:
print("Please specify the number of samples. A good starting range could be between 5 and 20, such as 5.")
min_samples = num_input(SECTION[2], "Min Samples: ")
print("Metric: The metric to use when calculating distance between instances in a feature array.")
print("Please specify the metric to use when calculating distance between instances in a feature array. It is generally recommended to leave it set to euclidean.")
print("Please specify the metric to use when calculating distance between instances in a feature array. It is generally recommended to leave it as 'euclidean'.")
metrics = ["euclidean", "manhattan", "chebyshev", "minkowski", "cosine", "correlation"]
metric = str_input(metrics, SECTION[2])
print("Algorithm: The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors.")
print("Please specify the algorithm. It is generally recommended to leave it set to auto.")
print("Please specify the algorithm. It is generally recommended to leave it as 'auto'.")
algorithms = ["auto", "ball_tree", "kd_tree", "brute"]
algorithm = str_input(algorithms, SECTION[2])
print("Leaf Size: Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def kmeans_manual_hyper_parameters() -> Dict:
print("Please specify the number of clusters for KMeans. A good starting range could be between 2 and 10, such as 4.")
n_clusters = num_input(SECTION[2], "N Clusters: ")
print("Init: Method for initialization of centroids. The centroids represent the center points of the clusters in the dataset.")
print("Please specify the method for initialization of centroids. It is generally recommended to leave it set to k-means++.")
print("Please specify the method for initialization of centroids. It is generally recommended to leave it as 'k-means++'.")
inits = ["k-means++", "random"]
init = str_input(inits, SECTION[2])
print("Max Iter: Maximum number of iterations of the k-means algorithm for a single run.")
Expand All @@ -32,7 +32,7 @@ def kmeans_manual_hyper_parameters() -> Dict:
print("Please specify the relative tolerance with regards to inertia to declare convergence. A good starting range could be between 0.0001 and 0.001, such as 0.0005.")
tol = float_input(0.0005, SECTION[2], "Tolerance: ")
print("Algorithm: The algorithm to use for the computation.")
print("Please specify the algorithm to use for the computation. It is generally recommended to leave it set to auto.")
print("Please specify the algorithm to use for the computation. It is generally recommended to leave it as 'auto'.")
print("Auto: selects 'elkan' for dense data and 'full' for sparse data. 'elkan' is generally faster on data with lower dimensionality, while 'full' is faster on data with higher dimensionality")
algorithms = ["auto", "full", "elkan"]
algorithm = str_input(algorithms, SECTION[2])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def pca_manual_hyper_parameters() -> Dict:
print("Please specify the number of components to retain. A good starting range could be between 2 and 10, such as 4.")
n_components = num_input(SECTION[2], "N Components: ")
print("SVD Solver: This parameter specifies the algorithm used to perform the singular value decomposition.")
print("Please specify the algorithm. It is generally recommended to leave it set to auto.")
print("Please specify the algorithm. It is generally recommended to leave it as 'auto'.")
svd_solvers = ["auto", "full", "arpack", "randomized"]
svd_solver = str_input(svd_solvers, SECTION[2])
hyper_parameters = {"n_components": n_components, "svd_solver": svd_solver}
Expand Down
12 changes: 6 additions & 6 deletions geochemistrypi/data_mining/model/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ def special_components(self, **kwargs) -> None:
)


class XgboostRegression(TreeWorkflowMixin, RegressionWorkflowBase):
class XGBoostRegression(TreeWorkflowMixin, RegressionWorkflowBase):
"""The automation workflow of using Xgboost algorithm to make insightful products."""

name = "Xgboost"
Expand Down Expand Up @@ -591,7 +591,7 @@ def __init__(
early_stopping_rounds=self.early_stopping_rounds,
)

self.naming = XgboostRegression.name
self.naming = XGBoostRegression.name

@property
def settings(self) -> Dict:
Expand Down Expand Up @@ -625,15 +625,15 @@ def special_components(self, **kwargs) -> None:
"""Invoke all special application functions for this algorithms by Scikit-learn framework."""
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
self._plot_feature_importance(
X_train=XgboostRegression.X_train,
X_train=XGBoostRegression.X_train,
trained_model=self.model,
image_config=self.image_config,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)
# self._histograms_feature_weights(
# X=XgboostRegression.X,
# X=XGBoostRegression.X,
# trained_model=self.model,
# image_config=self.image_config,
# algorithm_name=self.naming,
Expand All @@ -646,15 +646,15 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:
"""Invoke all special application functions for this algorithms by FLAML framework."""
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
self._plot_feature_importance(
X_train=XgboostRegression.X_train,
X_train=XGBoostRegression.X_train,
trained_model=self.auto_model,
image_config=self.image_config,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
)
# self._histograms_feature_weights(
# X=XgboostRegression.X,
# X=XGBoostRegression.X,
# trained_model=self.auto_model,
# image_config=self.image_config,
# algorithm_name=self.naming,
Expand Down
8 changes: 4 additions & 4 deletions geochemistrypi/data_mining/process/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
MLPClassification,
RandomForestClassification,
SVMClassification,
XgboostClassification,
XGBoostClassification,
)
from ._base import ModelSelectionBase

Expand Down Expand Up @@ -80,8 +80,8 @@ def activate(
max_samples=hyper_parameters["max_samples"],
)
elif self.model_name == "Xgboost":
hyper_parameters = XgboostClassification.manual_hyper_parameters()
self.clf_workflow = XgboostClassification(
hyper_parameters = XGBoostClassification.manual_hyper_parameters()
self.clf_workflow = XGBoostClassification(
n_estimators=hyper_parameters["n_estimators"],
learning_rate=hyper_parameters["learning_rate"],
max_depth=hyper_parameters["max_depth"],
Expand Down Expand Up @@ -196,7 +196,7 @@ def activate(
elif self.model_name == "Random Forest":
self.clf_workflow = RandomForestClassification()
elif self.model_name == "Xgboost":
self.clf_workflow = XgboostClassification()
self.clf_workflow = XGBoostClassification()
elif self.model_name == "Logistic Regression":
self.clf_workflow = LogisticRegressionClassification()
elif self.model_name == "Multi-layer Perceptron":
Expand Down
Loading

0 comments on commit 56159c5

Please sign in to comment.