Skip to content

Commit

Permalink
perf: built-in inference data and model inference control are improved.
Browse files Browse the repository at this point in the history
  • Loading branch information
MinkiGao committed Nov 12, 2023
1 parent aa8a5f4 commit 87605b3
Showing 1 changed file with 25 additions and 29 deletions.
54 changes: 25 additions & 29 deletions geochemistrypi/data_mining/cli_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
# <--- Built-in Data Loading --->
logger.debug("Built-in Data Loading")
# If the user doesn't provide the training data path, then use the built-in data.
is_built_in_data = False
if not training_data_path:
print("-*-*- Built-in Data Option-*-*-")
num2option(TEST_DATA_OPTION)
Expand All @@ -163,23 +162,15 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
elif built_in_data_num == 4:
training_data_path = "Data_Decomposition.xlsx"
data = read_data(file_path=training_data_path)
is_built_in_data = True
print(f"Successfully loading the built-in training data set '{training_data_path}'.")
show_data_columns(data.columns)
clear_output()
# If the user doesn't provide the inference data path and the training data is built-in data,
# then use the built-in data as inference data. Otherwise, the inference data is None.
# It means that the user doesn't want to run the model inference.
if (not inference_data_path) and is_built_in_data:
print("-*-*- Inference Data -*-*-")
if built_in_data_num == 1:
inference_data_path = "Data_Regression.xlsx"
elif built_in_data_num == 2:
inference_data_path = "Data_Classification.xlsx"
elif built_in_data_num == 3:
inference_data_path = "Data_Clustering.xlsx"
elif built_in_data_num == 4:
inference_data_path = "Data_Decomposition.xlsx"

# <--- Built-in Inference Data Loading --->
logger.debug("Built-in Inference Data Loading")
# If the user doesn't provide the inference data path, it means that the user doesn't want to run the model inference.
if inference_data_path:
print("-*-*- Built-in Inference Data-*-*-")
inference_data = read_data(file_path=inference_data_path)
print(f"Successfully loading the built-in inference data set '{inference_data_path}'.")
show_data_columns(inference_data.columns)
Expand Down Expand Up @@ -425,22 +416,27 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
print("-*-*- Feature Engineering on Inference Data -*-*-")
is_inference = True
selected_columns = X_train.columns
# If feature_engineering_config is not {} and inference_data is not None, then apply feature engineering with the same operation to the input data.
if feature_engineering_config and (inference_data is not None):
print("The same feature engineering operation will be applied to the inference data.")
new_feature_builder = FeatureConstructor(inference_data)
inference_data_fe = new_feature_builder.batch_build(feature_engineering_config)
if inference_data is not None:
if feature_engineering_config:
# If inference_data is not None and feature_engineering_config is not empty, apply the same feature engineering operations to the inference data.
print("The same feature engineering operation will be applied to the inference data.")
new_feature_builder = FeatureConstructor(inference_data)
inference_data_fe = new_feature_builder.batch_build(feature_engineering_config)
inference_data_fe_selected = inference_data_fe[selected_columns]
save_data(inference_data, "Inference Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(inference_data_fe, "Inference Data Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(inference_data_fe_selected, "Inference Data Feature-Engineering Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
else:
print("You have not applied feature engineering to the training data.")
print("Hence, no feature engineering operation will be applied to the inference data.")
inference_data_fe_selected = inference_data[selected_columns]
save_data(inference_data, "Inference Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(inference_data_fe_selected, "Inference Data Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
else:
print("You have not applied feature engineering to the training data.")
print("Hence, no feature engineering operation will be applied to the inference data.")
inference_data_fe = inference_data
inference_data_fe_selected = inference_data_fe[selected_columns]
save_data(inference_data, "Inference Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(inference_data_fe, "Inference Data Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(inference_data_fe_selected, "Inference Data Feature-Engineering Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
# If the user doesn't provide the inference data path, it means that the user doesn't want to run the model inference.
print("You did not enter inference data.")
inference_data_fe_selected = None
clear_output()
else:
inference_data_fe_selected = None

# <--- Model Training --->
logger.debug("Model Training")
Expand Down

0 comments on commit 87605b3

Please sign in to comment.