diff --git a/README.md b/README.md index 80fcafd9..5e83b343 100644 --- a/README.md +++ b/README.md @@ -148,7 +148,7 @@ https://docs.qq.com/document/DQ3BDeHhxRGNzSXZN) + Data_AbnormalDetectioon.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1NqTQZCkv74Sn_iOJOKRc-QnJzpaWmnzC_lET_0ZreiQ/edit?usp=sharing) | [[Tencent Docs]]( https://docs.qq.com/document/DQ2hqQ2N2ZGlOUWlT) -**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**. +**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**. ## Running Example diff --git a/docs/source/Home/Introduction.md b/docs/source/Home/Introduction.md index 62199014..40705df8 100644 --- a/docs/source/Home/Introduction.md +++ b/docs/source/Home/Introduction.md @@ -149,7 +149,7 @@ https://docs.qq.com/document/DQ3BDeHhxRGNzSXZN) + Data_AbnormalDetectioon.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1NqTQZCkv74Sn_iOJOKRc-QnJzpaWmnzC_lET_0ZreiQ/edit?usp=sharing) | [[Tencent Docs]]( https://docs.qq.com/document/DQ2hqQ2N2ZGlOUWlT) -**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**. +**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**. 
## Running Example diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py index 40575e0a..387d78bd 100644 --- a/geochemistrypi/data_mining/cli_pipeline.py +++ b/geochemistrypi/data_mining/cli_pipeline.py @@ -15,6 +15,7 @@ CLUSTERING_MODELS, CLUSTERING_MODELS_WITH_MISSING_VALUES, DECOMPOSITION_MODELS, + DROP_MISSING_VALUE_STRATEGY, FEATURE_SCALING_STRATEGY, FEATURE_SELECTION_STRATEGY, IMPUTING_STRATEGY, @@ -43,7 +44,7 @@ from .process.decompose import DecompositionModelSelection from .process.detect import AbnormalDetectionModelSelection from .process.regress import RegressionModelSelection -from .utils.base import check_package, clear_output, create_geopi_output_dir, get_os, install_package, log, save_data, show_warning +from .utils.base import check_package, clear_output, copy_files, create_geopi_output_dir, get_os, install_package, log, save_data, show_warning from .utils.mlflow_utils import retrieve_previous_experiment_id @@ -281,23 +282,49 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = if is_process_missing_value == 1: process_missing_value_flag = True # If the user wants to deal with the missing values, then ask the user which strategy to use. + clear_output() print("-*-*- Strategy for Missing Values -*-*-") num2option(MISSING_VALUE_STRATEGY) print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.") print("Which strategy do you want to apply?") missing_value_strategy_num = limit_num_input(MISSING_VALUE_STRATEGY, SECTION[1], num_input) + clear_output() if missing_value_strategy_num == 1: - # Drop the rows with missing values - data_selected_dropped = data_selected.dropna() - # Reset the index of the data set after dropping the rows with missing values. 
- data_selected_dropped = data_selected_dropped.reset_index(drop=True) - print("Successfully drop the rows with missing values.") - print("The Selected Data Set After Dropping:") - print(data_selected_dropped) - print("Basic Statistical Information:") - save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) - drop_rows_with_missing_value_flag = True - imputed_flag = False + print("-*-*- Drop the rows with Missing Values -*-*-") + num2option(DROP_MISSING_VALUE_STRATEGY) + print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.") + print("Which strategy do you want to apply?") + drop_missing_value_strategy_num = limit_num_input(DROP_MISSING_VALUE_STRATEGY, SECTION[1], num_input) + if drop_missing_value_strategy_num == 1: + # Drop the rows with missing values + data_selected_dropped = data_selected.dropna() + # Reset the index of the data set after dropping the rows with missing values. + data_selected_dropped = data_selected_dropped.reset_index(drop=True) + print("Successfully drop the rows with missing values.") + print("The Selected Data Set After Dropping:") + print(data_selected_dropped) + print("Basic Statistical Information:") + save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + drop_rows_with_missing_value_flag = True + imputed_flag = False + elif drop_missing_value_strategy_num == 2: + show_data_columns(data_selected.columns) + drop_data_selected = create_sub_data_set(data_selected) + for column_name in drop_data_selected.columns: + # Drop the rows with missing values + data_selected_dropped = data_selected.dropna(subset=[column_name]) + # Reset the index of the data set after dropping the rows with missing values. 
+ data_selected_dropped = data_selected_dropped.reset_index(drop=True) + print("Successfully drop the rows with missing values.") + print("The Selected Data Set After Dropping:") + print(data_selected_dropped) + print("Basic Statistical Information:") + save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + drop_rows_with_missing_value_flag = True + imputed_flag = False + missing_value_flag = check_missing_value(data_selected_dropped) + if missing_value_flag: + process_missing_value_flag = False elif missing_value_strategy_num == 2: # Don't drop the rows with missing values but use imputation techniques to deal with the missing values later. # No need to save the data set here because it will be saved after imputation. @@ -654,4 +681,10 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = else: model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline) clear_output() + + # <--- Data Dumping ---> + # In this section, convert the data in the output to the summary. 
+ GEOPI_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_PATH") + GEOPI_SUMMARY_PATH = os.getenv("GEOPI_SUMMARY_PATH") + copy_files(GEOPI_OUTPUT_PATH, GEOPI_SUMMARY_PATH) mlflow.end_run() diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py index 01a71d2c..b75bcfbb 100644 --- a/geochemistrypi/data_mining/constants.py +++ b/geochemistrypi/data_mining/constants.py @@ -15,6 +15,9 @@ # the root directory where all the output stays OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output") +# the summary root directory where all the output stays +SUMMARY_PATH = os.path.join(OUTPUT_PATH, "summary") + # the directory where the artifact is saved within the MLflow run's artifact directory MLFLOW_ARTIFACT_DATA_PATH = "data" MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH = os.path.join("image", "statistic") @@ -105,3 +108,5 @@ FEATURE_SELECTION_STRATEGY = ["Generic Univariate Select", "Select K Best"] CALCULATION_METHOD_OPTION = ["Micro", "Macro", "Weighted"] + +DROP_MISSING_VALUE_STRATEGY = ["Drop All Rows with Missing Values", "Drop Rows with Missing Values by Specific Columns"] diff --git a/geochemistrypi/data_mining/utils/base.py b/geochemistrypi/data_mining/utils/base.py index 8d95625a..ac9dcfa1 100644 --- a/geochemistrypi/data_mining/utils/base.py +++ b/geochemistrypi/data_mining/utils/base.py @@ -3,6 +3,7 @@ import os import pickle import platform +import shutil from typing import Optional import joblib @@ -11,7 +12,7 @@ from matplotlib import pyplot as plt from rich import print -from ..constants import OUTPUT_PATH +from ..constants import OUTPUT_PATH, SUMMARY_PATH def create_geopi_output_dir(experiment_name: str, run_name: str, sub_run_name: Optional[str] = None) -> None: @@ -32,10 +33,14 @@ def create_geopi_output_dir(experiment_name: str, run_name: str, sub_run_name: O # timestamp = datetime.datetime.now().strftime("%m-%d-%H-%M") if sub_run_name: geopi_output_path = os.path.join(OUTPUT_PATH, experiment_name, f"{run_name}", sub_run_name) + 
def copy_files(GEOPI_OUTPUT_PATH: str, GEOPI_SUMMARY_PATH: str) -> None:
    """Copy all files from the source folder to the destination folder.

    The source folder is walked recursively and every file found is copied
    with ``shutil.copy2`` straight into the destination folder, so the
    destination is flat: files sharing a name in different sub-folders
    overwrite one another and the one visited last is kept.

    Parameters
    ----------
    GEOPI_OUTPUT_PATH: str
        Source folder path.

    GEOPI_SUMMARY_PATH: str
        Destination folder path. It is created when it does not exist yet.
    """
    destination = os.path.realpath(GEOPI_SUMMARY_PATH)
    # Without an existing directory, copy2 would treat the destination as a
    # file name and every copy would overwrite that single file.
    os.makedirs(destination, exist_ok=True)

    for root, dirs, files in os.walk(GEOPI_OUTPUT_PATH):
        # Prune the destination from the walk (os.walk honours in-place edits
        # of ``dirs`` in top-down mode). Otherwise a destination nested inside
        # the source would be visited and its files copied onto themselves,
        # which raises shutil.SameFileError.
        dirs[:] = [name for name in dirs if os.path.realpath(os.path.join(root, name)) != destination]
        if os.path.realpath(root) == destination:
            # Source and destination are the same folder: its own files are
            # already in place, only the sub-folders still need collecting.
            continue
        for file in files:
            shutil.copy2(os.path.join(root, file), destination)