ZJUEarthData · SanyHe · Apr 18, 2024 · Apr 8, 2024 · Apr 8, 2024 · Apr 9, 2024
diff --git a/README.md b/README.md
@@ -148,7 +148,7 @@ https://docs.qq.com/document/DQ3BDeHhxRGNzSXZN)
 +  Data_AbnormalDetectioon.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1NqTQZCkv74Sn_iOJOKRc-QnJzpaWmnzC_lET_0ZreiQ/edit?usp=sharing) | [[Tencent Docs]](
 https://docs.qq.com/document/DQ2hqQ2N2ZGlOUWlT)
 
-**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**. 
+**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**.
 
 ## Running Example
 

diff --git a/docs/source/Home/Introduction.md b/docs/source/Home/Introduction.md
@@ -149,7 +149,7 @@ https://docs.qq.com/document/DQ3BDeHhxRGNzSXZN)
 +  Data_AbnormalDetectioon.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1NqTQZCkv74Sn_iOJOKRc-QnJzpaWmnzC_lET_0ZreiQ/edit?usp=sharing) | [[Tencent Docs]](
 https://docs.qq.com/document/DQ2hqQ2N2ZGlOUWlT)
 
-**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**. 
+**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**.
 
 ## Running Example
 

diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py
@@ -15,6 +15,7 @@
     CLUSTERING_MODELS,
     CLUSTERING_MODELS_WITH_MISSING_VALUES,
     DECOMPOSITION_MODELS,
+    DROP_MISSING_VALUE_STRATEGY,
     FEATURE_SCALING_STRATEGY,
     FEATURE_SELECTION_STRATEGY,
     IMPUTING_STRATEGY,
@@ -43,7 +44,7 @@
 from .process.decompose import DecompositionModelSelection
 from .process.detect import AbnormalDetectionModelSelection
 from .process.regress import RegressionModelSelection
-from .utils.base import check_package, clear_output, create_geopi_output_dir, get_os, install_package, log, save_data, show_warning
+from .utils.base import check_package, clear_output, copy_files, create_geopi_output_dir, get_os, install_package, log, save_data, show_warning
 from .utils.mlflow_utils import retrieve_previous_experiment_id
 
 
@@ -281,23 +282,49 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
         if is_process_missing_value == 1:
             process_missing_value_flag = True
             # If the user wants to deal with the missing values, then ask the user which strategy to use.
+            clear_output()
             print("-*-*- Strategy for Missing Values -*-*-")
             num2option(MISSING_VALUE_STRATEGY)
             print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.")
             print("Which strategy do you want to apply?")
             missing_value_strategy_num = limit_num_input(MISSING_VALUE_STRATEGY, SECTION[1], num_input)
+            clear_output()
             if missing_value_strategy_num == 1:
-                # Drop the rows with missing values
-                data_selected_dropped = data_selected.dropna()
-                # Reset the index of the data set after dropping the rows with missing values.
-                data_selected_dropped = data_selected_dropped.reset_index(drop=True)
-                print("Successfully drop the rows with missing values.")
-                print("The Selected Data Set After Dropping:")
-                print(data_selected_dropped)
-                print("Basic Statistical Information:")
-                save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
-                drop_rows_with_missing_value_flag = True
-                imputed_flag = False
+                print("-*-*-  Drop the rows with Missing Values -*-*-")
+                num2option(DROP_MISSING_VALUE_STRATEGY)
+                print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.")
+                print("Which strategy do you want to apply?")
+                drop_missing_value_strategy_num = limit_num_input(DROP_MISSING_VALUE_STRATEGY, SECTION[1], num_input)
+                if drop_missing_value_strategy_num == 1:
+                    # Drop the rows with missing values
+                    data_selected_dropped = data_selected.dropna()
+                    # Reset the index of the data set after dropping the rows with missing values.
+                    data_selected_dropped = data_selected_dropped.reset_index(drop=True)
+                    print("Successfully drop the rows with missing values.")
+                    print("The Selected Data Set After Dropping:")
+                    print(data_selected_dropped)
+                    print("Basic Statistical Information:")
+                    save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+                    drop_rows_with_missing_value_flag = True
+                    imputed_flag = False
+                elif drop_missing_value_strategy_num == 2:
+                    show_data_columns(data_selected.columns)
+                    drop_data_selected = create_sub_data_set(data_selected)
+                    for column_name in drop_data_selected.columns:
+                        # Drop the rows with missing values
+                        data_selected_dropped = data_selected.dropna(subset=[column_name])
+                        # Reset the index of the data set after dropping the rows with missing values.
+                        data_selected_dropped = data_selected_dropped.reset_index(drop=True)
+                    print("Successfully drop the rows with missing values.")
+                    print("The Selected Data Set After Dropping:")
+                    print(data_selected_dropped)
+                    print("Basic Statistical Information:")
+                    save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+                    drop_rows_with_missing_value_flag = True
+                    imputed_flag = False
+                    missing_value_flag = check_missing_value(data_selected_dropped)
+                    if missing_value_flag:
+                        process_missing_value_flag = False
             elif missing_value_strategy_num == 2:
                 # Don't drop the rows with missing values but use imputation techniques to deal with the missing values later.
                 # No need to save the data set here because it will be saved after imputation.
@@ -654,4 +681,10 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
                     else:
                         model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline)
                     clear_output()
+
+    # <--- Data Dumping --->
+    # In this section, convert the data in the output to the summary.
+    GEOPI_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_PATH")
+    GEOPI_SUMMARY_PATH = os.getenv("GEOPI_SUMMARY_PATH")
+    copy_files(GEOPI_OUTPUT_PATH, GEOPI_SUMMARY_PATH)
     mlflow.end_run()
diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py
@@ -15,6 +15,9 @@
 # the root directory where all the output stays
 OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output")
 
+# the root directory where the summary copy of each run's output files is stored
+SUMMARY_PATH = os.path.join(OUTPUT_PATH, "summary")
+
 # the directory where the artifact is saved within the MLflow run's artifact directory
 MLFLOW_ARTIFACT_DATA_PATH = "data"
 MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH = os.path.join("image", "statistic")
@@ -105,3 +108,5 @@
 FEATURE_SELECTION_STRATEGY = ["Generic Univariate Select", "Select K Best"]
 
 CALCULATION_METHOD_OPTION = ["Micro", "Macro", "Weighted"]
+
+DROP_MISSING_VALUE_STRATEGY = ["Drop All Rows with Missing Values", "Drop Rows with Missing Values by Specific Columns"]
diff --git a/geochemistrypi/data_mining/utils/base.py b/geochemistrypi/data_mining/utils/base.py
@@ -3,6 +3,7 @@
 import os
 import pickle
 import platform
+import shutil
 from typing import Optional
 
 import joblib
@@ -11,7 +12,7 @@
 from matplotlib import pyplot as plt
 from rich import print
 
-from ..constants import OUTPUT_PATH
+from ..constants import OUTPUT_PATH, SUMMARY_PATH
 
 
 def create_geopi_output_dir(experiment_name: str, run_name: str, sub_run_name: Optional[str] = None) -> None:
@@ -32,10 +33,14 @@ def create_geopi_output_dir(experiment_name: str, run_name: str, sub_run_name: O
     # timestamp = datetime.datetime.now().strftime("%m-%d-%H-%M")
     if sub_run_name:
         geopi_output_path = os.path.join(OUTPUT_PATH, experiment_name, f"{run_name}", sub_run_name)
+        geopi_summary_path = os.path.join(SUMMARY_PATH, experiment_name, f"{run_name}", sub_run_name)
     else:
         geopi_output_path = os.path.join(OUTPUT_PATH, experiment_name, f"{run_name}")
+        geopi_summary_path = os.path.join(SUMMARY_PATH, experiment_name, f"{run_name}")
     os.environ["GEOPI_OUTPUT_PATH"] = geopi_output_path
     os.makedirs(geopi_output_path, exist_ok=True)
+    os.environ["GEOPI_SUMMARY_PATH"] = geopi_summary_path
+    os.makedirs(geopi_summary_path, exist_ok=True)
 
     # Set the output artifacts path for the current run
     geopi_output_artifacts_path = os.path.join(geopi_output_path, "artifacts")
@@ -309,3 +314,20 @@ def show_warning(is_show: bool = True) -> None:
 
             os.environ["PYTHONWARNINGS"] = "ignore"
             # os.environ["PYTHONWARNINGS"] = "default"
+
+
+def copy_files(GEOPI_OUTPUT_PATH: str, GEOPI_SUMMARY_PATH: str) -> None:
+    """Copy every file found under the source folder into the destination folder.
+
+    The source folder is walked recursively and each file is copied straight into
+    the destination folder, so the sub-folder structure is flattened. Files that
+    share a base name overwrite one another; the last one visited wins.
+
+    Parameters
+    ----------
+    GEOPI_OUTPUT_PATH: str
+        Source folder path.
+
+    GEOPI_SUMMARY_PATH: str
+        Destination folder path. It is created if it does not exist yet.
+    """
+    # Without an existing folder, shutil.copy2 would treat the destination as a
+    # file name and every copied file would overwrite that single file.
+    os.makedirs(GEOPI_SUMMARY_PATH, exist_ok=True)
+    destination = os.path.abspath(GEOPI_SUMMARY_PATH)
+
+    for root, dirs, files in os.walk(GEOPI_OUTPUT_PATH):
+        # Prune the destination from the walk when it lives inside the source tree;
+        # otherwise its files would be copied onto themselves (shutil.SameFileError).
+        dirs[:] = [name for name in dirs if os.path.abspath(os.path.join(root, name)) != destination]
+        if os.path.abspath(root) == destination:
+            # Source and destination are the same folder: its own files are already in place.
+            continue
+        for file in files:
+            source_file_path = os.path.join(root, file)
+            shutil.copy2(source_file_path, GEOPI_SUMMARY_PATH)