perf: Dropping rows with missing values in selected column

ZJUEarthData · Mar 18, 2024 · 9951ab2 · 9951ab2
1 parent b7c5fd4
commit 9951ab2
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 11 deletions.
diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py
@@ -29,6 +29,7 @@
     SECTION,
     TEST_DATA_OPTION,
     WORKING_PATH,
+    Drop_MISSING_VALUE_STRATEGY,
 )
 from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns
 from .data.feature_engineering import FeatureConstructor
@@ -281,23 +282,46 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
         if is_process_missing_value == 1:
             process_missing_value_flag = True
             # If the user wants to deal with the missing values, then ask the user which strategy to use.
+            clear_output()
             print("-*-*- Strategy for Missing Values -*-*-")
             num2option(MISSING_VALUE_STRATEGY)
             print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.")
             print("Which strategy do you want to apply?")
             missing_value_strategy_num = limit_num_input(MISSING_VALUE_STRATEGY, SECTION[1], num_input)
+            clear_output()
             if missing_value_strategy_num == 1:
-                # Drop the rows with missing values
-                data_selected_dropped = data_selected.dropna()
-                # Reset the index of the data set after dropping the rows with missing values.
-                data_selected_dropped = data_selected_dropped.reset_index(drop=True)
-                print("Successfully drop the rows with missing values.")
-                print("The Selected Data Set After Dropping:")
-                print(data_selected_dropped)
-                print("Basic Statistical Information:")
-                save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
-                drop_rows_with_missing_value_flag = True
-                imputed_flag = False
+                print("-*-*-  Drop the rows with Missing Values -*-*-")
+                num2option(Drop_MISSING_VALUE_STRATEGY)
+                print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.")
+                print("Which strategy do you want to apply?")
+                drop_missing_value_strategy_num = limit_num_input(Drop_MISSING_VALUE_STRATEGY, SECTION[1], num_input)
+                if drop_missing_value_strategy_num == 1:
+                    # Drop the rows with missing values
+                    data_selected_dropped = data_selected.dropna()
+                    # Reset the index of the data set after dropping the rows with missing values.
+                    data_selected_dropped = data_selected_dropped.reset_index(drop=True)
+                    print("Successfully drop the rows with missing values.")
+                    print("The Selected Data Set After Dropping:")
+                    print(data_selected_dropped)
+                    print("Basic Statistical Information:")
+                    save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+                    drop_rows_with_missing_value_flag = True
+                    imputed_flag = False
+                elif drop_missing_value_strategy_num == 2:
+                    show_data_columns(data_selected.columns)
+                    drop_data_selected = create_sub_data_set(data_selected)
+                    # Drop the rows that have a missing value in ANY of the chosen columns. All chosen columns are passed
+                    # to `subset` in one call: filtering column by column from `data_selected` would keep only the last filter.
+                    data_selected_dropped = data_selected.dropna(subset=list(drop_data_selected.columns))
+                    # Reset the index of the data set after dropping the rows with missing values.
+                    data_selected_dropped = data_selected_dropped.reset_index(drop=True)
+                    print("Successfully drop the rows with missing values.")
+                    print("The Selected Data Set After Dropping:")
+                    print(data_selected_dropped)
+                    print("Basic Statistical Information:")
+                    save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+                    drop_rows_with_missing_value_flag = True
+                    imputed_flag = False
             elif missing_value_strategy_num == 2:
                 # Don't drop the rows with missing values but use imputation techniques to deal with the missing values later.
                 # No need to save the data set here because it will be saved after imputation.

diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py
@@ -105,3 +105,5 @@
 FEATURE_SELECTION_STRATEGY = ["Generic Univariate Select", "Select K Best"]
 
 CALCULATION_METHOD_OPTION = ["Micro", "Macro", "Weighted"]
+
+Drop_MISSING_VALUE_STRATEGY = ["Drop All Rows with Missing Values", "Drop Rows with Missing Values in Column A"]
Original file line number	Diff line number	Diff line change
Expand Up		@@ -105,3 +105,5 @@
		FEATURE_SELECTION_STRATEGY = ["Generic Univariate Select", "Select K Best"]

		CALCULATION_METHOD_OPTION = ["Micro", "Macro", "Weighted"]

		Drop_MISSING_VALUE_STRATEGY = ["Drop All Rows with Missing Values", "Drop Rows with Missing Values in Column A"]