Skip to content

Commit

Permalink
perf: Dropping rows with missing values in selected column
Browse files Browse the repository at this point in the history
  • Loading branch information
unknown authored and unknown committed Mar 18, 2024
1 parent b7c5fd4 commit 9951ab2
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 11 deletions.
46 changes: 35 additions & 11 deletions geochemistrypi/data_mining/cli_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
SECTION,
TEST_DATA_OPTION,
WORKING_PATH,
Drop_MISSING_VALUE_STRATEGY,
)
from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns
from .data.feature_engineering import FeatureConstructor
Expand Down Expand Up @@ -281,23 +282,46 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
if is_process_missing_value == 1:
process_missing_value_flag = True
# If the user wants to deal with the missing values, then ask the user which strategy to use.
clear_output()
print("-*-*- Strategy for Missing Values -*-*-")
num2option(MISSING_VALUE_STRATEGY)
print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.")
print("Which strategy do you want to apply?")
missing_value_strategy_num = limit_num_input(MISSING_VALUE_STRATEGY, SECTION[1], num_input)
clear_output()
if missing_value_strategy_num == 1:
# Drop the rows with missing values
data_selected_dropped = data_selected.dropna()
# Reset the index of the data set after dropping the rows with missing values.
data_selected_dropped = data_selected_dropped.reset_index(drop=True)
print("Successfully drop the rows with missing values.")
print("The Selected Data Set After Dropping:")
print(data_selected_dropped)
print("Basic Statistical Information:")
save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
drop_rows_with_missing_value_flag = True
imputed_flag = False
print("-*-*- Drop the rows with Missing Values -*-*-")
num2option(Drop_MISSING_VALUE_STRATEGY)
print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.")
print("Which strategy do you want to apply?")
drop_missing_value_strategy_num = limit_num_input(Drop_MISSING_VALUE_STRATEGY, SECTION[1], num_input)
if drop_missing_value_strategy_num == 1:
# Drop the rows with missing values
data_selected_dropped = data_selected.dropna()
# Reset the index of the data set after dropping the rows with missing values.
data_selected_dropped = data_selected_dropped.reset_index(drop=True)
print("Successfully drop the rows with missing values.")
print("The Selected Data Set After Dropping:")
print(data_selected_dropped)
print("Basic Statistical Information:")
save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
drop_rows_with_missing_value_flag = True
imputed_flag = False
elif drop_missing_value_strategy_num == 2:
show_data_columns(data_selected.columns)
drop_data_selected = create_sub_data_set(data_selected)
for column_name in drop_data_selected.columns:
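# Drop the rows that have a missing value in this column.
# NOTE(review): every iteration starts again from the unfiltered `data_selected`, so the result of the
# previous iteration is overwritten and only the last selected column ends up being applied. Passing all
# selected columns at once, e.g. dropna(subset=list(drop_data_selected.columns)), would apply every filter.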
data_selected_dropped = data_selected.dropna(subset=[column_name])
# Reset the index of the data set after dropping the rows with missing values.
data_selected_dropped = data_selected_dropped.reset_index(drop=True)
print("Successfully drop the rows with missing values.")
print("The Selected Data Set After Dropping:")
print(data_selected_dropped)
print("Basic Statistical Information:")
save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
drop_rows_with_missing_value_flag = True
imputed_flag = False
elif missing_value_strategy_num == 2:
# Don't drop the rows with missing values but use imputation techniques to deal with the missing values later.
# No need to save the data set here because it will be saved after imputation.
Expand Down
2 changes: 2 additions & 0 deletions geochemistrypi/data_mining/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,5 @@
# Labels for the supported feature-selection strategies.
# Presumably rendered as a numbered menu like the other *_STRATEGY lists; the caller is not visible here.
FEATURE_SELECTION_STRATEGY = ["Generic Univariate Select", "Select K Best"]

# NOTE(review): these look like averaging modes for a metric calculation; confirm against the caller.
CALCULATION_METHOD_OPTION = ["Micro", "Macro", "Weighted"]

# Sub-menu used by cli_pipeline once the user has chosen to drop rows with missing values: the list is
# printed with num2option() and the answer is read with limit_num_input().
# cli_pipeline branches on the returned number: 1 drops every row containing any missing value, 2 lets
# the user pick columns and drops rows based on missing values in those columns. Keep the list order in
# sync with that branching (presumably the number is the 1-based position in this list; verify).
# NOTE(review): the second label says "Column A", but the user actually selects the column(s) at runtime.
# NOTE(review): the mixed-case "Drop_" prefix departs from the UPPER_SNAKE_CASE used by the other
# constants; renaming it would also require updating the import in cli_pipeline.py.
Drop_MISSING_VALUE_STRATEGY = ["Drop All Rows with Missing Values", "Drop Rows with Missing Values in Column A"]

0 comments on commit 9951ab2

Please sign in to comment.