Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

"fix: Improved the functionality for dropping missing values." #327

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 38 additions & 11 deletions geochemistrypi/data_mining/cli_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
CLUSTERING_MODELS,
CLUSTERING_MODELS_WITH_MISSING_VALUES,
DECOMPOSITION_MODELS,
DROP_MISSING_VALUE_STRATEGY,
FEATURE_SCALING_STRATEGY,
FEATURE_SELECTION_STRATEGY,
IMPUTING_STRATEGY,
Expand Down Expand Up @@ -281,23 +282,49 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
if is_process_missing_value == 1:
process_missing_value_flag = True
# If the user wants to deal with the missing values, then ask the user which strategy to use.
clear_output()
print("-*-*- Strategy for Missing Values -*-*-")
num2option(MISSING_VALUE_STRATEGY)
print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.")
print("Which strategy do you want to apply?")
missing_value_strategy_num = limit_num_input(MISSING_VALUE_STRATEGY, SECTION[1], num_input)
clear_output()
if missing_value_strategy_num == 1:
# Drop the rows with missing values
data_selected_dropped = data_selected.dropna()
# Reset the index of the data set after dropping the rows with missing values.
data_selected_dropped = data_selected_dropped.reset_index(drop=True)
print("Successfully drop the rows with missing values.")
print("The Selected Data Set After Dropping:")
print(data_selected_dropped)
print("Basic Statistical Information:")
save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
drop_rows_with_missing_value_flag = True
imputed_flag = False
print("-*-*- Drop the rows with Missing Values -*-*-")
num2option(DROP_MISSING_VALUE_STRATEGY)
print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.")
print("Which strategy do you want to apply?")
drop_missing_value_strategy_num = limit_num_input(DROP_MISSING_VALUE_STRATEGY, SECTION[1], num_input)
if drop_missing_value_strategy_num == 1:
# Drop the rows with missing values
data_selected_dropped = data_selected.dropna()
# Reset the index of the data set after dropping the rows with missing values.
data_selected_dropped = data_selected_dropped.reset_index(drop=True)
print("Successfully drop the rows with missing values.")
print("The Selected Data Set After Dropping:")
print(data_selected_dropped)
print("Basic Statistical Information:")
save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
drop_rows_with_missing_value_flag = True
imputed_flag = False
elif drop_missing_value_strategy_num == 2:
show_data_columns(data_selected.columns)
drop_data_selected = create_sub_data_set(data_selected)
for column_name in drop_data_selected.columns:
# Drop the rows with missing values
data_selected_dropped = data_selected.dropna(subset=[column_name])
# Reset the index of the data set after dropping the rows with missing values.
data_selected_dropped = data_selected_dropped.reset_index(drop=True)
print("Successfully drop the rows with missing values.")
print("The Selected Data Set After Dropping:")
print(data_selected_dropped)
print("Basic Statistical Information:")
save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
drop_rows_with_missing_value_flag = True
imputed_flag = False
missing_value_flag = check_missing_value(data_selected_dropped)
if missing_value_flag:
process_missing_value_flag = False
elif missing_value_strategy_num == 2:
# Don't drop the rows with missing values but use imputation techniques to deal with the missing values later.
# No need to save the data set here because it will be saved after imputation.
Expand Down
2 changes: 2 additions & 0 deletions geochemistrypi/data_mining/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,5 @@
# Display names of the available feature-selection strategies.
# NOTE(review): presumably shown as a numbered menu like the other *_STRATEGY lists — confirm at the use site.
FEATURE_SELECTION_STRATEGY = ["Generic Univariate Select", "Select K Best"]

# Display names of the selectable calculation methods.
# NOTE(review): the use site is not visible here; confirm what these options are applied to.
CALCULATION_METHOD_OPTION = ["Micro", "Macro", "Weighted"]

# Menu options for how rows containing missing values are dropped in the CLI pipeline.
# The list order is significant: the pipeline shows it as a numbered menu and branches on the
# 1-based choice (1 = drop every row that has any missing value, 2 = drop rows that have
# missing values in user-selected columns), so reordering entries changes which branch runs.
DROP_MISSING_VALUE_STRATEGY = ["Drop All Rows with Missing Values", "Drop Rows with Missing Values by Specific Columns"]
Loading