Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Improved the functionality for dropping missing values. #331

Merged
merged 3 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ https://docs.qq.com/document/DQ3BDeHhxRGNzSXZN)
+ Data_AbnormalDetectioon.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1NqTQZCkv74Sn_iOJOKRc-QnJzpaWmnzC_lET_0ZreiQ/edit?usp=sharing) | [[Tencent Docs]](
https://docs.qq.com/document/DQ2hqQ2N2ZGlOUWlT)

**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**.
**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**.

## Running Example

Expand Down
2 changes: 1 addition & 1 deletion docs/source/Home/Introduction.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ https://docs.qq.com/document/DQ3BDeHhxRGNzSXZN)
+ Data_AbnormalDetectioon.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1NqTQZCkv74Sn_iOJOKRc-QnJzpaWmnzC_lET_0ZreiQ/edit?usp=sharing) | [[Tencent Docs]](
https://docs.qq.com/document/DQ2hqQ2N2ZGlOUWlT)

**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**.
**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**.

## Running Example

Expand Down
57 changes: 45 additions & 12 deletions geochemistrypi/data_mining/cli_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
CLUSTERING_MODELS,
CLUSTERING_MODELS_WITH_MISSING_VALUES,
DECOMPOSITION_MODELS,
DROP_MISSING_VALUE_STRATEGY,
FEATURE_SCALING_STRATEGY,
FEATURE_SELECTION_STRATEGY,
IMPUTING_STRATEGY,
Expand Down Expand Up @@ -43,7 +44,7 @@
from .process.decompose import DecompositionModelSelection
from .process.detect import AbnormalDetectionModelSelection
from .process.regress import RegressionModelSelection
from .utils.base import check_package, clear_output, create_geopi_output_dir, get_os, install_package, log, save_data, show_warning
from .utils.base import check_package, clear_output, copy_files, create_geopi_output_dir, get_os, install_package, log, save_data, show_warning
from .utils.mlflow_utils import retrieve_previous_experiment_id


Expand Down Expand Up @@ -281,23 +282,49 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
if is_process_missing_value == 1:
process_missing_value_flag = True
# If the user wants to deal with the missing values, then ask the user which strategy to use.
clear_output()
print("-*-*- Strategy for Missing Values -*-*-")
num2option(MISSING_VALUE_STRATEGY)
print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.")
print("Which strategy do you want to apply?")
missing_value_strategy_num = limit_num_input(MISSING_VALUE_STRATEGY, SECTION[1], num_input)
clear_output()
if missing_value_strategy_num == 1:
# Drop the rows with missing values
data_selected_dropped = data_selected.dropna()
# Reset the index of the data set after dropping the rows with missing values.
data_selected_dropped = data_selected_dropped.reset_index(drop=True)
print("Successfully drop the rows with missing values.")
print("The Selected Data Set After Dropping:")
print(data_selected_dropped)
print("Basic Statistical Information:")
save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
drop_rows_with_missing_value_flag = True
imputed_flag = False
print("-*-*- Drop the rows with Missing Values -*-*-")
num2option(DROP_MISSING_VALUE_STRATEGY)
print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.")
print("Which strategy do you want to apply?")
drop_missing_value_strategy_num = limit_num_input(DROP_MISSING_VALUE_STRATEGY, SECTION[1], num_input)
if drop_missing_value_strategy_num == 1:
# Drop the rows with missing values
data_selected_dropped = data_selected.dropna()
# Reset the index of the data set after dropping the rows with missing values.
data_selected_dropped = data_selected_dropped.reset_index(drop=True)
print("Successfully drop the rows with missing values.")
print("The Selected Data Set After Dropping:")
print(data_selected_dropped)
print("Basic Statistical Information:")
save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
drop_rows_with_missing_value_flag = True
imputed_flag = False
elif drop_missing_value_strategy_num == 2:
show_data_columns(data_selected.columns)
drop_data_selected = create_sub_data_set(data_selected)
for column_name in drop_data_selected.columns:
# Drop the rows with missing values
data_selected_dropped = data_selected.dropna(subset=[column_name])
# Reset the index of the data set after dropping the rows with missing values.
data_selected_dropped = data_selected_dropped.reset_index(drop=True)
print("Successfully drop the rows with missing values.")
print("The Selected Data Set After Dropping:")
print(data_selected_dropped)
print("Basic Statistical Information:")
save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
drop_rows_with_missing_value_flag = True
imputed_flag = False
missing_value_flag = check_missing_value(data_selected_dropped)
if missing_value_flag:
process_missing_value_flag = False
elif missing_value_strategy_num == 2:
# Don't drop the rows with missing values but use imputation techniques to deal with the missing values later.
# No need to save the data set here because it will be saved after imputation.
Expand Down Expand Up @@ -654,4 +681,10 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
else:
model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline)
clear_output()

# <--- Data Dumping --->
# In this section, convert the data in the output to the summary.
GEOPI_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_PATH")
GEOPI_SUMMARY_PATH = os.getenv("GEOPI_SUMMARY_PATH")
copy_files(GEOPI_OUTPUT_PATH, GEOPI_SUMMARY_PATH)
mlflow.end_run()
5 changes: 5 additions & 0 deletions geochemistrypi/data_mining/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
# the root directory where all the output stays
OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output")

# the root directory where the summary copies of all the output files are collected
SUMMARY_PATH = os.path.join(OUTPUT_PATH, "summary")

# the directory where the artifact is saved within the MLflow run's artifact directory
MLFLOW_ARTIFACT_DATA_PATH = "data"
MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH = os.path.join("image", "statistic")
Expand Down Expand Up @@ -105,3 +108,5 @@
FEATURE_SELECTION_STRATEGY = ["Generic Univariate Select", "Select K Best"]

CALCULATION_METHOD_OPTION = ["Micro", "Macro", "Weighted"]

DROP_MISSING_VALUE_STRATEGY = ["Drop All Rows with Missing Values", "Drop Rows with Missing Values by Specific Columns"]
24 changes: 23 additions & 1 deletion geochemistrypi/data_mining/utils/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import pickle
import platform
import shutil
from typing import Optional

import joblib
Expand All @@ -11,7 +12,7 @@
from matplotlib import pyplot as plt
from rich import print

from ..constants import OUTPUT_PATH
from ..constants import OUTPUT_PATH, SUMMARY_PATH


def create_geopi_output_dir(experiment_name: str, run_name: str, sub_run_name: Optional[str] = None) -> None:
Expand All @@ -32,10 +33,14 @@ def create_geopi_output_dir(experiment_name: str, run_name: str, sub_run_name: O
# timestamp = datetime.datetime.now().strftime("%m-%d-%H-%M")
if sub_run_name:
geopi_output_path = os.path.join(OUTPUT_PATH, experiment_name, f"{run_name}", sub_run_name)
geopi_summary_path = os.path.join(SUMMARY_PATH, experiment_name, f"{run_name}", sub_run_name)
else:
geopi_output_path = os.path.join(OUTPUT_PATH, experiment_name, f"{run_name}")
geopi_summary_path = os.path.join(SUMMARY_PATH, experiment_name, f"{run_name}")
os.environ["GEOPI_OUTPUT_PATH"] = geopi_output_path
os.makedirs(geopi_output_path, exist_ok=True)
os.environ["GEOPI_SUMMARY_PATH"] = geopi_summary_path
os.makedirs(geopi_summary_path, exist_ok=True)

# Set the output artifacts path for the current run
geopi_output_artifacts_path = os.path.join(geopi_output_path, "artifacts")
Expand Down Expand Up @@ -309,3 +314,20 @@ def show_warning(is_show: bool = True) -> None:

os.environ["PYTHONWARNINGS"] = "ignore"
# os.environ["PYTHONWARNINGS"] = "default"


def copy_files(GEOPI_OUTPUT_PATH: str, GEOPI_SUMMARY_PATH: str) -> None:
    """Copy all files from the source folder tree into the destination folder.

    Every file found under the source folder, at any depth, is copied directly
    into the destination folder, so the sub-folder structure is flattened. When
    two files share the same name, the one visited later by ``os.walk``
    overwrites the earlier copy.

    Parameters
    ----------
    GEOPI_OUTPUT_PATH: str
        Source folder path.

    GEOPI_SUMMARY_PATH: str
        Destination folder path. It is created when it does not exist yet.
    """
    # shutil.copy2 only copies *into* a folder when that folder already exists;
    # otherwise it treats the path as the target file name (or fails outright).
    os.makedirs(GEOPI_SUMMARY_PATH, exist_ok=True)
    destination = os.path.realpath(GEOPI_SUMMARY_PATH)

    for root, dirs, files in os.walk(GEOPI_OUTPUT_PATH):
        # Prune the destination from the walk (in-place edit, as os.walk requires)
        # so that files already copied are never copied onto themselves.
        dirs[:] = [d for d in dirs if os.path.realpath(os.path.join(root, d)) != destination]
        if os.path.realpath(root) == destination:
            # Source and destination are the same folder: its own files are already in place.
            continue
        for file in files:
            shutil.copy2(os.path.join(root, file), GEOPI_SUMMARY_PATH)
Loading