Skip to content

Commit

Permalink
Merge pull request #331 from ZJUEarthData/dev/Yongkang
Browse files Browse the repository at this point in the history
fix: Improved the functionality for dropping missing values.
  • Loading branch information
SanyHe authored Apr 18, 2024
2 parents e9e212d + 13e05b2 commit 11ef7f0
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 15 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ https://docs.qq.com/document/DQ3BDeHhxRGNzSXZN)
+ Data_AbnormalDetectioon.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1NqTQZCkv74Sn_iOJOKRc-QnJzpaWmnzC_lET_0ZreiQ/edit?usp=sharing) | [[Tencent Docs]](
https://docs.qq.com/document/DQ2hqQ2N2ZGlOUWlT)

**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**.
**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**.

## Running Example

Expand Down
2 changes: 1 addition & 1 deletion docs/source/Home/Introduction.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ https://docs.qq.com/document/DQ3BDeHhxRGNzSXZN)
+ Data_AbnormalDetectioon.xlsx [[Google Drive]](https://docs.google.com/spreadsheets/d/1NqTQZCkv74Sn_iOJOKRc-QnJzpaWmnzC_lET_0ZreiQ/edit?usp=sharing) | [[Tencent Docs]](
https://docs.qq.com/document/DQ2hqQ2N2ZGlOUWlT)

**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**.
**Note**: For more detail on data preparation, please refer to our online documentation in **Model Example** under the section of **FOR USER**.

## Running Example

Expand Down
57 changes: 45 additions & 12 deletions geochemistrypi/data_mining/cli_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
CLUSTERING_MODELS,
CLUSTERING_MODELS_WITH_MISSING_VALUES,
DECOMPOSITION_MODELS,
DROP_MISSING_VALUE_STRATEGY,
FEATURE_SCALING_STRATEGY,
FEATURE_SELECTION_STRATEGY,
IMPUTING_STRATEGY,
Expand Down Expand Up @@ -43,7 +44,7 @@
from .process.decompose import DecompositionModelSelection
from .process.detect import AbnormalDetectionModelSelection
from .process.regress import RegressionModelSelection
from .utils.base import check_package, clear_output, create_geopi_output_dir, get_os, install_package, log, save_data, show_warning
from .utils.base import check_package, clear_output, copy_files, create_geopi_output_dir, get_os, install_package, log, save_data, show_warning
from .utils.mlflow_utils import retrieve_previous_experiment_id


Expand Down Expand Up @@ -281,23 +282,49 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
if is_process_missing_value == 1:
process_missing_value_flag = True
# If the user wants to deal with the missing values, then ask the user which strategy to use.
clear_output()
print("-*-*- Strategy for Missing Values -*-*-")
num2option(MISSING_VALUE_STRATEGY)
print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.")
print("Which strategy do you want to apply?")
missing_value_strategy_num = limit_num_input(MISSING_VALUE_STRATEGY, SECTION[1], num_input)
clear_output()
if missing_value_strategy_num == 1:
# Drop the rows with missing values
data_selected_dropped = data_selected.dropna()
# Reset the index of the data set after dropping the rows with missing values.
data_selected_dropped = data_selected_dropped.reset_index(drop=True)
print("Successfully drop the rows with missing values.")
print("The Selected Data Set After Dropping:")
print(data_selected_dropped)
print("Basic Statistical Information:")
save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
drop_rows_with_missing_value_flag = True
imputed_flag = False
print("-*-*- Drop the rows with Missing Values -*-*-")
num2option(DROP_MISSING_VALUE_STRATEGY)
print("Notice: Drop the rows with missing values may lead to a significant loss of data if too many features are chosen.")
print("Which strategy do you want to apply?")
drop_missing_value_strategy_num = limit_num_input(DROP_MISSING_VALUE_STRATEGY, SECTION[1], num_input)
if drop_missing_value_strategy_num == 1:
# Drop the rows with missing values
data_selected_dropped = data_selected.dropna()
# Reset the index of the data set after dropping the rows with missing values.
data_selected_dropped = data_selected_dropped.reset_index(drop=True)
print("Successfully drop the rows with missing values.")
print("The Selected Data Set After Dropping:")
print(data_selected_dropped)
print("Basic Statistical Information:")
save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
drop_rows_with_missing_value_flag = True
imputed_flag = False
elif drop_missing_value_strategy_num == 2:
show_data_columns(data_selected.columns)
drop_data_selected = create_sub_data_set(data_selected)
for column_name in drop_data_selected.columns:
# Drop the rows with missing values
data_selected_dropped = data_selected.dropna(subset=[column_name])
# Reset the index of the data set after dropping the rows with missing values.
data_selected_dropped = data_selected_dropped.reset_index(drop=True)
print("Successfully drop the rows with missing values.")
print("The Selected Data Set After Dropping:")
print(data_selected_dropped)
print("Basic Statistical Information:")
save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
drop_rows_with_missing_value_flag = True
imputed_flag = False
missing_value_flag = check_missing_value(data_selected_dropped)
if missing_value_flag:
process_missing_value_flag = False
elif missing_value_strategy_num == 2:
# Don't drop the rows with missing values but use imputation techniques to deal with the missing values later.
# No need to save the data set here because it will be saved after imputation.
Expand Down Expand Up @@ -654,4 +681,10 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
else:
model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline)
clear_output()

# <--- Data Dumping --->
# In this section, convert the data in the output to the summary.
GEOPI_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_PATH")
GEOPI_SUMMARY_PATH = os.getenv("GEOPI_SUMMARY_PATH")
copy_files(GEOPI_OUTPUT_PATH, GEOPI_SUMMARY_PATH)
mlflow.end_run()
5 changes: 5 additions & 0 deletions geochemistrypi/data_mining/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
# the root directory where all the output stays
OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output")

# the summary root directory where all the output stays
SUMMARY_PATH = os.path.join(OUTPUT_PATH, "summary")

# the directory where the artifact is saved within the MLflow run's artifact directory
MLFLOW_ARTIFACT_DATA_PATH = "data"
MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH = os.path.join("image", "statistic")
Expand Down Expand Up @@ -105,3 +108,5 @@
FEATURE_SELECTION_STRATEGY = ["Generic Univariate Select", "Select K Best"]

CALCULATION_METHOD_OPTION = ["Micro", "Macro", "Weighted"]

DROP_MISSING_VALUE_STRATEGY = ["Drop All Rows with Missing Values", "Drop Rows with Missing Values by Specific Columns"]
24 changes: 23 additions & 1 deletion geochemistrypi/data_mining/utils/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import pickle
import platform
import shutil
from typing import Optional

import joblib
Expand All @@ -11,7 +12,7 @@
from matplotlib import pyplot as plt
from rich import print

from ..constants import OUTPUT_PATH
from ..constants import OUTPUT_PATH, SUMMARY_PATH


def create_geopi_output_dir(experiment_name: str, run_name: str, sub_run_name: Optional[str] = None) -> None:
Expand All @@ -32,10 +33,14 @@ def create_geopi_output_dir(experiment_name: str, run_name: str, sub_run_name: O
# timestamp = datetime.datetime.now().strftime("%m-%d-%H-%M")
if sub_run_name:
geopi_output_path = os.path.join(OUTPUT_PATH, experiment_name, f"{run_name}", sub_run_name)
geopi_summary_path = os.path.join(SUMMARY_PATH, experiment_name, f"{run_name}", sub_run_name)
else:
geopi_output_path = os.path.join(OUTPUT_PATH, experiment_name, f"{run_name}")
geopi_summary_path = os.path.join(SUMMARY_PATH, experiment_name, f"{run_name}")
os.environ["GEOPI_OUTPUT_PATH"] = geopi_output_path
os.makedirs(geopi_output_path, exist_ok=True)
os.environ["GEOPI_SUMMARY_PATH"] = geopi_summary_path
os.makedirs(geopi_summary_path, exist_ok=True)

# Set the output artifacts path for the current run
geopi_output_artifacts_path = os.path.join(geopi_output_path, "artifacts")
Expand Down Expand Up @@ -309,3 +314,20 @@ def show_warning(is_show: bool = True) -> None:

os.environ["PYTHONWARNINGS"] = "ignore"
# os.environ["PYTHONWARNINGS"] = "default"


def copy_files(GEOPI_OUTPUT_PATH: str, GEOPI_SUMMARY_PATH: str) -> None:
    """Copy every file under the source folder tree into the destination folder.

    Note: the source directory structure is NOT preserved — all files found
    anywhere under the source tree are copied directly into the destination
    folder, so files sharing a base name in different subdirectories will
    overwrite one another.

    Parameters
    ----------
    GEOPI_OUTPUT_PATH: str
        Source folder path.

    GEOPI_SUMMARY_PATH: str
        Destination folder path.
    """
    # Ensure the destination exists; shutil.copy2 raises if it is missing.
    os.makedirs(GEOPI_SUMMARY_PATH, exist_ok=True)
    for root, _, files in os.walk(GEOPI_OUTPUT_PATH):
        for file in files:
            source_file_path = os.path.join(root, file)
            # copy2 preserves metadata (e.g. timestamps) as well as content.
            shutil.copy2(source_file_path, GEOPI_SUMMARY_PATH)

0 comments on commit 11ef7f0

Please sign in to comment.