Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: move classification common function name to enum #392

Merged
merged 1 commit into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 30 additions & 20 deletions geochemistrypi/data_mining/model/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,24 +144,26 @@ def _classification_report(y_true: pd.DataFrame, y_predict: pd.DataFrame, algori
mlflow.log_artifact(os.path.join(store_path, f"Classification Report - {algorithm_name}.txt"))

@staticmethod
def _cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, average: str, cv_num: int, algorithm_name: str, store_path: str) -> None:
def _cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, graph_name: str, average: str, cv_num: int, algorithm_name: str, store_path: str) -> None:
"""Perform cross validation on the model."""
print("-----* Cross Validation *-----")
print(f"-----* {graph_name} *-----")
print(f"K-Folds: {cv_num}")
scores = cross_validation(trained_model, X_train, y_train, average=average, cv_num=cv_num)
scores_str = json.dumps(scores, indent=4)
save_text(scores_str, f"Cross Validation - {algorithm_name}", store_path)
save_text(scores_str, f"{graph_name} - {algorithm_name}", store_path)

@staticmethod
def _plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, name_column: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
def _plot_confusion_matrix(
y_test: pd.DataFrame, y_test_predict: pd.DataFrame, name_column: str, graph_name: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str
) -> None:
"""Plot the confusion matrix of the model."""
print("-----* Confusion Matrix *-----")
print(f"-----* {graph_name} *-----")
data = plot_confusion_matrix(y_test, y_test_predict, trained_model)
save_fig(f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path)
save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
index = [f"true_{i}" for i in range(int(y_test.nunique().values))]
columns = [f"pred_{i}" for i in range(int(y_test.nunique().values))]
data = pd.DataFrame(data, columns=columns, index=index)
save_data(data, name_column, f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path, True)
save_data(data, name_column, f"{graph_name} - {algorithm_name}", local_path, mlflow_path, True)

@staticmethod
def _plot_precision_recall(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
Expand Down Expand Up @@ -192,29 +194,29 @@ def _plot_precision_recall_threshold(
save_data(thresholds, name_column, f"{graph_name} - Thresholds", local_path, mlflow_path)

@staticmethod
def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
print("-----* ROC Curve *-----")
y_probs, fpr, tpr, thresholds = plot_ROC(X_test, y_test, trained_model, algorithm_name)
save_fig(f"ROC Curve - {algorithm_name}", local_path, mlflow_path)
def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, graph_name: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
print(f"-----* {graph_name} *-----")
y_probs, fpr, tpr, thresholds = plot_ROC(X_test, y_test, trained_model, graph_name, algorithm_name)
save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
y_probs = pd.DataFrame(y_probs, columns=["Probabilities"])
fpr = pd.DataFrame(fpr, columns=["False Positive Rate"])
tpr = pd.DataFrame(tpr, columns=["True Positive Rate"])
thresholds = pd.DataFrame(thresholds, columns=["Thresholds"])
save_data(y_probs, name_column, "ROC Curve - Probabilities", local_path, mlflow_path)
save_data(fpr, name_column, "ROC Curve - False Positive Rate", local_path, mlflow_path)
save_data(tpr, name_column, "ROC Curve - True Positive Rate", local_path, mlflow_path)
save_data(thresholds, name_column, "ROC Curve - Thresholds", local_path, mlflow_path)
save_data(y_probs, name_column, f"{graph_name} - Probabilities", local_path, mlflow_path)
save_data(fpr, name_column, f"{graph_name} - False Positive Rate", local_path, mlflow_path)
save_data(tpr, name_column, f"{graph_name} - True Positive Rate", local_path, mlflow_path)
save_data(thresholds, name_column, f"{graph_name} - Thresholds", local_path, mlflow_path)

@staticmethod
def _plot_2d_decision_boundary(
X: pd.DataFrame, X_test: pd.DataFrame, name_column1: str, name_column2: str, trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str
X: pd.DataFrame, X_test: pd.DataFrame, name_column1: str, name_column2: str, trained_model: object, graph_name: str, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str
) -> None:
"""Plot the decision boundary of the trained model with the testing data set below."""
print("-----* Two-dimensional Decision Boundary Diagram *-----")
print(f"-----* {graph_name} *-----")
plot_2d_decision_boundary(X, X_test, trained_model, image_config)
save_fig(f"Decision Boundary - {algorithm_name}", local_path, mlflow_path)
save_data(X, name_column1, "Decision Boundary - X", local_path, mlflow_path)
save_data(X_test, name_column2, "Decision Boundary - X Test", local_path, mlflow_path)
save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
save_data(X, name_column1, f"{graph_name} - X", local_path, mlflow_path)
save_data(X_test, name_column2, f"{graph_name} - X Test", local_path, mlflow_path)

@staticmethod
def sample_balance(X_train: pd.DataFrame, y_train: pd.DataFrame, name_column: str, local_path: str, mlflow_path: str) -> tuple:
Expand Down Expand Up @@ -286,6 +288,7 @@ def common_components(self) -> None:
trained_model=self.model,
X_train=ClassificationWorkflowBase.X_train,
y_train=ClassificationWorkflowBase.y_train,
graph_name=ClassificationCommonFunction.CROSS_VALIDATION.value,
average=average,
cv_num=10,
algorithm_name=self.naming,
Expand All @@ -296,6 +299,7 @@ def common_components(self) -> None:
y_test_predict=ClassificationWorkflowBase.y_test_predict,
name_column=ClassificationWorkflowBase.name_test,
trained_model=self.model,
graph_name=ClassificationCommonFunction.CONFUSION_MATRIX.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand Down Expand Up @@ -326,6 +330,7 @@ def common_components(self) -> None:
y_test=ClassificationWorkflowBase.y_test,
name_column=ClassificationWorkflowBase.name_test,
trained_model=self.model,
graph_name=ClassificationCommonFunction.ROC_CURVE.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand All @@ -348,6 +353,7 @@ def common_components(self) -> None:
name_column2=ClassificationWorkflowBase.name_test,
trained_model=self.model,
image_config=self.image_config,
graph_name=ClassificationCommonFunction.TWO_DIMENSIONAL_DECISION_BOUNDARY_DIAGRAM.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand All @@ -374,6 +380,7 @@ def common_components(self, is_automl: bool) -> None:
trained_model=self.auto_model,
X_train=ClassificationWorkflowBase.X_train,
y_train=ClassificationWorkflowBase.y_train,
graph_name=ClassificationCommonFunction.CROSS_VALIDATION.value,
average=average,
cv_num=10,
algorithm_name=self.naming,
Expand All @@ -384,6 +391,7 @@ def common_components(self, is_automl: bool) -> None:
y_test_predict=ClassificationWorkflowBase.y_test_predict,
name_column=ClassificationWorkflowBase.name_test,
trained_model=self.auto_model,
graph_name=ClassificationCommonFunction.CONFUSION_MATRIX.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand Down Expand Up @@ -414,6 +422,7 @@ def common_components(self, is_automl: bool) -> None:
y_test=ClassificationWorkflowBase.y_test,
name_column=ClassificationWorkflowBase.name_test,
trained_model=self.auto_model,
graph_name=ClassificationCommonFunction.ROC_CURVE.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand All @@ -436,6 +445,7 @@ def common_components(self, is_automl: bool) -> None:
name_column2=ClassificationWorkflowBase.name_test,
trained_model=self.auto_model,
image_config=self.image_config,
graph_name=ClassificationCommonFunction.TWO_DIMENSIONAL_DECISION_BOUNDARY_DIAGRAM.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def plot_precision_recall_threshold(X_test: pd.DataFrame, y_test: pd.DataFrame,
return y_probs, precisions, recalls, thresholds


def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, algorithm_name: str) -> tuple:
def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, graph_name: str, algorithm_name: str) -> tuple:
"""Plot the ROC curve.

Parameters
Expand Down Expand Up @@ -324,7 +324,7 @@ def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object,
plt.plot([0, 1], [0, 1], "r--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")
plt.title(f"ROC Curve - {algorithm_name}")
plt.title(f"{graph_name} - {algorithm_name}")
return y_probs, fpr, tpr, thresholds


Expand Down
Loading