diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py
index 8af681f..4870d2b 100644
--- a/geochemistrypi/data_mining/model/classification.py
+++ b/geochemistrypi/data_mining/model/classification.py
@@ -144,24 +144,26 @@ def _classification_report(y_true: pd.DataFrame, y_predict: pd.DataFrame, algori
         mlflow.log_artifact(os.path.join(store_path, f"Classification Report - {algorithm_name}.txt"))
 
     @staticmethod
-    def _cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, average: str, cv_num: int, algorithm_name: str, store_path: str) -> None:
+    def _cross_validation(trained_model: object, X_train: pd.DataFrame, graph_name: str, y_train: pd.DataFrame, average: str, cv_num: int, algorithm_name: str, store_path: str) -> None:
         """Perform cross validation on the model."""
-        print("-----* Cross Validation *-----")
+        print(f"-----* {graph_name} *-----")
         print(f"K-Folds: {cv_num}")
-        scores = cross_validation(trained_model, X_train, y_train, average=average, cv_num=cv_num)
+        scores = cross_validation(trained_model, X_train, y_train, graph_name, average=average, cv_num=cv_num)
         scores_str = json.dumps(scores, indent=4)
-        save_text(scores_str, f"Cross Validation - {algorithm_name}", store_path)
+        save_text(scores_str, f"{graph_name} - {algorithm_name}", store_path)
 
     @staticmethod
-    def _plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, name_column: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+    def _plot_confusion_matrix(
+        y_test: pd.DataFrame, y_test_predict: pd.DataFrame, name_column: str, graph_name: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str
+    ) -> None:
         """Plot the confusion matrix of the model."""
-        print("-----* Confusion Matrix *-----")
-        data = plot_confusion_matrix(y_test, y_test_predict, trained_model)
-        save_fig(f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path)
+        print(f"-----* {graph_name} *-----")
+        data = plot_confusion_matrix(y_test, y_test_predict, trained_model, graph_name)
+        save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
         index = [f"true_{i}" for i in range(int(y_test.nunique().values))]
         columns = [f"pred_{i}" for i in range(int(y_test.nunique().values))]
         data = pd.DataFrame(data, columns=columns, index=index)
-        save_data(data, name_column, f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path, True)
+        save_data(data, name_column, f"{graph_name} - {algorithm_name}", local_path, mlflow_path, True)
 
     @staticmethod
     def _plot_precision_recall(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
@@ -192,29 +194,29 @@ def _plot_precision_recall_threshold(
         save_data(thresholds, name_column, f"{graph_name} - Thresholds", local_path, mlflow_path)
 
     @staticmethod
-    def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
-        print("-----* ROC Curve *-----")
-        y_probs, fpr, tpr, thresholds = plot_ROC(X_test, y_test, trained_model, algorithm_name)
-        save_fig(f"ROC Curve - {algorithm_name}", local_path, mlflow_path)
+    def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+        print(f"-----* {graph_name} *-----")
+        y_probs, fpr, tpr, thresholds = plot_ROC(X_test, y_test, trained_model, graph_name, algorithm_name)
+        save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
         y_probs = pd.DataFrame(y_probs, columns=["Probabilities"])
         fpr = pd.DataFrame(fpr, columns=["False Positive Rate"])
         tpr = pd.DataFrame(tpr, columns=["True Positive Rate"])
         thresholds = pd.DataFrame(thresholds, columns=["Thresholds"])
-        save_data(y_probs, name_column, "ROC Curve - Probabilities", local_path, mlflow_path)
-        save_data(fpr, name_column, "ROC Curve - False Positive Rate", local_path, mlflow_path)
-        save_data(tpr, name_column, "ROC Curve - True Positive Rate", local_path, mlflow_path)
-        save_data(thresholds, name_column, "ROC Curve - Thresholds", local_path, mlflow_path)
+        save_data(y_probs, name_column, f"{graph_name} - Probabilities", local_path, mlflow_path)
+        save_data(fpr, name_column, f"{graph_name} - False Positive Rate", local_path, mlflow_path)
+        save_data(tpr, name_column, f"{graph_name} - True Positive Rate", local_path, mlflow_path)
+        save_data(thresholds, name_column, f"{graph_name} - Thresholds", local_path, mlflow_path)
 
     @staticmethod
     def _plot_2d_decision_boundary(
-        X: pd.DataFrame, X_test: pd.DataFrame, name_column1: str, name_column2: str, trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str
+        X: pd.DataFrame, X_test: pd.DataFrame, name_column1: str, name_column2: str, trained_model: object, graph_name: str, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str
     ) -> None:
         """Plot the decision boundary of the trained model with the testing data set below."""
-        print("-----* Two-dimensional Decision Boundary Diagram *-----")
+        print(f"-----* {graph_name} *-----")
         plot_2d_decision_boundary(X, X_test, trained_model, image_config)
-        save_fig(f"Decision Boundary - {algorithm_name}", local_path, mlflow_path)
-        save_data(X, name_column1, "Decision Boundary - X", local_path, mlflow_path)
-        save_data(X_test, name_column2, "Decision Boundary - X Test", local_path, mlflow_path)
+        save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
+        save_data(X, name_column1, f"{graph_name} - X", local_path, mlflow_path)
+        save_data(X_test, name_column2, f"{graph_name} - X Test", local_path, mlflow_path)
 
     @staticmethod
     def sample_balance(X_train: pd.DataFrame, y_train: pd.DataFrame, name_column: str, local_path: str, mlflow_path: str) -> tuple:
@@ -286,6 +288,7 @@ def common_components(self) -> None:
             trained_model=self.model,
             X_train=ClassificationWorkflowBase.X_train,
             y_train=ClassificationWorkflowBase.y_train,
+            graph_name=ClassificationCommonFunction.CROSS_VALIDATION.value,
             average=average,
             cv_num=10,
             algorithm_name=self.naming,
@@ -296,6 +299,7 @@ def common_components(self) -> None:
             y_test_predict=ClassificationWorkflowBase.y_test_predict,
             name_column=ClassificationWorkflowBase.name_test,
             trained_model=self.model,
+            graph_name=ClassificationCommonFunction.CONFUSION_MATRIX.value,
             algorithm_name=self.naming,
             local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
             mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -326,6 +330,7 @@ def common_components(self) -> None:
             y_test=ClassificationWorkflowBase.y_test,
             name_column=ClassificationWorkflowBase.name_test,
             trained_model=self.model,
+            graph_name=ClassificationCommonFunction.ROC_CURVE.value,
             algorithm_name=self.naming,
             local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
             mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -348,6 +353,7 @@ def common_components(self) -> None:
             name_column2=ClassificationWorkflowBase.name_test,
             trained_model=self.model,
             image_config=self.image_config,
+            graph_name=ClassificationCommonFunction.TWO_DIMENSIONAL_DECISION_BOUNDARY_DIAGRAM.value,
             algorithm_name=self.naming,
             local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
             mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -374,6 +380,7 @@ def common_components(self, is_automl: bool) -> None:
             trained_model=self.auto_model,
             X_train=ClassificationWorkflowBase.X_train,
             y_train=ClassificationWorkflowBase.y_train,
+            graph_name=ClassificationCommonFunction.CROSS_VALIDATION.value,
             average=average,
             cv_num=10,
             algorithm_name=self.naming,
@@ -384,6 +391,7 @@ def common_components(self, is_automl: bool) -> None:
             y_test_predict=ClassificationWorkflowBase.y_test_predict,
             name_column=ClassificationWorkflowBase.name_test,
             trained_model=self.auto_model,
+            graph_name=ClassificationCommonFunction.CONFUSION_MATRIX.value,
             algorithm_name=self.naming,
             local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
             mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -414,6 +422,7 @@ def common_components(self, is_automl: bool) -> None:
             y_test=ClassificationWorkflowBase.y_test,
             name_column=ClassificationWorkflowBase.name_test,
             trained_model=self.auto_model,
+            graph_name=ClassificationCommonFunction.ROC_CURVE.value,
             algorithm_name=self.naming,
             local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
             mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -436,6 +445,7 @@ def common_components(self, is_automl: bool) -> None:
             name_column2=ClassificationWorkflowBase.name_test,
             trained_model=self.auto_model,
             image_config=self.image_config,
+            graph_name=ClassificationCommonFunction.TWO_DIMENSIONAL_DECISION_BOUNDARY_DIAGRAM.value,
             algorithm_name=self.naming,
             local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
             mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_common.py b/geochemistrypi/data_mining/model/func/algo_classification/_common.py
index 8d2058c..78031b8 100644
--- a/geochemistrypi/data_mining/model/func/algo_classification/_common.py
+++ b/geochemistrypi/data_mining/model/func/algo_classification/_common.py
@@ -68,7 +68,7 @@ def score(y_true: pd.DataFrame, y_predict: pd.DataFrame) -> tuple[str, Dict]:
     return average, scores
 
 
-def plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, trained_model: object) -> np.ndarray:
+def plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, trained_model: object, graph_name: str) -> np.ndarray:
     """Plot the confusion matrix.
 
     Parameters
@@ -124,7 +124,7 @@ def display_cross_validation_scores(scores: np.ndarray, score_name: str) -> Dict
     return cv_scores
 
 
-def cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, average: str, cv_num: int = 10) -> Dict:
+def cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, graph_name: str, average: str, cv_num: int = 10) -> Dict:
     """Evaluate metric(s) by cross-validation and also record fit/score times.
 
     Parameters
@@ -286,7 +286,7 @@ def plot_precision_recall_threshold(X_test: pd.DataFrame, y_test: pd.DataFrame,
     return y_probs, precisions, recalls, thresholds
 
 
-def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, algorithm_name: str) -> tuple:
+def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, graph_name: str, algorithm_name: str) -> tuple:
     """Plot the ROC curve.
 
     Parameters
@@ -324,7 +324,7 @@ def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object,
     plt.plot([0, 1], [0, 1], "r--")
     plt.xlabel("False Positive Rate")
     plt.ylabel("True Positive Rate (Recall)")
-    plt.title(f"ROC Curve - {algorithm_name}")
+    plt.title(f"{graph_name} - {algorithm_name}")
 
     return y_probs, fpr, tpr, thresholds
 
diff --git a/geochemistrypi/data_mining/utils/base.py b/geochemistrypi/data_mining/utils/base.py
index 84b3965..9bb338d 100644
--- a/geochemistrypi/data_mining/utils/base.py
+++ b/geochemistrypi/data_mining/utils/base.py
@@ -216,7 +216,7 @@ def save_data(df: pd.DataFrame, name_column: str, df_name: str, local_data_path:
         Whether to write the index.
     """
     if name_column is not None and len(df) == len(name_column):
-        name_column = name_column.loc[df.index].reset_index(drop=True)
+        # name_column = name_column.loc[df.index].reset_index(drop=True)
         df.reset_index(drop=True, inplace=True)
         name_column.reset_index(drop=True, inplace=True)
         df = pd.concat([name_column, df], axis=1)
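
Note on the enum referenced above: the new keyword arguments pass
ClassificationCommonFunction.<MEMBER>.value, an enum defined elsewhere in the
package and not included in this diff. A minimal sketch of what such an enum
plausibly looks like, with member values inferred from the hard-coded print
banners this patch replaces (the exact definition and its location are
assumptions, not part of the patch):

    from enum import Enum

    class ClassificationCommonFunction(Enum):
        # Hypothetical reconstruction: the values mirror the banner strings
        # that the patch swaps out for f"-----* {graph_name} *-----".
        CROSS_VALIDATION = "Cross Validation"
        CONFUSION_MATRIX = "Confusion Matrix"
        ROC_CURVE = "ROC Curve"
        TWO_DIMENSIONAL_DECISION_BOUNDARY_DIAGRAM = "Two-dimensional Decision Boundary Diagram"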
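
Because graph_name is inserted positionally before average in cross_validation
(and before algorithm_name in plot_ROC), any caller passing arguments
positionally must be updated along with this patch. A minimal usage sketch
against the new cross_validation signature, on hypothetical toy data (the
average value here is an assumption, matching what score() returns for a
binary target):

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    from geochemistrypi.data_mining.model.func.algo_classification._common import cross_validation

    # Toy binary-classification data standing in for a real training set.
    X, y = make_classification(n_samples=100, n_features=4, random_state=42)
    X_train, y_train = pd.DataFrame(X), pd.DataFrame(y)
    model = LogisticRegression().fit(X_train, y_train.values.ravel())

    # graph_name is now the fourth positional argument, ahead of the
    # keyword arguments average and cv_num.
    scores = cross_validation(model, X_train, y_train, "Cross Validation", average="binary", cv_num=5)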