From 6c1f66bb594d1ebb4441d381854b173da23ea51d Mon Sep 17 00:00:00 2001
From: Haibin <watsonwang00@gmail.com>
Date: Mon, 16 Sep 2024 01:19:10 +1000
Subject: [PATCH 1/4] perf: pass graph_name to classification common functions

---
 .../data_mining/model/classification.py       | 40 +++++++++----------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py
index 8af681f..63ab736 100644
--- a/geochemistrypi/data_mining/model/classification.py
+++ b/geochemistrypi/data_mining/model/classification.py
@@ -144,24 +144,24 @@ def _classification_report(y_true: pd.DataFrame, y_predict: pd.DataFrame, algori
         mlflow.log_artifact(os.path.join(store_path, f"Classification Report - {algorithm_name}.txt"))
 
     @staticmethod
-    def _cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, average: str, cv_num: int, algorithm_name: str, store_path: str) -> None:
+    def _cross_validation(trained_model: object, X_train: pd.DataFrame, graph_name: str, y_train: pd.DataFrame, average: str, cv_num: int, algorithm_name: str, store_path: str) -> None:
         """Perform cross validation on the model."""
-        print("-----* Cross Validation *-----")
+        print(f"-----* {graph_name} *-----")
         print(f"K-Folds: {cv_num}")
         scores = cross_validation(trained_model, X_train, y_train, average=average, cv_num=cv_num)
         scores_str = json.dumps(scores, indent=4)
-        save_text(scores_str, f"Cross Validation - {algorithm_name}", store_path)
+        save_text(scores_str, f"{graph_name} - {algorithm_name}", store_path)
 
     @staticmethod
-    def _plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, name_column: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+    def _plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, graph_name: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
         """Plot the confusion matrix of the model."""
-        print("-----* Confusion Matrix *-----")
+        print(f"-----* {graph_name} *-----")
         data = plot_confusion_matrix(y_test, y_test_predict, trained_model)
-        save_fig(f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path)
+        save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
         index = [f"true_{i}" for i in range(int(y_test.nunique().values))]
         columns = [f"pred_{i}" for i in range(int(y_test.nunique().values))]
         data = pd.DataFrame(data, columns=columns, index=index)
-        save_data(data, name_column, f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path, True)
+        save_data(data, f"{graph_name} - {algorithm_name}", local_path, mlflow_path, True)
 
     @staticmethod
     def _plot_precision_recall(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
@@ -192,29 +192,27 @@ def _plot_precision_recall_threshold(
         save_data(thresholds, name_column, f"{graph_name} - Thresholds", local_path, mlflow_path)
 
     @staticmethod
-    def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
-        print("-----* ROC Curve *-----")
+    def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+        print(f"-----* {graph_name} *-----")
         y_probs, fpr, tpr, thresholds = plot_ROC(X_test, y_test, trained_model, algorithm_name)
-        save_fig(f"ROC Curve - {algorithm_name}", local_path, mlflow_path)
+        save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
         y_probs = pd.DataFrame(y_probs, columns=["Probabilities"])
         fpr = pd.DataFrame(fpr, columns=["False Positive Rate"])
         tpr = pd.DataFrame(tpr, columns=["True Positive Rate"])
         thresholds = pd.DataFrame(thresholds, columns=["Thresholds"])
-        save_data(y_probs, name_column, "ROC Curve - Probabilities", local_path, mlflow_path)
-        save_data(fpr, name_column, "ROC Curve - False Positive Rate", local_path, mlflow_path)
-        save_data(tpr, name_column, "ROC Curve - True Positive Rate", local_path, mlflow_path)
-        save_data(thresholds, name_column, "ROC Curve - Thresholds", local_path, mlflow_path)
+        save_data(y_probs, f"{graph_name} - Probabilities", local_path, mlflow_path)
+        save_data(fpr, f"{graph_name} - False Positive Rate", local_path, mlflow_path)
+        save_data(tpr, f"{graph_name} - True Positive Rate", local_path, mlflow_path)
+        save_data(thresholds, f"{graph_name} - Thresholds", local_path, mlflow_path)
 
     @staticmethod
-    def _plot_2d_decision_boundary(
-        X: pd.DataFrame, X_test: pd.DataFrame, name_column1: str, name_column2: str, trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str
-    ) -> None:
+    def _plot_2d_decision_boundary(X: pd.DataFrame, X_test: pd.DataFrame, trained_model: object, graph_name: str, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
         """Plot the decision boundary of the trained model with the testing data set below."""
-        print("-----* Two-dimensional Decision Boundary Diagram *-----")
+        print(f"-----* {graph_name} *-----")
         plot_2d_decision_boundary(X, X_test, trained_model, image_config)
-        save_fig(f"Decision Boundary - {algorithm_name}", local_path, mlflow_path)
-        save_data(X, name_column1, "Decision Boundary - X", local_path, mlflow_path)
-        save_data(X_test, name_column2, "Decision Boundary - X Test", local_path, mlflow_path)
+        save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
+        save_data(X, f"{graph_name} - X", local_path, mlflow_path)
+        save_data(X_test, f"{graph_name} - X Test", local_path, mlflow_path)
 
     @staticmethod
     def sample_balance(X_train: pd.DataFrame, y_train: pd.DataFrame, name_column: str, local_path: str, mlflow_path: str) -> tuple:

From c4c4bca7dbb01deb5f3782b7390c9b55f0a98de6 Mon Sep 17 00:00:00 2001
From: Haibin <watsonwang00@gmail.com>
Date: Mon, 16 Sep 2024 01:02:37 +1000
Subject: [PATCH 2/4] perf: move classification common function output names
 to enum

---
 geochemistrypi/data_mining/model/classification.py | 14 +++++++++++---
 .../model/func/algo_classification/_common.py      |  8 ++++----
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py
index 63ab736..30a18e7 100644
--- a/geochemistrypi/data_mining/model/classification.py
+++ b/geochemistrypi/data_mining/model/classification.py
@@ -148,7 +148,7 @@ def _cross_validation(trained_model: object, X_train: pd.DataFrame, graph_name:
         """Perform cross validation on the model."""
         print(f"-----* {graph_name} *-----")
         print(f"K-Folds: {cv_num}")
-        scores = cross_validation(trained_model, X_train, y_train, average=average, cv_num=cv_num)
+        scores = cross_validation(trained_model, X_train, y_train, graph_name, average=average, cv_num=cv_num)
         scores_str = json.dumps(scores, indent=4)
         save_text(scores_str, f"{graph_name} - {algorithm_name}", store_path)
 
@@ -156,7 +156,7 @@ def _cross_validation(trained_model: object, X_train: pd.DataFrame, graph_name:
     def _plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, graph_name: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
         """Plot the confusion matrix of the model."""
         print(f"-----* {graph_name} *-----")
-        data = plot_confusion_matrix(y_test, y_test_predict, trained_model)
+        data = plot_confusion_matrix(y_test, y_test_predict, trained_model, graph_name)
         save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
         index = [f"true_{i}" for i in range(int(y_test.nunique().values))]
         columns = [f"pred_{i}" for i in range(int(y_test.nunique().values))]
@@ -194,7 +194,7 @@ def _plot_precision_recall_threshold(
     @staticmethod
     def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
         print(f"-----* {graph_name} *-----")
-        y_probs, fpr, tpr, thresholds = plot_ROC(X_test, y_test, trained_model, algorithm_name)
+        y_probs, fpr, tpr, thresholds = plot_ROC(X_test, y_test, trained_model, graph_name, algorithm_name)
         save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
         y_probs = pd.DataFrame(y_probs, columns=["Probabilities"])
         fpr = pd.DataFrame(fpr, columns=["False Positive Rate"])
@@ -284,6 +284,7 @@ def common_components(self) -> None:
             trained_model=self.model,
             X_train=ClassificationWorkflowBase.X_train,
             y_train=ClassificationWorkflowBase.y_train,
+            graph_name=ClassificationCommonFunction.CROSS_VALIDATION.value,
             average=average,
             cv_num=10,
             algorithm_name=self.naming,
@@ -294,6 +295,7 @@ def common_components(self) -> None:
             y_test_predict=ClassificationWorkflowBase.y_test_predict,
             name_column=ClassificationWorkflowBase.name_test,
             trained_model=self.model,
+            graph_name=ClassificationCommonFunction.CONFUSION_MATRIX.value,
             algorithm_name=self.naming,
             local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
             mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -324,6 +326,7 @@ def common_components(self) -> None:
                 y_test=ClassificationWorkflowBase.y_test,
                 name_column=ClassificationWorkflowBase.name_test,
                 trained_model=self.model,
+                graph_name=ClassificationCommonFunction.ROC_CURVE.value,
                 algorithm_name=self.naming,
                 local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
                 mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -346,6 +349,7 @@ def common_components(self) -> None:
                 name_column2=ClassificationWorkflowBase.name_test,
                 trained_model=self.model,
                 image_config=self.image_config,
+                graph_name=ClassificationCommonFunction.TWO_DIMENSIONAL_DECISION_BOUNDARY_DIAGRAM.value,
                 algorithm_name=self.naming,
                 local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
                 mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -372,6 +376,7 @@ def common_components(self, is_automl: bool) -> None:
             trained_model=self.auto_model,
             X_train=ClassificationWorkflowBase.X_train,
             y_train=ClassificationWorkflowBase.y_train,
+            graph_name=ClassificationCommonFunction.CROSS_VALIDATION.value,
             average=average,
             cv_num=10,
             algorithm_name=self.naming,
@@ -382,6 +387,7 @@ def common_components(self, is_automl: bool) -> None:
             y_test_predict=ClassificationWorkflowBase.y_test_predict,
             name_column=ClassificationWorkflowBase.name_test,
             trained_model=self.auto_model,
+            graph_name=ClassificationCommonFunction.CONFUSION_MATRIX.value,
             algorithm_name=self.naming,
             local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
             mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -412,6 +418,7 @@ def common_components(self, is_automl: bool) -> None:
                 y_test=ClassificationWorkflowBase.y_test,
                 name_column=ClassificationWorkflowBase.name_test,
                 trained_model=self.auto_model,
+                graph_name=ClassificationCommonFunction.ROC_CURVE.value,
                 algorithm_name=self.naming,
                 local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
                 mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
@@ -434,6 +441,7 @@ def common_components(self, is_automl: bool) -> None:
                 name_column2=ClassificationWorkflowBase.name_test,
                 trained_model=self.auto_model,
                 image_config=self.image_config,
+                graph_name=ClassificationCommonFunction.TWO_DIMENSIONAL_DECISION_BOUNDARY_DIAGRAM.value,
                 algorithm_name=self.naming,
                 local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
                 mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_common.py b/geochemistrypi/data_mining/model/func/algo_classification/_common.py
index 8d2058c..78031b8 100644
--- a/geochemistrypi/data_mining/model/func/algo_classification/_common.py
+++ b/geochemistrypi/data_mining/model/func/algo_classification/_common.py
@@ -68,7 +68,7 @@ def score(y_true: pd.DataFrame, y_predict: pd.DataFrame) -> tuple[str, Dict]:
     return average, scores
 
 
-def plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, trained_model: object) -> np.ndarray:
+def plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, trained_model: object, graph_name: str) -> np.ndarray:
     """Plot the confusion matrix.
 
     Parameters
@@ -124,7 +124,7 @@ def display_cross_validation_scores(scores: np.ndarray, score_name: str) -> Dict
     return cv_scores
 
 
-def cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, average: str, cv_num: int = 10) -> Dict:
+def cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, graph_name: str, average: str, cv_num: int = 10) -> Dict:
     """Evaluate metric(s) by cross-validation and also record fit/score times.
 
     Parameters
@@ -286,7 +286,7 @@ def plot_precision_recall_threshold(X_test: pd.DataFrame, y_test: pd.DataFrame,
     return y_probs, precisions, recalls, thresholds
 
 
-def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, algorithm_name: str) -> tuple:
+def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, graph_name: str, algorithm_name: str) -> tuple:
     """Plot the ROC curve.
 
     Parameters
@@ -324,7 +324,7 @@ def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object,
     plt.plot([0, 1], [0, 1], "r--")
     plt.xlabel("False Positive Rate")
     plt.ylabel("True Positive Rate (Recall)")
-    plt.title(f"ROC Curve - {algorithm_name}")
+    plt.title(f"{graph_name} - {algorithm_name}")
     return y_probs, fpr, tpr, thresholds
 
 

From e5f518b19351dbc8acb3e8f92be3c5aa9d5f39aa Mon Sep 17 00:00:00 2001
From: Haibin <watsonwang00@gmail.com>
Date: Fri, 20 Sep 2024 00:03:30 +1000
Subject: [PATCH 3/4] perf: rename common function name to enum/fix some bugs

---
 .../data_mining/model/classification.py       | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py
index 30a18e7..4870d2b 100644
--- a/geochemistrypi/data_mining/model/classification.py
+++ b/geochemistrypi/data_mining/model/classification.py
@@ -153,7 +153,9 @@ def _cross_validation(trained_model: object, X_train: pd.DataFrame, graph_name:
         save_text(scores_str, f"{graph_name} - {algorithm_name}", store_path)
 
     @staticmethod
-    def _plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, graph_name: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+    def _plot_confusion_matrix(
+        y_test: pd.DataFrame, y_test_predict: pd.DataFrame, name_column: str, graph_name: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str
+    ) -> None:
         """Plot the confusion matrix of the model."""
         print(f"-----* {graph_name} *-----")
         data = plot_confusion_matrix(y_test, y_test_predict, trained_model, graph_name)
@@ -161,7 +163,7 @@ def _plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, g
         index = [f"true_{i}" for i in range(int(y_test.nunique().values))]
         columns = [f"pred_{i}" for i in range(int(y_test.nunique().values))]
         data = pd.DataFrame(data, columns=columns, index=index)
-        save_data(data, f"{graph_name} - {algorithm_name}", local_path, mlflow_path, True)
+        save_data(data, name_column, f"{graph_name} - {algorithm_name}", local_path, mlflow_path, True)
 
     @staticmethod
     def _plot_precision_recall(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
@@ -192,7 +194,7 @@ def _plot_precision_recall_threshold(
         save_data(thresholds, name_column, f"{graph_name} - Thresholds", local_path, mlflow_path)
 
     @staticmethod
-    def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+    def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
         print(f"-----* {graph_name} *-----")
         y_probs, fpr, tpr, thresholds = plot_ROC(X_test, y_test, trained_model, graph_name, algorithm_name)
         save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
@@ -200,19 +202,21 @@ def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object,
         fpr = pd.DataFrame(fpr, columns=["False Positive Rate"])
         tpr = pd.DataFrame(tpr, columns=["True Positive Rate"])
         thresholds = pd.DataFrame(thresholds, columns=["Thresholds"])
-        save_data(y_probs, f"{graph_name} - Probabilities", local_path, mlflow_path)
-        save_data(fpr, f"{graph_name} - False Positive Rate", local_path, mlflow_path)
-        save_data(tpr, f"{graph_name} - True Positive Rate", local_path, mlflow_path)
-        save_data(thresholds, f"{graph_name} - Thresholds", local_path, mlflow_path)
+        save_data(y_probs, name_column, f"{graph_name} - Probabilities", local_path, mlflow_path)
+        save_data(fpr, name_column, f"{graph_name} - False Positive Rate", local_path, mlflow_path)
+        save_data(tpr, name_column, f"{graph_name} - True Positive Rate", local_path, mlflow_path)
+        save_data(thresholds, name_column, f"{graph_name} - Thresholds", local_path, mlflow_path)
 
     @staticmethod
-    def _plot_2d_decision_boundary(X: pd.DataFrame, X_test: pd.DataFrame, trained_model: object, graph_name: str, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+    def _plot_2d_decision_boundary(
+        X: pd.DataFrame, X_test: pd.DataFrame, name_column1: str, name_column2: str, trained_model: object, graph_name: str, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str
+    ) -> None:
         """Plot the decision boundary of the trained model with the testing data set below."""
         print(f"-----* {graph_name} *-----")
         plot_2d_decision_boundary(X, X_test, trained_model, image_config)
         save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
-        save_data(X, f"{graph_name} - X", local_path, mlflow_path)
-        save_data(X_test, f"{graph_name} - X Test", local_path, mlflow_path)
+        save_data(X, name_column1, f"{graph_name} - X", local_path, mlflow_path)
+        save_data(X_test, name_column2, f"{graph_name} - X Test", local_path, mlflow_path)
 
     @staticmethod
     def sample_balance(X_train: pd.DataFrame, y_train: pd.DataFrame, name_column: str, local_path: str, mlflow_path: str) -> tuple:

From 051eb1466b7be4af16790505315c7a901708ec0c Mon Sep 17 00:00:00 2001
From: Haibin <watsonwang00@gmail.com>
Date: Fri, 20 Sep 2024 00:05:21 +1000
Subject: [PATCH 4/4] perf: rename common function name to enum/fix some bugs

---
 geochemistrypi/data_mining/utils/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/geochemistrypi/data_mining/utils/base.py b/geochemistrypi/data_mining/utils/base.py
index 84b3965..9bb338d 100644
--- a/geochemistrypi/data_mining/utils/base.py
+++ b/geochemistrypi/data_mining/utils/base.py
@@ -216,7 +216,7 @@ def save_data(df: pd.DataFrame, name_column: str, df_name: str, local_data_path:
         Whether to write the index.
     """
     if name_column is not None and len(df) == len(name_column):
-        name_column = name_column.loc[df.index].reset_index(drop=True)
+        # name_column = name_column.loc[df.index].reset_index(drop=True)
         df.reset_index(drop=True, inplace=True)
         name_column.reset_index(drop=True, inplace=True)
         df = pd.concat([name_column, df], axis=1)