From c9791880a0fdab3201144e6d7007a1961a580160 Mon Sep 17 00:00:00 2001
From: rileyh
Date: Tue, 8 Oct 2024 21:15:40 +0000
Subject: [PATCH] [#154] Rework the
 model_exploration.link_step_train_test_models output

The "Evaluating model performance..." print statements are too chatty for
runs with a large threshold matrix.
---
 .../link_step_train_test_models.py           | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index 570aade..b694980 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -74,9 +74,9 @@ def _run(self) -> None:
             f"each of these has {n_training_iterations} train-test splits to test on"
         )
         for run_index, run in enumerate(model_parameters, 1):
-            logger.info(
-                f"Starting run {run_index} of {len(model_parameters)} with these parameters: {run}"
-            )
+            run_start_info = f"Starting run {run_index} of {len(model_parameters)} with these parameters: {run}"
+            print(run_start_info)
+            logger.info(run_start_info)
             params = run.copy()
             model_type = params.pop("type")
 
@@ -103,9 +103,9 @@ def _run(self) -> None:
 
             first = True
             for split_index, (training_data, test_data) in enumerate(splits, 1):
-                logger.debug(
-                    f"Training and testing the model on train-test split {split_index} of {n_training_iterations}"
-                )
+                split_start_info = f"Training and testing the model on train-test split {split_index} of {n_training_iterations}"
+                print(split_start_info)
+                logger.debug(split_start_info)
                 training_data.cache()
                 test_data.cache()
 
@@ -139,7 +139,7 @@ def _run(self) -> None:
                 param_text = np.full(precision.shape, f"{model_type}_{params}")
 
                 pr_auc = auc(recall, precision)
-                print(f"Area under PR curve: {pr_auc}")
+                print(f"The area under the precision-recall curve is {pr_auc}")
 
                 if first:
                     prc = pd.DataFrame(
@@ -287,7 +287,6 @@ def _capture_results(
     ) -> pd.DataFrame:
         table_prefix = self.task.table_prefix
 
-        print("Evaluating model performance...")
         # write to sql tables for testing
         predictions.createOrReplaceTempView(f"{table_prefix}predictions")
         predict_train.createOrReplaceTempView(f"{table_prefix}predict_train")
@@ -596,7 +595,6 @@ def _append_results(
     params: dict[str, Any],
 ) -> pd.DataFrame:
     # run.pop("type")
-    print(results_df)
 
     new_desc = pd.DataFrame(
         {
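
Note for reviewers: the diff above applies one pattern throughout: build the status
message once, then both print it (so interactive runs still get console feedback)
and pass the same string to the logger. Below is a minimal standalone sketch of
that pattern, assuming only Python's standard logging module; the helper name
report_run_start and its arguments are illustrative and not part of hlink.

import logging

logger = logging.getLogger(__name__)

def report_run_start(run_index: int, total_runs: int, params: dict) -> None:
    # Build the message a single time so the console and the log stay in sync.
    run_start_info = (
        f"Starting run {run_index} of {total_runs} with these parameters: {params}"
    )
    print(run_start_info)  # immediate feedback on the console
    logger.info(run_start_info)  # same text captured by the logging handlers

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    report_run_start(1, 3, {"type": "random_forest", "maxDepth": 5})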