From f0833781d0205f4989005b5ef19ada3ac24caf8f Mon Sep 17 00:00:00 2001
From: Colin Davis <colin.c.davis@gmail.com>
Date: Tue, 10 Dec 2024 11:25:45 -0600
Subject: [PATCH] Tests pass

---
 .../link_step_train_test_models.py            | 26 ++++++++++++++++---
 hlink/tests/model_exploration_test.py         | 12 ++++++---
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index a2e65c5..070c1da 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -975,7 +975,7 @@ def _combine_by_threshold_matrix_entry(
     threshold_results: list[dict[int, ThresholdTestResult]],
 ) -> list[ThresholdTestResult]:
     # This list will have a size of the number of threshold matrix entries
-    results: list[ThresholdTestResult] = []
+    results: list[list[ThresholdTestResult]] = []
 
     # Check number of folds
     if len(threshold_results) < 2:
@@ -1027,15 +1027,35 @@ def _aggregate_per_threshold_results(
     pr_auc_test_sd = statistics.stdev(pr_auc_test) if len(pr_auc_test) > 1 else np.nan
     mcc_test_sd = statistics.stdev(mcc_test) if len(mcc_test) > 1 else np.nan
 
+    # Deal with tiny test data. This should never arise in practice but if it did we ought
+    # to issue a warning.
+    if len(precision_test) < 1:
+        #        raise RuntimeError("Not enough training data to get any valid precision values.")
+        precision_test_mean = np.nan
+    else:
+        precision_test_mean = (
+            statistics.mean(precision_test)
+            if len(precision_test) > 1
+            else precision_test[0]
+        )
+
+    if len(recall_test) < 1:
+        # raise RuntimeError("Not enough training data to get any valid recall values.")
+        recall_test_mean = np.nan
+    else:
+        recall_test_mean = (
+            statistics.mean(recall_test) if len(recall_test) > 1 else recall_test[0]
+        )
+
     new_desc = pd.DataFrame(
         {
             "model": [best_models[0].model_type],
             "parameters": [best_models[0].hyperparams],
             "alpha_threshold": [alpha_threshold],
             "threshold_ratio": [threshold_ratio],
-            "precision_test_mean": [statistics.mean(precision_test)],
+            "precision_test_mean": [precision_test_mean],
             "precision_test_sd": [precision_test_sd],
-            "recall_test_mean": [statistics.mean(recall_test)],
+            "recall_test_mean": [recall_test_mean],
             "recall_test_sd": [recall_test_sd],
             "pr_auc_test_mean": [statistics.mean(pr_auc_test)],
             "pr_auc_test_sd": [pr_auc_test_sd],
diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
index 30bca92..46166c5 100644
--- a/hlink/tests/model_exploration_test.py
+++ b/hlink/tests/model_exploration_test.py
@@ -684,7 +684,6 @@ def test_step_2_train_random_forest_spark(
             "featureSubsetStrategy": "sqrt",
         }
     ]
-    feature_conf["training"]["output_suspicious_TD"] = True
     feature_conf["training"]["n_training_iterations"] = 3
 
     model_exploration.run_step(0)
@@ -694,9 +693,12 @@ def test_step_2_train_random_forest_spark(
     tr = spark.table("model_eval_training_results").toPandas()
     print(f"training results {tr}")
     # assert tr.shape == (1, 18)
-    assert tr.query("model == 'random_forest'")["pr_auc_mean"].iloc[0] > 2.0 / 3.0
+    assert tr.query("model == 'random_forest'")["pr_auc_test_mean"].iloc[0] > 2.0 / 3.0
     assert tr.query("model == 'random_forest'")["maxDepth"].iloc[0] == 3
 
+    # TODO probably remove these since we're not planning to test suspicious data anymore.
+    # I disabled the saving of suspicious in this test config so these are invalid currently.
+    """
     FNs = spark.table("model_eval_repeat_fns").toPandas()
     assert FNs.shape == (3, 4)
     assert FNs.query("id_a == 30")["count"].iloc[0] == 3
@@ -706,6 +708,7 @@ def test_step_2_train_random_forest_spark(
 
     TNs = spark.table("model_eval_repeat_tns").toPandas()
     assert TNs.shape == (6, 4)
+    """
 
     main.do_drop_all("")
 
@@ -717,18 +720,19 @@ def test_step_2_train_logistic_regression_spark(
     feature_conf["training"]["model_parameters"] = [
         {"type": "logistic_regression", "threshold": 0.7}
     ]
-    feature_conf["training"]["n_training_iterations"] = 4
+    feature_conf["training"]["n_training_iterations"] = 3
 
     model_exploration.run_step(0)
     model_exploration.run_step(1)
     model_exploration.run_step(2)
 
     tr = spark.table("model_eval_training_results").toPandas()
+    # assert tr.count == 3
 
     assert tr.shape == (1, 11)
     # This is now 0.83333333333.... I'm not sure it's worth testing against
     # assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] == 0.75
-    assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] > 0.74
+    assert tr.query("model == 'logistic_regression'")["pr_auc_test_mean"].iloc[0] > 0.74
     assert (
         round(tr.query("model == 'logistic_regression'")["alpha_threshold"].iloc[0], 1)
         == 0.7