From c166ace82bb5f13e09cc5e9a4c74c5ddbc9fa29c Mon Sep 17 00:00:00 2001
From: rileyh <rileyh@umn.edu>
Date: Wed, 11 Dec 2024 13:55:01 -0600
Subject: [PATCH] [#179] Create a new core.model_metrics module and move
 _calc_mcc() there

---
 hlink/linking/core/model_metrics.py           | 20 +++++++++++++++++++
 .../link_step_train_test_models.py            | 20 ++++---------------
 2 files changed, 24 insertions(+), 16 deletions(-)
 create mode 100644 hlink/linking/core/model_metrics.py

diff --git a/hlink/linking/core/model_metrics.py b/hlink/linking/core/model_metrics.py
new file mode 100644
index 0000000..7222c55
--- /dev/null
+++ b/hlink/linking/core/model_metrics.py
@@ -0,0 +1,20 @@
+# This file is part of the ISRDI's hlink.
+# For copyright and licensing information, see the NOTICE and LICENSE files
+# in this project's top-level directory, and also on-line at:
+#   https://github.com/ipums/hlink
+import math
+
+
+def mcc(tp: int, tn: int, fp: int, fn: int) -> float:
+    """
+    Given the counts of true positives (tp), true negatives (tn), false
+    positives (fp), and false negatives (fn) for a model run, compute the
+    Matthews Correlation Coefficient (MCC). Returns 0.0 when the denominator
+    is 0, which happens when any of tp+fp, tp+fn, tn+fp, or tn+fn is 0.
+    """
+    # Compute the denominator once; it is reused for the division below.
+    denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
+    if denominator == 0:
+        # The ratio is undefined here; return a float to honor `-> float`.
+        return 0.0
+    return ((tp * tn) - (fp * fn)) / denominator
diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index 4693b9a..4498ed1 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -23,6 +23,7 @@
 from pyspark.sql import DataFrame
 from pyspark.sql.functions import col, count, count_if, mean
 from functools import reduce
+import hlink.linking.core.model_metrics as metrics_core
 import hlink.linking.core.threshold as threshold_core
 import hlink.linking.core.classifier as classifier_core
 
@@ -690,21 +691,6 @@ def _save_training_results(
             # )
 
 
-def _calc_mcc(tp: int, tn: int, fp: int, fn: int) -> float:
-    """
-    Given the counts of true positives (tp), true negatives (tn), false
-    positives (fp), and false negatives (fn) for a model run, compute the
-    Matthews Correlation Coefficient (MCC).
-    """
-    if (math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))) != 0:
-        mcc = ((tp * tn) - (fp * fn)) / (
-            math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
-        )
-    else:
-        mcc = 0
-    return mcc
-
-
 def _calc_threshold_matrix(
     alpha_threshold: float | list[float], threshold_ratio: float | list[float] | None
 ) -> list[list[float]]:
@@ -796,7 +782,9 @@ def _get_aggregate_metrics(
         recall = np.nan
     else:
         recall = true_positives / (true_positives + false_negatives)
-    mcc = _calc_mcc(true_positives, true_negatives, false_positives, false_negatives)
+    mcc = metrics_core.mcc(
+        true_positives, true_negatives, false_positives, false_negatives
+    )
     return precision, recall, mcc