From c166ace82bb5f13e09cc5e9a4c74c5ddbc9fa29c Mon Sep 17 00:00:00 2001
From: rileyh
Date: Wed, 11 Dec 2024 13:55:01 -0600
Subject: [PATCH] [#179] Create a new core.model_metrics module and move _calc_mcc() there

---
 hlink/linking/core/model_metrics.py           | 20 +++++++++++++++++++
 .../link_step_train_test_models.py            | 20 ++++---------------
 2 files changed, 24 insertions(+), 16 deletions(-)
 create mode 100644 hlink/linking/core/model_metrics.py

diff --git a/hlink/linking/core/model_metrics.py b/hlink/linking/core/model_metrics.py
new file mode 100644
index 0000000..7222c55
--- /dev/null
+++ b/hlink/linking/core/model_metrics.py
@@ -0,0 +1,20 @@
+# This file is part of the ISRDI's hlink.
+# For copyright and licensing information, see the NOTICE and LICENSE files
+# in this project's top-level directory, and also on-line at:
+# https://github.com/ipums/hlink
+import math
+
+
+def mcc(tp: int, tn: int, fp: int, fn: int) -> float:
+    """
+    Given the counts of true positives (tp), true negatives (tn), false
+    positives (fp), and false negatives (fn) for a model run, compute the
+    Matthews Correlation Coefficient (MCC).
+
+    Return 0.0 when any marginal total of the confusion matrix is zero,
+    since the MCC's denominator is zero and the coefficient is undefined.
+    """
+    denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
+    if denominator == 0:
+        return 0.0
+    return ((tp * tn) - (fp * fn)) / denominator
diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index 4693b9a..4498ed1 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -23,6 +23,7 @@
 from pyspark.sql import DataFrame
 from pyspark.sql.functions import col, count, count_if, mean
 from functools import reduce
+import hlink.linking.core.model_metrics as metrics_core
 import hlink.linking.core.threshold as threshold_core
 import hlink.linking.core.classifier as classifier_core
 
@@ -690,21 +691,6 @@ def _save_training_results(
         # )
 
 
-def _calc_mcc(tp: int, tn: int, fp: int, fn: int) -> float:
-    """
-    Given the counts of true positives (tp), true negatives (tn), false
-    positives (fp), and false negatives (fn) for a model run, compute the
-    Matthews Correlation Coefficient (MCC).
-    """
-    if (math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))) != 0:
-        mcc = ((tp * tn) - (fp * fn)) / (
-            math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
-        )
-    else:
-        mcc = 0
-    return mcc
-
-
 def _calc_threshold_matrix(
     alpha_threshold: float | list[float], threshold_ratio: float | list[float] | None
 ) -> list[list[float]]:
@@ -796,7 +782,9 @@ def _get_aggregate_metrics(
         recall = np.nan
     else:
         recall = true_positives / (true_positives + false_negatives)
-    mcc = _calc_mcc(true_positives, true_negatives, false_positives, false_negatives)
+    mcc = metrics_core.mcc(
+        true_positives, true_negatives, false_positives, false_negatives
+    )
     return precision, recall, mcc
 
 