From f0ca30925e720f62ec0d077f40c6b8395df6e2bc Mon Sep 17 00:00:00 2001
From: Patrick Bloebaum
Date: Fri, 12 Apr 2024 10:25:32 -0700
Subject: [PATCH] Add new statistical method to merge p-values

This uses an improved version of the "twice the average" rule following recent
results from

M. Gasparini, R. Wang, and A. Ramdas, *Combining exchangeable p-values*,
arXiv 2404.03484, 2024.

This new method is now used by default when merging p-values. Accordingly,
the quantile based method was renamed to be more consistent with the naming
pattern.

Signed-off-by: Patrick Bloebaum
---
 dowhy/gcm/independence_test/kernel.py     |  6 +--
 dowhy/gcm/independence_test/regression.py |  4 +-
 dowhy/gcm/model_evaluation.py             |  4 +-
 dowhy/gcm/stats.py                        | 45 ++++++++++++++++--
 tests/gcm/test_stats.py                   | 58 +++++++++++++++--------
 5 files changed, 86 insertions(+), 31 deletions(-)

diff --git a/dowhy/gcm/independence_test/kernel.py b/dowhy/gcm/independence_test/kernel.py
index 1212d0d8a5..c1db78f38c 100644
--- a/dowhy/gcm/independence_test/kernel.py
+++ b/dowhy/gcm/independence_test/kernel.py
@@ -8,7 +8,7 @@
 
 import dowhy.gcm.config as config
 from dowhy.gcm.independence_test.kernel_operation import approximate_rbf_kernel_features
-from dowhy.gcm.stats import quantile_based_fwer
+from dowhy.gcm.stats import merge_p_values_average
 from dowhy.gcm.util.general import auto_apply_encoders, auto_fit_encoders, set_random_seed, shape_into_2d
 
 
@@ -20,7 +20,7 @@ def kernel_based(
     bootstrap_num_runs: int = 10,
     max_num_samples_run: int = 2000,
     bootstrap_n_jobs: Optional[int] = None,
-    p_value_adjust_func: Callable[[Union[np.ndarray, List[float]]], float] = quantile_based_fwer,
+    p_value_adjust_func: Callable[[Union[np.ndarray, List[float]]], float] = merge_p_values_average,
     **kwargs,
 ) -> float:
     """Prepares the data and uses kernel (conditional) independence test. The independence test estimates a p-value
@@ -124,7 +124,7 @@ def approx_kernel_based(
     bootstrap_num_runs: int = 10,
     bootstrap_num_samples: int = 1000,
     bootstrap_n_jobs: Optional[int] = None,
-    p_value_adjust_func: Callable[[Union[np.ndarray, List[float]]], float] = quantile_based_fwer,
+    p_value_adjust_func: Callable[[Union[np.ndarray, List[float]]], float] = merge_p_values_average,
 ) -> float:
     """Implementation of the Randomized Conditional Independence Test. The independence test estimates a p-value
     for the null hypothesis that X and Y are independent (given Z). Depending whether Z is given, a conditional or
diff --git a/dowhy/gcm/independence_test/regression.py b/dowhy/gcm/independence_test/regression.py
index f10e1aaa4b..2beb322a28 100644
--- a/dowhy/gcm/independence_test/regression.py
+++ b/dowhy/gcm/independence_test/regression.py
@@ -11,7 +11,7 @@
 from sklearn.preprocessing import scale
 
 import dowhy.gcm.config as config
-from dowhy.gcm.stats import estimate_ftest_pvalue, quantile_based_fwer
+from dowhy.gcm.stats import estimate_ftest_pvalue, merge_p_values_average
 from dowhy.gcm.util.general import auto_apply_encoders, auto_fit_encoders, set_random_seed, shape_into_2d
 
 
@@ -21,7 +21,7 @@ def regression_based(
     Z: Optional[np.ndarray] = None,
     max_num_components_all_inputs: int = 40,
     k_folds: int = 3,
-    p_value_adjust_func: Callable[[Union[np.ndarray, List[float]]], float] = quantile_based_fwer,
+    p_value_adjust_func: Callable[[Union[np.ndarray, List[float]]], float] = merge_p_values_average,
     max_samples_per_fold: int = -1,
     n_jobs: Optional[int] = None,
 ) -> float:
diff --git a/dowhy/gcm/model_evaluation.py b/dowhy/gcm/model_evaluation.py
index a80e37132b..09558b6a62 100644
--- a/dowhy/gcm/model_evaluation.py
+++ b/dowhy/gcm/model_evaluation.py
@@ -43,7 +43,7 @@
     create_polynom_logistic_regression_classifier,
 )
 from dowhy.gcm.ml.regression import create_ada_boost_regressor, create_extra_trees_regressor, create_polynom_regressor
-from dowhy.gcm.stats import quantile_based_fwer
+from dowhy.gcm.stats import merge_p_values_average
 from dowhy.gcm.util.general import is_categorical, set_random_seed, shape_into_2d
 from dowhy.graph import get_ordered_predecessors, is_root_node
 
@@ -599,7 +599,7 @@ def _evaluate_invertibility_assumptions(
                     parent_samples[random_indices],
                 )
             )
-        all_pnl_p_values[node] = quantile_based_fwer(tmp_p_values)
+        all_pnl_p_values[node] = merge_p_values_average(tmp_p_values)
 
     if len(all_pnl_p_values) == 0:
         return all_pnl_p_values
diff --git a/dowhy/gcm/stats.py b/dowhy/gcm/stats.py
index bb554f7567..2d778bc108 100644
--- a/dowhy/gcm/stats.py
+++ b/dowhy/gcm/stats.py
@@ -9,14 +9,51 @@
 from dowhy.gcm.util.general import shape_into_2d
 
 
-def quantile_based_fwer(
+def merge_p_values_average(p_values: Union[np.ndarray, List[float]], randomization: bool = False) -> float:
+    """A statistically sound method to merge multiple potentially dependent p-values into one. This is a statistically
+    improved (i.e., more powerful) version of the "twice the average" rule, following Theorem 5.3
+    (second equation, F_UA) in
+
+    M. Gasparini, R. Wang, and A. Ramdas, *Combining exchangeable p-values*, arXiv 2404.03484, 2024
+
+    Note, if randomization is False, we have u = 1 here. Generally, randomization requires fewer assumptions but leads
+    to non-deterministic behavior.
+
+    :param p_values: A list or array of p-values.
+    :param randomization: If True, u is taken uniformly randomly from [0, 1] (non-deterministic). If False, u is set
+        to 1 (deterministic). Randomization is generally more powerful but provides non-deterministic results.
+    :return: A single p-value based on the given p-values.
+ """ + if len(p_values) == 0: + raise ValueError("Given list of p-values is empty!") + + if np.all(np.isnan(p_values)): + return float(np.nan) + + if randomization: + u = float(np.random.uniform(0, 1)) + else: + u = 1 + + p_values = np.array(p_values) + p_values = p_values[~np.isnan(p_values)] + p_values.sort() + + K = len(p_values) + + return min( + 1.0, float(np.min([2 * np.mean(p_values[:m]) / (2 - (K * u / m)) for m in range(1, K + 1) if (K * u / m) < 2])) + ) + + +def merge_p_values_quantile( p_values: Union[np.ndarray, List[float]], p_values_scaling: Optional[np.ndarray] = None, quantile: float = 0.5 ) -> float: - """Applies a quantile based family wise error rate (FWER) control to the given p-values. This is based on the + """Applies a quantile based approach to merge multiple potentially dependent p-values to one. This is based on the approach described in: - Meinshausen, N., Meier, L. and Buehlmann, P. (2009). - p-values for high-dimensional regression. J. Amer. Statist. Assoc.104 1671–1681 + Meinshausen, N., Meier, L. and Buehlmann, P., *p-values for high-dimensional regression*, + J. Amer. Statist. Assoc.104 1671–1681, 2009 :param p_values: A list or array of p-values. :param p_values_scaling: An optional list of scaling factors for each p-value. diff --git a/tests/gcm/test_stats.py b/tests/gcm/test_stats.py index fd6251d384..6b7597f27b 100644 --- a/tests/gcm/test_stats.py +++ b/tests/gcm/test_stats.py @@ -10,7 +10,7 @@ create_linear_regressor, create_logistic_regression_classifier, ) -from dowhy.gcm.stats import estimate_ftest_pvalue, marginal_expectation, quantile_based_fwer +from dowhy.gcm.stats import estimate_ftest_pvalue, marginal_expectation, merge_p_values_average, merge_p_values_quantile from dowhy.gcm.util.general import geometric_median @@ -27,47 +27,65 @@ def test_when_estimate_geometric_median_then_returns_correct_median_vector(): assert gm[1] == approx(-5, abs=0.5) -def test_when_apply_quantile_based_fwer_control_then_returns_single_adjusted_pvalue(): +def test_when_merge_p_values_quantile_then_returns_single_adjusted_pvalue(): p_values = np.array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]) - assert quantile_based_fwer(p_values, quantile=0.5) == 0.055 / 0.5 - assert quantile_based_fwer(p_values, quantile=0.25) == 0.0325 / 0.25 - assert quantile_based_fwer(p_values, quantile=0.75) == 0.0775 / 0.75 + assert merge_p_values_quantile(p_values, quantile=0.5) == 0.055 / 0.5 + assert merge_p_values_quantile(p_values, quantile=0.25) == 0.0325 / 0.25 + assert merge_p_values_quantile(p_values, quantile=0.75) == 0.0775 / 0.75 assert ( - quantile_based_fwer(np.array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 1]), quantile=0.5) + merge_p_values_quantile(np.array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 1]), quantile=0.5) == 0.06 / 0.5 ) - assert quantile_based_fwer(np.array([0.9, 0.95, 1]), quantile=0.5) == 1 - assert quantile_based_fwer(np.array([0, 0, 0]), quantile=0.5) == 0 - assert quantile_based_fwer(np.array([0.33]), quantile=0.5) == 0.33 + assert merge_p_values_quantile(np.array([0.9, 0.95, 1]), quantile=0.5) == 1 + assert merge_p_values_quantile(np.array([0, 0, 0]), quantile=0.5) == 0 + assert merge_p_values_quantile(np.array([0.33]), quantile=0.5) == 0.33 -def test_given_p_values_with_nans_when_using_quantile_based_fwer_then_ignores_the_nan_values(): +def test_given_p_values_with_nans_when_merge_p_values_quantile_then_ignores_the_nan_values(): p_values = np.array([0.01, np.nan, 0.02, 0.03, 0.04, 0.05, np.nan, 0.06, 
0.07, 0.08, 0.09, 0.1]) - assert quantile_based_fwer(p_values, quantile=0.5) == 0.055 / 0.5 + assert merge_p_values_quantile(p_values, quantile=0.5) == 0.055 / 0.5 -def test_given_p_values_with_scaling_when_apply_quantile_based_fwer_control_then_returns_single_adjusted_pvalue(): +def test_given_p_values_with_scaling_when_merge_p_values_quantile_then_returns_single_adjusted_pvalue(): p_values = np.array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]) p_values_scaling = np.array([2, 2, 1, 2, 1, 3, 1, 2, 4, 1]) - assert quantile_based_fwer(p_values, p_values_scaling, quantile=0.5) == approx(0.15) - assert quantile_based_fwer(p_values, p_values_scaling, quantile=0.25) == approx(0.17) - assert quantile_based_fwer(p_values, p_values_scaling, quantile=0.75) == approx(0.193, abs=0.001) + assert merge_p_values_quantile(p_values, p_values_scaling, quantile=0.5) == approx(0.15) + assert merge_p_values_quantile(p_values, p_values_scaling, quantile=0.25) == approx(0.17) + assert merge_p_values_quantile(p_values, p_values_scaling, quantile=0.75) == approx(0.193, abs=0.001) -def test_given_invalid_inputs_when_apply_quantile_based_fwer_control_then_raises_error(): +def test_given_invalid_inputs_when_merge_p_values_quantile_then_raises_error(): with pytest.raises(ValueError): - assert quantile_based_fwer(np.array([0.1, 0.5, 1]), quantile=0) + assert merge_p_values_quantile(np.array([0.1, 0.5, 1]), quantile=0) with pytest.raises(ValueError): - assert quantile_based_fwer(np.array([0.1, 0.5, 1]), np.array([1, 2]), quantile=0.1) + assert merge_p_values_quantile(np.array([0.1, 0.5, 1]), np.array([1, 2]), quantile=0.1) with pytest.raises(ValueError): - assert quantile_based_fwer(np.array([0.1, 0.5, 1]), quantile=1.1) + assert merge_p_values_quantile(np.array([0.1, 0.5, 1]), quantile=1.1) with pytest.raises(ValueError): - assert quantile_based_fwer(np.array([0.1, 0.5, 1]), quantile=-0.5) + assert merge_p_values_quantile(np.array([0.1, 0.5, 1]), quantile=-0.5) + + +def test_when_merge_p_values_average_without_randomization_then_returns_expected_results(): + assert merge_p_values_average([0]) == 0 + assert merge_p_values_average([1]) == 1 + assert merge_p_values_average([0, 1]) == approx(1.0) + assert merge_p_values_average([0, 0, 1]) == 0 + assert merge_p_values_average([0, 0.5, 0.5, np.nan, 1, np.nan]) == approx(1.0) + assert merge_p_values_average([0, 0, 1, 1, 1]) == approx(1.0) + + +@flaky(max_runs=3) +def test_when_merge_p_values_average_with_randomization_then_returns_expected_results(): + assert merge_p_values_average([0], randomization=True) == 0 + assert merge_p_values_average([1], randomization=True) == 1 + assert merge_p_values_average([0, 1], randomization=True) == approx(0.0, abs=0.01) + assert merge_p_values_average([0, 0, 1], randomization=True) == approx(0.0, abs=0.01) + assert merge_p_values_average([0, np.nan, 0, np.nan, 1, 1], randomization=True) == approx(0.0, abs=0.01) def test_when_evaluate_marginal_expectation_without_averaging_result_then_returned_results_have_correct_format():
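
As a quick illustration of how the new default behaves, the sketch below calls the two merging functions touched by this patch; the p-values are made-up example numbers, not from any real run. With randomization disabled (u = 1), merge_p_values_average returns the minimum over all m with K/m < 2 of 2 * mean(m smallest p-values) / (2 - K/m), capped at 1, where K is the number of non-NaN p-values.

    import numpy as np

    from dowhy.gcm.stats import merge_p_values_average, merge_p_values_quantile

    # Made-up p-values, e.g. from bootstrapped runs of a kernel independence test.
    p_values = np.array([0.02, 0.04, 0.07, 0.11, 0.35])

    # New default: improved "twice the average" rule (deterministic, u = 1).
    print(merge_p_values_average(p_values))

    # Randomized variant: u ~ Uniform[0, 1]; requires fewer assumptions but is non-deterministic.
    print(merge_p_values_average(p_values, randomization=True))

    # Previous default, now renamed: quantile-based merging (median rule with quantile=0.5).
    print(merge_p_values_quantile(p_values, quantile=0.5))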