carlg/alternative-metrics #950

Open
wants to merge 29 commits into base: main

Changes from all commits
29 commits
24c8a2f
Use two static methods to wrap alternative scoring formulas
carl-offerfit Jan 27, 2025
4d15237
Arg switch
carl-offerfit Jan 27, 2025
c6337cc
Do a better job of preserving the original
carl-offerfit Jan 27, 2025
3d4686d
Pass the scoring parameter through the hierarchy
carl-offerfit Jan 28, 2025
d9239a1
Add a public function to score the nuisance models on arbitrary data …
carl-offerfit Jan 28, 2025
cd007f7
Label nuisance score outputs
carl-offerfit Jan 28, 2025
f2958b7
Check if the nuisance scorer is a standard name
carl-offerfit Jan 28, 2025
8f38d8a
Better doc string comments
carl-offerfit Jan 28, 2025
0b8f497
Add the option to score the first stage treatment model by dimension
carl-offerfit Jan 29, 2025
2792545
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 29, 2025
84b8dac
Pre-commit fixes
carl-offerfit Jan 29, 2025
cb10a24
Merge branch 'carl/metrics' of github.com:carl-offerfit/EconML into c…
carl-offerfit Jan 29, 2025
699dd64
Merge branch 'main' into carl/metrics
carl-offerfit Jan 29, 2025
4c3077c
Undo the ruff changes made by mistake
carl-offerfit Jan 29, 2025
ae8393d
Merge branch 'carl/metrics' of github.com:carl-offerfit/EconML into c…
carl-offerfit Jan 29, 2025
d33a6a1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 29, 2025
d092529
Just proper pre-commit fixes on _relearn, no reformatting
carl-offerfit Jan 29, 2025
a34355a
DML fixes, no reformatting.
carl-offerfit Jan 29, 2025
c6df717
Merge branch 'carl/metrics' of github.com:carl-offerfit/EconML into c…
carl-offerfit Jan 29, 2025
c717b78
Gets some basic tests working
carl-offerfit Feb 18, 2025
5f02270
Add binary metric functions
carl-offerfit Feb 18, 2025
d5f0fe4
Fix formatting
carl-offerfit Feb 18, 2025
148d3f9
More format fixes
carl-offerfit Feb 18, 2025
35f104f
Handle one hot encoding within score_nuisances
carl-offerfit Feb 18, 2025
d10a70d
Adjust test for dummies created in score_nuisances
carl-offerfit Feb 18, 2025
c411631
Reset the name - when not debugging, it does not go first
carl-offerfit Feb 18, 2025
3abd752
Move score nuisance from _ortho_learner to _rlearner - it really depe…
carl-offerfit Feb 20, 2025
39cc39d
Add logic so that _ortho_learner raises an exception if non-standard …
carl-offerfit Feb 20, 2025
540da71
Add default values in _ModelNuisance.score
carl-offerfit Feb 20, 2025
24 changes: 21 additions & 3 deletions econml/_ortho_learner.py
@@ -1027,7 +1027,7 @@ def effect_inference(self, X=None, *, T0=0, T1=1):

effect_inference.__doc__ = LinearCateEstimator.effect_inference.__doc__

def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None, scoring=None):
"""
Score the fitted CATE model on a new data set.

@@ -1055,6 +1055,9 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
Weights for each sample
groups: (n,) vector, optional
All rows corresponding to the same group will be kept together during splitting.
scoring: str, optional
Name of an sklearn scoring function to use instead of the default.
Supported values are 'f1', 'log_loss', 'mean_absolute_error',
'mean_squared_error', 'r2', 'roc_auc', and 'pearsonr'.

Returns
-------
@@ -1113,9 +1116,24 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):

accumulated_nuisances += nuisances

score_kwargs = {
'X': X,
'W': W,
'Z': Z,
'sample_weight': sample_weight,
'groups': groups
}
# The scoring parameter can only be forwarded when the final model is an _rlearner._ModelFinal
if scoring is not None:
# Imported here rather than at module level to avoid circular imports
from .dml._rlearner import _ModelFinal
if isinstance(self._ortho_learner_model_final, _ModelFinal):
score_kwargs['scoring'] = scoring
else:
raise NotImplementedError("scoring parameter only implemented for "
"_rlearner._ModelFinal")
return self._ortho_learner_model_final.score(Y, T, nuisances=accumulated_nuisances,
**filter_none_kwargs(X=X, W=W, Z=Z,
sample_weight=sample_weight, groups=groups))
**filter_none_kwargs(**score_kwargs))

@property
def ortho_learner_model_final_(self):
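As a sketch of how the new scoring argument would be used end to end: the name passed to score is forwarded to _rlearner._ModelFinal and evaluated on the final-stage residuals. The snippet below is illustrative only, assuming this branch is installed; the estimator choice (LinearDML) and the synthetic data are not part of the PR.

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from econml.dml import LinearDML

# Illustrative synthetic data
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
W = rng.normal(size=(500, 2))
T = rng.normal(size=500)
y = 2.0 * T + X[:, 0] + rng.normal(size=500)

est = LinearDML(model_y=GradientBoostingRegressor(),
                model_t=GradientBoostingRegressor(), cv=2)
est.fit(y, T, X=X, W=W)

mse = est.score(y, T, X=X, W=W)                                 # default: residual-on-residual MSE
mae = est.score(y, T, X=X, W=W, scoring='mean_absolute_error')  # routed through _ModelFinal._wrap_scoring
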
132 changes: 122 additions & 10 deletions econml/dml/_rlearner.py
@@ -27,7 +27,16 @@

from abc import abstractmethod
import numpy as np

import pandas as pd
from sklearn.metrics import (
f1_score,
log_loss,
mean_absolute_error,
mean_squared_error,
r2_score,
roc_auc_score
)
from scipy.stats import pearsonr
from ..sklearn_extensions.model_selection import ModelSelector
from ..utilities import (filter_none_kwargs)
from .._ortho_learner import _OrthoLearner
@@ -54,10 +63,13 @@ def train(self, is_selecting, folds, Y, T, X=None, W=None, Z=None, sample_weight
filter_none_kwargs(sample_weight=sample_weight, groups=groups))
return self

def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None,
y_scoring='mean_squared_error', t_scoring='mean_squared_error', t_score_by_dim=False):
# note that groups are not passed to score because they are only used for fitting
T_score = self._model_t.score(X, W, T, **filter_none_kwargs(sample_weight=sample_weight))
Y_score = self._model_y.score(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight))
T_score = self._model_t.score(X, W, T, **filter_none_kwargs(sample_weight=sample_weight),
scoring=t_scoring, score_by_dim=t_score_by_dim)
Y_score = self._model_y.score(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight),
scoring=y_scoring)
return Y_score, T_score

def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
@@ -98,18 +110,60 @@ def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None,
def predict(self, X=None):
return self._model_final.predict(X)

def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, groups=None):
def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, groups=None,
scoring='mean_squared_error'):
Y_res, T_res = nuisances
if Y_res.ndim == 1:
Y_res = Y_res.reshape((-1, 1))
if T_res.ndim == 1:
T_res = T_res.reshape((-1, 1))
effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1]))
Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape)
if sample_weight is not None:
return np.mean(np.average((Y_res - Y_res_pred) ** 2, weights=sample_weight, axis=0))
return _ModelFinal._wrap_scoring(Y_true=Y_res, Y_pred=Y_res_pred, scoring=scoring, sample_weight=sample_weight)

@staticmethod
def _wrap_scoring(scoring, Y_true, Y_pred, sample_weight=None):
"""
Dispatch to one of several sklearn metric functions that accept sample weights.

Unfortunately there is no utility like get_scorer that is both generic and
supports sample weights.
"""
if scoring == 'f1':
return f1_score(Y_true, Y_pred, sample_weight=sample_weight)
elif scoring == 'mean_absolute_error':
return mean_absolute_error(Y_true, Y_pred, sample_weight=sample_weight)
elif scoring == 'mean_squared_error':
return mean_squared_error(Y_true, Y_pred, sample_weight=sample_weight)
elif scoring == 'r2':
return r2_score(Y_true, Y_pred, sample_weight=sample_weight)
elif scoring == 'roc_auc':
return roc_auc_score(Y_true, Y_pred, sample_weight=sample_weight)
elif scoring == 'log_loss':
return log_loss(Y_true, Y_pred, sample_weight=sample_weight)
elif scoring == 'pearsonr':
if sample_weight is not None:
raise NotImplementedError("pearsonr score does not support sample weighting")
return pearsonr(Y_true, Y_pred)
else:
return np.mean((Y_res - Y_res_pred) ** 2)
raise NotImplementedError(f"_wrap_scoring does not support '{scoring}'")

@staticmethod
def wrap_scoring(scoring, Y_true, Y_pred, sample_weight=None, score_by_dim=False):
"""
Score predictions, optionally returning one score per output dimension.

When score_by_dim is True, loop over the single-score wrapper for each column
(e.g. each treatment dimension of a multi-treatment model).
"""
if not score_by_dim:
return _ModelFinal._wrap_scoring(scoring, Y_true, Y_pred, sample_weight)
else:
assert Y_true.shape == Y_pred.shape, "Shape mismatch in wrap_scoring"
n_out = Y_pred.shape[1]
res = [None] * n_out
for yidx in range(n_out):
res[yidx] = _ModelFinal.wrap_scoring(scoring, Y_true[:, yidx], Y_pred[:, yidx], sample_weight)
return res
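A minimal sketch of calling the new static helper directly (assuming this branch is installed; _ModelFinal is a private class, so the import below mirrors the file path in this diff and is for illustration only):

import numpy as np
from econml.dml._rlearner import _ModelFinal

Y_true = np.random.normal(size=(200, 2))
Y_pred = Y_true + 0.1 * np.random.normal(size=(200, 2))

# One aggregate score across all columns
overall = _ModelFinal.wrap_scoring('mean_squared_error', Y_true, Y_pred)
# One score per output dimension (e.g. per treatment dimension)
per_dim = _ModelFinal.wrap_scoring('mean_squared_error', Y_true, Y_pred, score_by_dim=True)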


class _RLearner(_OrthoLearner):
@@ -422,7 +476,7 @@ def fit(self, Y, T, *, X=None, W=None, sample_weight=None, freq_weight=None, sam
cache_values=cache_values,
inference=inference)

def score(self, Y, T, X=None, W=None, sample_weight=None):
def score(self, Y, T, X=None, W=None, sample_weight=None, scoring=None):
"""
Score the fitted CATE model on a new data set.

@@ -453,7 +507,7 @@ def score(self, Y, T, X=None, W=None, sample_weight=None):
The MSE of the final CATE model on the new data.
"""
# Replacing score from _OrthoLearner, to enforce Z=None and improve the docstring
return super().score(Y, T, X=X, W=W, sample_weight=sample_weight)
return super().score(Y, T, X=X, W=W, sample_weight=sample_weight, scoring=scoring)

@property
def rlearner_model_final_(self):
@@ -493,3 +547,61 @@ def residuals_(self):
"Set to `True` to enable residual storage.")
Y_res, T_res = self._cached_values.nuisances
return Y_res, T_res, self._cached_values.X, self._cached_values.W


def score_nuisances(self, Y, T, X=None, W=None, Z=None, sample_weight=None, y_scoring=None,
t_scoring=None, t_score_by_dim=False):
"""
Score the fitted nuisance models on arbitrary data using any supported sklearn scoring.

The supported scorings depend on whether sample weights are used: if no sample
weights are given, any name returned by sklearn.metrics.get_scorer_names() is
available, as well as non-negated 'mean_squared_error' and 'mean_absolute_error'.
If sample weights are given, the supported scorings are 'f1', 'log_loss',
'mean_absolute_error', 'mean_squared_error', 'r2', and 'roc_auc'.

Parameters
----------
Y: (n, d_y) matrix or vector of length n
Outcomes for each sample
T: (n, d_t) matrix or vector of length n
Treatments for each sample
X: (n, d_x) matrix, optional
Features for each sample
W: (n, d_w) matrix, optional
Controls for each sample
Z: (n, d_z) matrix, optional
Instruments for each sample
sample_weight: (n,) vector, optional
Weights for each sample
t_scoring: str, optional
Name of an sklearn scoring function to use instead of the default for model_t
y_scoring: str, optional
Name of an sklearn scoring function to use instead of the default for model_y
t_score_by_dim: bool, default=False
Whether to score the prediction of each treatment dimension separately

Returns
-------
score_dict : dict[str,list[float]]
A dictionary whose keys identify the Y and T scorings used and whose values are
lists of scores, one per CV-fold model.
"""
Y_key = 'Y_defscore' if not y_scoring else f"Y_{y_scoring}"
T_key = 'T_defscore' if not t_scoring else f"T_{t_scoring}"
score_dict = {
Y_key: [],
T_key: []
}

# Discrete outcomes and treatments need to be one-hot encoded before scoring
Y_2_score = pd.get_dummies(Y) if self.discrete_outcome and (len(Y.shape) == 1 or Y.shape[1] == 1) else Y
T_2_score = pd.get_dummies(T) if self.discrete_treatment and (len(T.shape) == 1 or T.shape[1] == 1) else T

for m in self._models_nuisance[0]:
Y_score, T_score = m.score(Y_2_score, T_2_score, X=X, W=W, Z=Z, sample_weight=sample_weight,
y_scoring=y_scoring, t_scoring=t_scoring,
t_score_by_dim=t_score_by_dim)
score_dict[Y_key].append(Y_score)
score_dict[T_key].append(T_score)
return score_dict
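Continuing the illustrative LinearDML sketch above, score_nuisances can be called on training or held-out data; the keys follow the Y_<scoring> / T_<scoring> pattern built above. This is a usage sketch under the same assumptions, not part of the PR:

# One entry per CV-fold model of model_y / model_t
scores = est.score_nuisances(y, T, X=X, W=W, y_scoring='r2', t_scoring='r2')
print(scores['Y_r2'])   # list of r2 values, one per fold model of model_y
print(scores['T_r2'])   # list of r2 values, one per fold model of model_t

# Separate score for each treatment dimension (only meaningful for multi-dimensional T)
by_dim = est.score_nuisances(y, T, X=X, W=W,
                             t_scoring='mean_squared_error',
                             t_score_by_dim=True)
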
42 changes: 31 additions & 11 deletions econml/dml/dml.py
@@ -9,9 +9,11 @@
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer)
from sklearn.utils import check_random_state
from sklearn.metrics import get_scorer, get_scorer_names


from .._ortho_learner import _OrthoLearner
from ._rlearner import _RLearner
from ._rlearner import _RLearner, _ModelFinal
from .._cate_estimator import (DebiasedLassoCateEstimatorMixin,
LinearModelFinalCateEstimatorMixin,
StatsModelsCateEstimatorMixin,
@@ -52,20 +54,38 @@ def predict(self, X, W):
raise AttributeError("Cannot use a classifier as a first stage model when the target is continuous!")
return self._model.predict(_combine(X, W, n_samples))

def score(self, X, W, Target, sample_weight=None):
if hasattr(self._model, 'score'):
if self._discrete_target:
# In this case, the Target is the one-hot-encoding of the treatment variable
# We need to go back to the label representation of the one-hot so as to call
# the classifier.
Target = inverse_onehot(Target)
def score(self, X, W, Target, sample_weight=None, scoring=None, score_by_dim=False):
XW_combined = _combine(X, W, Target.shape[0])
if self._discrete_target:
# In this case, the Target is the one-hot-encoding of the treatment variable
# We need to go back to the label representation of the one-hot so as to call
# the classifier.
Target = inverse_onehot(Target)
if hasattr(self._model, 'score') and scoring is None and not score_by_dim:
# Standard default model scoring
if sample_weight is not None:
return self._model.score(_combine(X, W, Target.shape[0]), Target, sample_weight=sample_weight)
return self._model.score(XW_combined, Target, sample_weight=sample_weight)
else:
return self._model.score(_combine(X, W, Target.shape[0]), Target)
return self._model.score(XW_combined, Target)
else:
return None
return _FirstStageWrapper._wrap_scoring(scoring, Y_true=Target, X=XW_combined, est=self._model,
sample_weight=sample_weight, score_by_dim=score_by_dim)

@staticmethod
def _wrap_scoring(scoring, Y_true, X, est, sample_weight=None, score_by_dim=False):
"""
Dispatch between sklearn's get_scorer and _ModelFinal.wrap_scoring.

If there are no sample weights, use get_scorer to support any named sklearn
evaluation metric. Otherwise, use the static method from _ModelFinal that supports
weights; that version takes the predictions rather than the estimator.
"""
if sample_weight is None and not score_by_dim and scoring in get_scorer_names():
scorer = get_scorer(scoring)
return scorer(est, X, Y_true)
else:
Y_pred = est.predict(X)
return _ModelFinal.wrap_scoring(scoring, Y_true, Y_pred, sample_weight, score_by_dim=score_by_dim)

class _FirstStageSelector(SingleModelSelector):
def __init__(self, model: SingleModelSelector, discrete_target):
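The dispatch implemented in _FirstStageWrapper._wrap_scoring can be pictured with plain sklearn pieces. The sketch below is illustrative (model and data are made up, not part of the PR): the unweighted path goes through get_scorer, while the weighted path calls a metric function on predictions.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import get_scorer, get_scorer_names, mean_squared_error

Xs = np.random.normal(size=(100, 3))
ys = Xs @ np.array([1.0, -2.0, 0.5]) + np.random.normal(size=100)
model = LinearRegression().fit(Xs, ys)

# Unweighted path: any named sklearn scorer works (error scorers are negated)
assert 'neg_mean_squared_error' in get_scorer_names()
neg_mse = get_scorer('neg_mean_squared_error')(model, Xs, ys)

# Weighted path: fall back to metric functions that take sample_weight,
# operating on predictions rather than the estimator
w = np.ones(100)
mse = mean_squared_error(ys, model.predict(Xs), sample_weight=w)
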
75 changes: 75 additions & 0 deletions econml/tests/test_dml.py
@@ -713,6 +713,81 @@ def true_fn(x):
np.testing.assert_array_less(lb - .01, truth)
np.testing.assert_array_less(truth, ub + .01)

def test_forest_dml_score_fns(self):
np.random.seed(1234)
n = 20000 # number of raw samples
d = 10

Z = np.random.binomial(1, .5, size=(n, d))
T = np.random.binomial(1, .5, size=(n,))

def true_fn(x):
return -1 + 2 * x[:, 0] + x[:, 1] * x[:, 2]

y = true_fn(Z) * T + Z[:, 0] + (1 * Z[:, 0] + 1) * np.random.normal(0, 1, size=(n,))
X = Z[:, :4]
W = Z[:, 4:]

est = CausalForestDML(model_y=GradientBoostingRegressor(n_estimators=30, min_samples_leaf=30),
model_t=GradientBoostingClassifier(n_estimators=30, min_samples_leaf=30),
discrete_treatment=True,
cv=2,
n_jobs=None,
n_estimators=1000,
max_samples=.4,
min_samples_leaf=10,
min_impurity_decrease=0.001,
verbose=0, min_var_fraction_leaf=.1,
fit_intercept=False,
random_state=12345)

est.fit(y, T, X=X, W=W)

s1 = est.score(Y=y, T=T, X=X, W=W, scoring='mean_squared_error')
s2 = est.score(Y=y, T=T, X=X, W=W)
assert s1 == s2
np.testing.assert_allclose(s1, 2.50, rtol=0, atol=.01)
s3 = est.score(Y=y, T=T, X=X, W=W, scoring='mean_absolute_error')
np.testing.assert_allclose(s3, 1.19, rtol=0, atol=.01)
s4 = est.score(Y=y, T=T, X=X, W=W, scoring='r2')
np.testing.assert_allclose(s4, 0.113, rtol=0, atol=.001)
s5 = est.score(Y=y, T=T, X=X, W=W, scoring='pearsonr')
np.testing.assert_allclose(s5[0], 0.337, rtol=0, atol=0.005)

sn1 = est.score_nuisances(Y=y, T=T, X=X, W=W,
t_scoring='mean_squared_error',
y_scoring='mean_squared_error')
np.testing.assert_allclose(sn1['Y_mean_squared_error'], [2.8,2.8], rtol=0, atol=.1)
np.testing.assert_allclose(sn1['T_mean_squared_error'], [1.5,1.5], rtol=0, atol=.1)

sn2 = est.score_nuisances(Y=y, T=T, X=X, W=W,
t_scoring='mean_absolute_error',
y_scoring='mean_absolute_error')
np.testing.assert_allclose(sn2['Y_mean_absolute_error'], [1.3,1.3], rtol=0, atol=.1)
np.testing.assert_allclose(sn2['T_mean_absolute_error'], [1.0,1.0], rtol=0, atol=.1)

sn3 = est.score_nuisances(Y=y, T=T, X=X, W=W,
t_scoring='r2',
y_scoring='r2')
np.testing.assert_allclose(sn3['Y_r2'], [0.27,0.27], rtol=0, atol=.005)
np.testing.assert_allclose(sn3['T_r2'], [-5.1,-5.1], rtol=0, atol=0.25)

sn4 = est.score_nuisances(Y=y, T=T, X=X, W=W,
t_scoring='pearsonr',
y_scoring='pearsonr')
# Ignoring the p-values returned with the score
y_pearsonr = [s[0] for s in sn4['Y_pearsonr']]
t_pearsonr = [s[0] for s in sn4['T_pearsonr']]
np.testing.assert_allclose(y_pearsonr, [0.52, 0.52], rtol=0, atol=.01)
np.testing.assert_allclose(t_pearsonr, [.035, .035], rtol=0, atol=0.005)

# T is binary, and can be used to check binary eval functions
sn5 = est.score_nuisances(Y=y, T=T, X=X, W=W, t_scoring='roc_auc')
np.testing.assert_allclose(sn5['T_roc_auc'], [0.526,0.526], rtol=0, atol=.005)

sn6 = est.score_nuisances(Y=y, T=T, X=X, W=W, t_scoring='log_loss')
np.testing.assert_allclose(sn6['T_log_loss'], [17.4,17.4], rtol=0, atol=0.1)

def test_aaforest_pandas(self):
"""Test that we can use CausalForest with pandas inputs."""
df = pd.DataFrame({'a': np.random.normal(size=500),
3 changes: 3 additions & 0 deletions econml/tests/test_ortho_learner.py
@@ -314,6 +314,9 @@ def _gen_ortho_learner_model_final(self):
np.testing.assert_almost_equal(est.score_, sigma**2, decimal=2)
np.testing.assert_almost_equal(est.ortho_learner_model_final_.model.coef_[0], 1, decimal=2)

# Test that non-standard scoring raises the appropriate exception
self.assertRaises(NotImplementedError, est.score, y, X[:, 0], W=X[:, 1:], scoring='mean_squared_error')

@pytest.mark.ray
def test_ol_with_ray(self):
self._test_ol(True)