Skip to content

Commit

Permalink
Merge branch 'scikit-learn:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
virchan authored Aug 29, 2024
2 parents 2b32a50 + 6b3f9bd commit 5c46559
Show file tree
Hide file tree
Showing 11 changed files with 53 additions and 18 deletions.
8 changes: 8 additions & 0 deletions doc/whats_new/v1.5.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ Version 1.5.2

**release date of 1.5.2**

Changes impacting many modules
------------------------------

- |Fix| Fixed performance regression in a few Cython modules in
`sklearn._loss`, `sklearn.manifold`, `sklearn.metrics` and `sklearn.utils`,
which were built without OpenMP support.
  :pr:`29694` by :user:`Loïc Estève <lesteve>`.

Changelog
---------

Expand Down
10 changes: 10 additions & 0 deletions doc/whats_new/v1.6.rst
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,12 @@ Changelog
now accepts string format or callable to generate feature names. :pr:`28934` by
:user:`Marc Bresson <MarcBresson>`.

:mod:`sklearn.cross_decomposition`
..................................

- |Fix| :class:`cross_decomposition.PLSRegression` properly raises an error when
`n_components` is larger than `n_samples`. :pr:`29710` by `Thomas Fan`_.

:mod:`sklearn.datasets`
.......................

Expand Down Expand Up @@ -292,6 +298,10 @@ Changelog
will show the function name in the label.
:pr:`29158` by :user:`Yao Xiao <Charlie-XIAO>`.

- |Fix| :class:`preprocessing.PowerTransformer` now uses `scipy.special.inv_boxcox`
to output `nan` if the input of BoxCox's inverse is invalid.
:pr:`27875` by :user:`Xuefeng Xu <xuefeng-xu>`.

:mod:`sklearn.semi_supervised`
..............................

Expand Down
1 change: 1 addition & 0 deletions sklearn/_loss/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ _loss_pyx = custom_target(
py.extension_module(
'_loss',
_loss_pyx,
dependencies: [openmp_dep],
cython_args: cython_args,
install: true,
subdir: 'sklearn/_loss',
Expand Down
4 changes: 3 additions & 1 deletion sklearn/cross_decomposition/_pls.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,9 @@ def fit(self, X, y=None, Y=None):
# With PLSRegression n_components is bounded by the rank of (X.T X) see
# Wegelin page 25. With CCA and PLSCanonical, n_components is bounded
# by the rank of X and the rank of Y: see Wegelin page 12
rank_upper_bound = p if self.deflation_mode == "regression" else min(n, p, q)
rank_upper_bound = (
min(n, p) if self.deflation_mode == "regression" else min(n, p, q)
)
if n_components > rank_upper_bound:
raise ValueError(
f"`n_components` upper bound is {rank_upper_bound}. "
Expand Down
11 changes: 11 additions & 0 deletions sklearn/cross_decomposition/tests/test_pls.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,17 @@ def test_n_components_upper_bounds(Estimator):
est.fit(X, Y)


def test_n_components_upper_PLSRegression():
    """Check the validation of `n_components` upper bounds for PLSRegression."""
    # 20 samples with 64 features: the regression-mode bound is min(n, p) = 20.
    rng = np.random.RandomState(0)
    X, Y = rng.randn(20, 64), rng.randn(20, 3)
    err_msg = "`n_components` upper bound is 20. Got 30 instead. Reduce `n_components`."
    with pytest.raises(ValueError, match=err_msg):
        PLSRegression(n_components=30).fit(X, Y)


@pytest.mark.parametrize("n_samples, n_features", [(100, 10), (100, 200)])
def test_singular_value_helpers(n_samples, n_features, global_random_seed):
# Make sure SVD and power method give approximately the same results
Expand Down
2 changes: 1 addition & 1 deletion sklearn/manifold/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ py.extension_module(
py.extension_module(
'_barnes_hut_tsne',
'_barnes_hut_tsne.pyx',
dependencies: [np_dep],
dependencies: [np_dep, openmp_dep],
cython_args: cython_args,
subdir: 'sklearn/manifold',
install: true
Expand Down
4 changes: 2 additions & 2 deletions sklearn/metrics/_pairwise_distances_reduction/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ _argkmin_classmode_pyx = custom_target(
_argkmin_classmode = py.extension_module(
'_argkmin_classmode',
_argkmin_classmode_pyx,
dependencies: [np_dep],
dependencies: [np_dep, openmp_dep],
override_options: ['cython_language=cpp'],
cython_args: cython_args,
# XXX: for some reason -fno-sized-deallocation is needed otherwise there is
Expand All @@ -199,7 +199,7 @@ _radius_neighbors_classmode_pyx = custom_target(
_radius_neighbors_classmode = py.extension_module(
'_radius_neighbors_classmode',
_radius_neighbors_classmode_pyx,
dependencies: [np_dep],
dependencies: [np_dep, openmp_dep],
override_options: ['cython_language=cpp'],
cython_args: cython_args,
subdir: 'sklearn/metrics/_pairwise_distances_reduction',
Expand Down
1 change: 1 addition & 0 deletions sklearn/metrics/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ _dist_metrics = py.extension_module(
py.extension_module(
'_pairwise_fast',
['_pairwise_fast.pyx', metrics_cython_tree],
dependencies: [openmp_dep],
cython_args: cython_args,
subdir: 'sklearn/metrics',
install: true
Expand Down
15 changes: 2 additions & 13 deletions sklearn/preprocessing/_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import numpy as np
from scipy import optimize, sparse, stats
from scipy.special import boxcox
from scipy.special import boxcox, inv_boxcox

from ..base import (
BaseEstimator,
Expand Down Expand Up @@ -3376,7 +3376,7 @@ def inverse_transform(self, X):
X = self._scaler.inverse_transform(X)

inv_fun = {
"box-cox": self._box_cox_inverse_tranform,
"box-cox": inv_boxcox,
"yeo-johnson": self._yeo_johnson_inverse_transform,
}[self.method]
for i, lmbda in enumerate(self.lambdas_):
Expand All @@ -3385,17 +3385,6 @@ def inverse_transform(self, X):

return X

def _box_cox_inverse_tranform(self, x, lmbda):
"""Return inverse-transformed input x following Box-Cox inverse
transform with parameter lambda.
"""
if lmbda == 0:
x_inv = np.exp(x)
else:
x_inv = (x * lmbda + 1) ** (1 / lmbda)

return x_inv

def _yeo_johnson_inverse_transform(self, x, lmbda):
"""Return inverse-transformed input x following Yeo-Johnson inverse
transform with parameter lambda.
Expand Down
13 changes: 13 additions & 0 deletions sklearn/preprocessing/tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2329,6 +2329,11 @@ def test_optimization_power_transformer(method, lmbda):
n_samples = 20000
X = rng.normal(loc=0, scale=1, size=(n_samples, 1))

if method == "box-cox":
# For box-cox, means that lmbda * y + 1 > 0 or y > - 1 / lmbda
# Clip the data here to make sure the inequality is valid.
X = np.clip(X, -1 / lmbda + 1e-5, None)

pt = PowerTransformer(method=method, standardize=False)
pt.lambdas_ = [lmbda]
X_inv = pt.inverse_transform(X)
Expand All @@ -2341,6 +2346,14 @@ def test_optimization_power_transformer(method, lmbda):
assert_almost_equal(1, X_inv_trans.std(), decimal=1)


def test_invserse_box_cox():
    # An invalid inverse Box-Cox input must yield nan instead of raising.
    transformer = PowerTransformer(method="box-cox", standardize=False)
    transformer.lambdas_ = [0.5]
    result = transformer.inverse_transform([[-2.1]])
    assert np.isnan(result)


def test_yeo_johnson_darwin_example():
# test from original paper "A new family of power transformations to
# improve normality or symmetry" by Yeo and Johnson.
Expand Down
2 changes: 1 addition & 1 deletion sklearn/utils/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ utils_extension_metadata = {
'sparsefuncs_fast':
{'sources': ['sparsefuncs_fast.pyx']},
'_cython_blas': {'sources': ['_cython_blas.pyx']},
'arrayfuncs': {'sources': ['arrayfuncs.pyx']},
'arrayfuncs': {'sources': ['arrayfuncs.pyx'], 'dependencies': [openmp_dep]},
'murmurhash': {
'sources': ['murmurhash.pyx', 'src' / 'MurmurHash3.cpp'],
},
Expand Down

0 comments on commit 5c46559

Please sign in to comment.