diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index a6408b940119d..a8af7f023d34d 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -844,7 +844,7 @@ Numeric
 - Bug in operations with array-likes with ``dtype="boolean"`` and :attr:`NA` incorrectly altering the array in-place (:issue:`45421`)
 - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
 - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
--
+- Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array; ``bottleneck`` has been disabled for :meth:`mean`, improving the loss to log-linear, but may result in a performance decrease (:issue:`42878`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 05a9bde700e32..81766dc91f271 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -162,6 +162,10 @@ def f(
 def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
     # Bottleneck chokes on datetime64, PeriodDtype (or and EA)
     if not is_object_dtype(dtype) and not needs_i8_conversion(dtype):
+        # GH 42878
+        # Bottleneck uses naive summation, leading to O(n) loss of precision,
+        # unlike numpy, which implements pairwise summation with O(log(n)) loss
+        # crossref: https://github.com/pydata/bottleneck/issues/379
 
         # GH 15507
         # bottleneck does not properly upcast during the sum
@@ -171,7 +175,7 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
         # further we also want to preserve NaN when all elements
         # are NaN, unlike bottleneck/numpy which consider this
         # to be 0
-        return name not in ["nansum", "nanprod"]
+        return name not in ["nansum", "nanprod", "nanmean"]
 
     return False
 
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index 005f7b088271f..f46d5c8e2590e 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -1120,3 +1120,25 @@ def test_check_below_min_count__large_shape(min_count, expected_result):
     shape = (2244367, 1253)
     result = nanops.check_below_min_count(shape, mask=None, min_count=min_count)
     assert result == expected_result
+
+
+@pytest.mark.parametrize("func", ["nanmean", "nansum"])
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        np.uint8,
+        np.uint16,
+        np.uint32,
+        np.uint64,
+        np.int8,
+        np.int16,
+        np.int32,
+        np.int64,
+        np.float16,
+        np.float32,
+        np.float64,
+    ],
+)
+def test_check_bottleneck_disallow(dtype, func):
+    # GH 42878 bottleneck sometimes produces unreliable results for mean and sum
+    assert not nanops._bn_ok_dtype(dtype, func)
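
For context beyond the diff, the sketch below (not part of the patch) illustrates the effect the GH 42878 comment describes: accumulating a float32 array left to right loses precision roughly in proportion to its length, while numpy's pairwise summation keeps the error closer to O(log(n)). The ``naive_mean`` helper is hypothetical and only stands in for a bottleneck-style single-pass accumulation; exact error magnitudes will vary by platform and data.

```python
import numpy as np


def naive_mean(values: np.ndarray) -> float:
    # Hypothetical stand-in for a bottleneck-style single-pass accumulation:
    # sum left to right in the array's own dtype, then divide once.
    total = values.dtype.type(0)
    for value in values:
        total = total + value
    return float(total) / len(values)


rng = np.random.default_rng(0)
arr = rng.random(1_000_000, dtype=np.float32) + 1.0

exact = np.mean(arr, dtype=np.float64)   # high-precision reference
pairwise = arr.mean()                    # numpy's pairwise summation in float32
naive = naive_mean(arr)                  # error grows roughly with len(arr)

print(f"pairwise error: {abs(pairwise - exact):.2e}")
print(f"naive error:    {abs(naive - exact):.2e}")
```

With this change, ``nanops.nanmean`` always takes the numpy path, so ``Series.mean`` should match the pairwise behaviour whether or not bottleneck is installed, at the cost of bottleneck's speedup.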