From 6786846764cd9cf9d5041e5bdc86430be38eeb8e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 11 Oct 2023 10:54:23 -0700 Subject: [PATCH 01/19] ENH: EA._get_repr_footer (#55478) --- pandas/core/arrays/base.py | 12 ++++++++++-- pandas/core/arrays/categorical.py | 4 ++-- pandas/io/formats/format.py | 13 +++++++------ 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 31c143ee012bb..e7b7ecba60e0b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1663,7 +1663,14 @@ def __repr__(self) -> str: self, self._formatter(), indent_for_name=False ).rstrip(", \n") class_name = f"<{type(self).__name__}>\n" - return f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}" + footer = self._get_repr_footer() + return f"{class_name}{data}\n{footer}" + + def _get_repr_footer(self) -> str: + # GH#24278 + if self.ndim > 1: + return f"Shape: {self.shape}, dtype: {self.dtype}" + return f"Length: {len(self)}, dtype: {self.dtype}" def _repr_2d(self) -> str: from pandas.io.formats.printing import format_object_summary @@ -1679,7 +1686,8 @@ def _repr_2d(self) -> str: ] data = ",\n".join(lines) class_name = f"<{type(self).__name__}>" - return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}" + footer = self._get_repr_footer() + return f"{class_name}\n[\n{data}\n]\n{footer}" def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: """ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 5059f5d000ccd..e19635af6ab0b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2177,7 +2177,7 @@ def _repr_categories(self) -> list[str]: category_strs = [x.strip() for x in category_strs] return category_strs - def _repr_categories_info(self) -> str: + def _get_repr_footer(self) -> str: """ Returns a string representation of the footer. """ @@ -2229,7 +2229,7 @@ def __repr__(self) -> str: """ String representation. 
""" - footer = self._repr_categories_info() + footer = self._get_repr_footer() length = len(self) max_len = 10 if length > max_len: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index cac83e2a48972..322f7f88494de 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -259,11 +259,12 @@ def _get_footer(self) -> str: name = self.series.name footer = "" - if getattr(self.series.index, "freq", None) is not None: - assert isinstance( - self.series.index, (DatetimeIndex, PeriodIndex, TimedeltaIndex) - ) - footer += f"Freq: {self.series.index.freqstr}" + index = self.series.index + if ( + isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)) + and index.freq is not None + ): + footer += f"Freq: {index.freqstr}" if self.name is not False and name is not None: if footer: @@ -289,7 +290,7 @@ def _get_footer(self) -> str: # level infos are added to the end and in a new line, like it is done # for Categoricals if isinstance(self.tr_series.dtype, CategoricalDtype): - level_info = self.tr_series._values._repr_categories_info() + level_info = self.tr_series._values._get_repr_footer() if footer: footer += "\n" footer += level_info From 943c3cb38d23e60905891724f878b91483597c83 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 11 Oct 2023 10:54:57 -0700 Subject: [PATCH 02/19] BUG: repr of inf values with use_inf_as_na (#55483) * BUG: repr of inf values with use_inf_as_na * GH ref --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/_libs/missing.pyi | 1 - pandas/_libs/missing.pyx | 25 ------------------------- pandas/core/indexes/base.py | 13 ++----------- pandas/io/formats/format.py | 22 ++++++++++------------ pandas/tests/frame/test_repr_info.py | 2 +- 6 files changed, 14 insertions(+), 50 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 29a2d5c0b5877..eec82ae26afcc 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -394,6 +394,7 @@ Other ^^^^^ - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) +- Bug in rendering ``inf`` values inside a a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) - diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi index d5c9f1342a089..282dcee3ed6cf 100644 --- a/pandas/_libs/missing.pyi +++ b/pandas/_libs/missing.pyi @@ -14,4 +14,3 @@ def isneginf_scalar(val: object) -> bool: ... def checknull(val: object, inf_as_na: bool = ...) -> bool: ... def isnaobj(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ... def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ... -def is_float_nan(values: np.ndarray) -> npt.NDArray[np.bool_]: ... 
diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index e3e7d8daa03e1..8ef59b46ca25f 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -255,31 +255,6 @@ cdef bint checknull_with_nat_and_na(object obj): return checknull_with_nat(obj) or obj is C_NA -@cython.wraparound(False) -@cython.boundscheck(False) -def is_float_nan(values: ndarray) -> ndarray: - """ - True for elements which correspond to a float nan - - Returns - ------- - ndarray[bool] - """ - cdef: - ndarray[uint8_t] result - Py_ssize_t i, N - object val - - N = len(values) - result = np.zeros(N, dtype=np.uint8) - - for i in range(N): - val = values[i] - if util.is_nan(val): - result[i] = True - return result.view(bool) - - @cython.wraparound(False) @cython.boundscheck(False) def is_numeric_na(values: ndarray) -> ndarray: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c3cab965041e0..d749235e2cd2c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -37,7 +37,6 @@ is_datetime_array, no_default, ) -from pandas._libs.missing import is_float_nan from pandas._libs.tslibs import ( IncompatibleFrequency, OutOfBoundsDatetime, @@ -1390,16 +1389,8 @@ def _format_with_header(self, *, header: list[str_t], na_rep: str_t) -> list[str if is_object_dtype(values.dtype) or is_string_dtype(values.dtype): values = np.asarray(values) - values = lib.maybe_convert_objects(values, safe=True) - - result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values] - - # could have nans - mask = is_float_nan(values) - if mask.any(): - result_arr = np.array(result) - result_arr[mask] = na_rep - result = result_arr.tolist() + # TODO: why do we need different justify for these cases? + result = trim_front(format_array(values, None, justify="all")) else: result = trim_front(format_array(values, None, justify="left")) return header + result diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 322f7f88494de..bb976b3a0208e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1216,18 +1216,16 @@ def _format_strings(self) -> list[str]: def _format(x): if self.na_rep is not None and is_scalar(x) and isna(x): - try: - # try block for np.isnat specifically - # determine na_rep if x is None or NaT-like - if x is None: - return "None" - elif x is NA: - return str(NA) - elif x is NaT or np.isnat(x): - return "NaT" - except (TypeError, ValueError): - # np.isnat only handles datetime or timedelta objects - pass + if x is None: + return "None" + elif x is NA: + return str(NA) + elif lib.is_float(x) and np.isinf(x): + # TODO(3.0): this will be unreachable when use_inf_as_na + # deprecation is enforced + return str(x) + elif x is NaT or isinstance(x, (np.datetime64, np.timedelta64)): + return "NaT" return self.na_rep elif isinstance(x, PandasObject): return str(x) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 0634b8268c04c..63ecdfa5e001b 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -411,7 +411,7 @@ def test_to_records_with_na_record(self): def test_to_records_with_inf_as_na_record(self): # GH 48526 expected = """ NaN inf record -0 NaN b [0, inf, b] +0 inf b [0, inf, b] 1 NaN NaN [1, nan, nan] 2 e f [2, e, f]""" msg = "use_inf_as_na option is deprecated" From c9a98f0f0fe3a1ff2ce549b2f2c7551cc9afc58a Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Wed, 11 Oct 2023 19:55:10 +0200 
Subject: [PATCH 03/19] =?UTF-8?q?DEPR:=20=20=E2=80=98AS=E2=80=99,=20?= =?UTF-8?q?=E2=80=98BA=E2=80=99=20and=20=E2=80=98BAS=E2=80=99=20in=20favou?= =?UTF-8?q?r=20of=20=E2=80=98YS=E2=80=99,=20=E2=80=98BY=E2=80=99=20and=20?= =?UTF-8?q?=E2=80=98BYS=E2=80=99=20(#55479)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * deprecate AS/BA/BAS in favour of YS/BY/BYS, fix tests * correct def get_start_end_field, correct docstrings and v0.20.0.rst * fix pre-commit error * add deprecated frequencies to dict DEPR_ABBREVS, add tests --- doc/source/user_guide/timeseries.rst | 14 +++--- doc/source/whatsnew/v0.20.0.rst | 20 +++++++-- pandas/_libs/tslibs/dtypes.pyx | 44 +++++++++++++++++-- pandas/_libs/tslibs/fields.pyx | 4 +- pandas/_libs/tslibs/offsets.pyx | 21 ++++----- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/indexes/datetimes.py | 4 +- pandas/core/resample.py | 4 +- pandas/core/series.py | 2 +- pandas/tests/arithmetic/test_datetime64.py | 6 +-- .../tests/frame/methods/test_to_timestamp.py | 10 ++--- pandas/tests/groupby/test_grouping.py | 2 +- .../datetimes/methods/test_to_period.py | 2 +- .../indexes/datetimes/test_constructors.py | 6 +-- .../indexes/datetimes/test_date_range.py | 14 +++--- .../tests/indexes/datetimes/test_datetime.py | 38 ++++++++++++++++ pandas/tests/indexes/datetimes/test_misc.py | 2 +- .../period/methods/test_to_timestamp.py | 2 +- pandas/tests/resample/test_datetime_index.py | 4 +- pandas/tests/resample/test_period_index.py | 2 +- .../tseries/frequencies/test_inference.py | 4 +- pandas/tests/tseries/offsets/test_offsets.py | 4 +- pandas/tseries/frequencies.py | 4 +- 23 files changed, 148 insertions(+), 67 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index e1415e2ca23ca..9f3077e266e98 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -896,9 +896,9 @@ into ``freq`` keyword arguments. The available date offsets and associated frequ :class:`~pandas.tseries.offsets.BQuarterBegin`, ``'BQS'``, "business quarter begin" :class:`~pandas.tseries.offsets.FY5253Quarter`, ``'REQ'``, "retail (aka 52-53 week) quarter" :class:`~pandas.tseries.offsets.YearEnd`, ``'Y'``, "calendar year end" - :class:`~pandas.tseries.offsets.YearBegin`, ``'AS'`` or ``'BYS'``,"calendar year begin" - :class:`~pandas.tseries.offsets.BYearEnd`, ``'BA'``, "business year end" - :class:`~pandas.tseries.offsets.BYearBegin`, ``'BAS'``, "business year begin" + :class:`~pandas.tseries.offsets.YearBegin`, ``'YS'`` or ``'BYS'``,"calendar year begin" + :class:`~pandas.tseries.offsets.BYearEnd`, ``'BY'``, "business year end" + :class:`~pandas.tseries.offsets.BYearBegin`, ``'BYS'``, "business year begin" :class:`~pandas.tseries.offsets.FY5253`, ``'RE'``, "retail (aka 52-53 week) year" :class:`~pandas.tseries.offsets.Easter`, None, "Easter holiday" :class:`~pandas.tseries.offsets.BusinessHour`, ``'bh'``, "business hour" @@ -1259,9 +1259,9 @@ frequencies. We will refer to these aliases as *offset aliases*. "QS", "quarter start frequency" "BQS", "business quarter start frequency" "Y", "year end frequency" - "BA, BY", "business year end frequency" - "AS, YS", "year start frequency" - "BAS, BYS", "business year start frequency" + "BY", "business year end frequency" + "YS", "year start frequency" + "BYS", "business year start frequency" "h", "hourly frequency" "bh", "business hour frequency" "cbh", "custom business hour frequency" @@ -1692,7 +1692,7 @@ the end of the interval. .. 
warning:: The default values for ``label`` and ``closed`` is '**left**' for all - frequency offsets except for 'ME', 'Y', 'Q', 'BM', 'BA', 'BQ', and 'W' + frequency offsets except for 'ME', 'Y', 'Q', 'BM', 'BY', 'BQ', and 'W' which all have a default of 'right'. This might unintendedly lead to looking ahead, where the value for a later diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index ae70eb078f6d9..09bf5428d0432 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -886,11 +886,23 @@ This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622 This is *unchanged* from prior versions, but shown for illustration purposes: -.. ipython:: python +.. code-block:: python - df = pd.DataFrame(np.arange(6), columns=['value'], - index=pd.MultiIndex.from_product([list('BA'), range(3)])) - df + In [81]: df = pd.DataFrame(np.arange(6), columns=['value'], + ....: index=pd.MultiIndex.from_product([list('BA'), range(3)])) + ....: + In [82]: df + + Out[82]: + value + B 0 0 + 1 1 + 2 2 + A 0 3 + 1 4 + 2 5 + + [6 rows x 1 columns] .. code-block:: python diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 86f620beeec3b..26181d8f15518 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -192,9 +192,6 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "BQS": "Q", "QS": "Q", "BQ": "Q", - "BA": "Y", - "AS": "Y", - "BAS": "Y", "MS": "M", "D": "D", "B": "B", @@ -205,9 +202,9 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "ns": "ns", "h": "h", "Q": "Q", - "Y": "Y", "W": "W", "ME": "M", + "Y": "Y", "BY": "Y", "YS": "Y", "BYS": "Y", @@ -244,6 +241,45 @@ DEPR_ABBREVS: dict[str, str]= { "A-SEP": "Y-SEP", "A-OCT": "Y-OCT", "A-NOV": "Y-NOV", + "BA": "BY", + "BA-DEC": "BY-DEC", + "BA-JAN": "BY-JAN", + "BA-FEB": "BY-FEB", + "BA-MAR": "BY-MAR", + "BA-APR": "BY-APR", + "BA-MAY": "BY-MAY", + "BA-JUN": "BY-JUN", + "BA-JUL": "BY-JUL", + "BA-AUG": "BY-AUG", + "BA-SEP": "BY-SEP", + "BA-OCT": "BY-OCT", + "BA-NOV": "BY-NOV", + "AS": "YS", + "AS-DEC": "YS-DEC", + "AS-JAN": "YS-JAN", + "AS-FEB": "YS-FEB", + "AS-MAR": "YS-MAR", + "AS-APR": "YS-APR", + "AS-MAY": "YS-MAY", + "AS-JUN": "YS-JUN", + "AS-JUL": "YS-JUL", + "AS-AUG": "YS-AUG", + "AS-SEP": "YS-SEP", + "AS-OCT": "YS-OCT", + "AS-NOV": "YS-NOV", + "BAS": "BYS", + "BAS-DEC": "BYS-DEC", + "BAS-JAN": "BYS-JAN", + "BAS-FEB": "BYS-FEB", + "BAS-MAR": "BYS-MAR", + "BAS-APR": "BYS-APR", + "BAS-MAY": "BYS-MAY", + "BAS-JUN": "BYS-JUN", + "BAS-JUL": "BYS-JUL", + "BAS-AUG": "BYS-AUG", + "BAS-SEP": "BYS-SEP", + "BAS-OCT": "BYS-OCT", + "BAS-NOV": "BYS-NOV", "H": "h", "BH": "bh", "CBH": "cbh", diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index ad37add17967d..a726c735bf9a1 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -253,8 +253,8 @@ def get_start_end_field( # month of year. Other offsets use month, startingMonth as ending # month of year. 
- if (freqstr[0:2] in ["MS", "QS", "AS"]) or ( - freqstr[1:3] in ["MS", "QS", "AS"]): + if (freqstr[0:2] in ["MS", "QS", "YS"]) or ( + freqstr[1:3] in ["MS", "QS", "YS"]): end_month = 12 if month_kw == 1 else month_kw - 1 start_month = month_kw else: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 042d5dafe3046..6a6f30de8dade 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -2414,7 +2414,7 @@ cdef class BYearEnd(YearOffset): _outputName = "BusinessYearEnd" _default_month = 12 - _prefix = "BA" + _prefix = "BY" _day_opt = "business_end" @@ -2453,7 +2453,7 @@ cdef class BYearBegin(YearOffset): _outputName = "BusinessYearBegin" _default_month = 1 - _prefix = "BAS" + _prefix = "BYS" _day_opt = "business_start" @@ -2552,7 +2552,7 @@ cdef class YearBegin(YearOffset): """ _default_month = 1 - _prefix = "AS" + _prefix = "YS" _day_opt = "start" @@ -4540,10 +4540,10 @@ CDay = CustomBusinessDay prefix_mapping = { offset._prefix: offset for offset in [ - YearBegin, # 'AS' + YearBegin, # 'YS' YearEnd, # 'Y' - BYearBegin, # 'BAS' - BYearEnd, # 'BA' + BYearBegin, # 'BYS' + BYearEnd, # 'BY' BusinessDay, # 'B' BusinessMonthBegin, # 'BMS' BusinessMonthEnd, # 'BM' @@ -4584,12 +4584,9 @@ _lite_rule_alias = { "Q": "Q-DEC", "Y": "Y-DEC", # YearEnd(month=12), - "AS": "AS-JAN", # YearBegin(month=1), - "YS": "AS-JAN", - "BA": "BA-DEC", # BYearEnd(month=12), - "BY": "BA-DEC", - "BAS": "BAS-JAN", # BYearBegin(month=1), - "BYS": "BAS-JAN", + "YS": "YS-JAN", # YearBegin(month=1), + "BY": "BY-DEC", # BYearEnd(month=12), + "BYS": "BYS-JAN", # BYearBegin(month=1), "Min": "min", "min": "min", diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 60c42c01e9f6f..c91f892936640 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2526,7 +2526,7 @@ def _round_temporally( raise ValueError(f"Must specify a valid frequency: {freq}") pa_supported_unit = { "Y": "year", - "AS": "year", + "YS": "year", "Q": "quarter", "QS": "quarter", "M": "month", diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index f3b2a35f379f4..12f93cf482a1d 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -992,11 +992,11 @@ def date_range( **Specify a unit** - >>> pd.date_range(start="2017-01-01", periods=10, freq="100AS", unit="s") + >>> pd.date_range(start="2017-01-01", periods=10, freq="100YS", unit="s") DatetimeIndex(['2017-01-01', '2117-01-01', '2217-01-01', '2317-01-01', '2417-01-01', '2517-01-01', '2617-01-01', '2717-01-01', '2817-01-01', '2917-01-01'], - dtype='datetime64[s]', freq='100AS-JAN') + dtype='datetime64[s]', freq='100YS-JAN') """ if freq is None and com.any_none(periods, start, end): freq = "D" diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 59e6a20915c18..8b3071a6f8582 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2101,7 +2101,7 @@ def __init__( else: freq = to_offset(freq) - end_types = {"ME", "Y", "Q", "BM", "BA", "BQ", "W"} + end_types = {"ME", "Y", "Q", "BM", "BY", "BQ", "W"} rule = freq.rule_code if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types): if closed is None: @@ -2299,7 +2299,7 @@ def _adjust_bin_edges( if self.freq.name in ("BM", "ME", "W") or self.freq.name.split("-")[0] in ( "BQ", - "BA", + "BY", "Q", "Y", "W", diff --git a/pandas/core/series.py b/pandas/core/series.py index c2eea371ddef3..fdd03debf6de4 100644 --- a/pandas/core/series.py +++ 
b/pandas/core/series.py @@ -5729,7 +5729,7 @@ def to_timestamp( 2023-01-01 1 2024-01-01 2 2025-01-01 3 - Freq: AS-JAN, dtype: int64 + Freq: YS-JAN, dtype: int64 Using `freq` which is the offset that the Timestamps will have diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index df6ccda27ab85..693b8d9483407 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1594,7 +1594,7 @@ def test_dt64arr_add_sub_offset_array( Timestamp("2016-04-01"), Timestamp("2017-04-01"), ], - "AS-APR", + "YS-APR", ), ( "__sub__", @@ -1616,7 +1616,7 @@ def test_dt64arr_add_sub_offset_array( Timestamp("2015-10-01"), Timestamp("2016-10-01"), ], - "AS-OCT", + "YS-OCT", ), ], ) @@ -1625,7 +1625,7 @@ def test_dti_add_sub_nonzero_mth_offset( ): # GH 26258 tz = tz_aware_fixture - date = date_range(start="01 Jan 2014", end="01 Jan 2017", freq="AS", tz=tz) + date = date_range(start="01 Jan 2014", end="01 Jan 2017", freq="YS", tz=tz) date = tm.box_expected(date, box_with_array, False) mth = getattr(date, op) result = mth(offset) diff --git a/pandas/tests/frame/methods/test_to_timestamp.py b/pandas/tests/frame/methods/test_to_timestamp.py index aeb65d98d8ab2..859dc56de4a25 100644 --- a/pandas/tests/frame/methods/test_to_timestamp.py +++ b/pandas/tests/frame/methods/test_to_timestamp.py @@ -44,7 +44,7 @@ def test_to_timestamp(self, frame_or_series): if frame_or_series is Series: assert result.name == "A" - exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") + exp_index = date_range("1/1/2001", end="1/1/2009", freq="YS-JAN") result = obj.to_timestamp("D", "start") tm.assert_index_equal(result.index, exp_index) @@ -88,7 +88,7 @@ def test_to_timestamp_columns(self): tm.assert_index_equal(result.columns, exp_index) tm.assert_numpy_array_equal(result.values, df.values) - exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") + exp_index = date_range("1/1/2001", end="1/1/2009", freq="YS-JAN") result = df.to_timestamp("D", "start", axis=1) tm.assert_index_equal(result.columns, exp_index) @@ -112,14 +112,14 @@ def test_to_timestamp_columns(self): result1 = df.to_timestamp("5min", axis=1) result2 = df.to_timestamp("min", axis=1) - expected = date_range("2001-01-01", "2009-01-01", freq="AS") + expected = date_range("2001-01-01", "2009-01-01", freq="YS") assert isinstance(result1.columns, DatetimeIndex) assert isinstance(result2.columns, DatetimeIndex) tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8) tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8) # PeriodIndex.to_timestamp always use 'infer' - assert result1.columns.freqstr == "AS-JAN" - assert result2.columns.freqstr == "AS-JAN" + assert result1.columns.freqstr == "YS-JAN" + assert result2.columns.freqstr == "YS-JAN" def test_to_timestamp_invalid_axis(self): index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 88ee8a35e5c94..76a543050097d 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -734,7 +734,7 @@ def test_list_grouper_with_nat(self): # GH 14715 df = DataFrame({"date": date_range("1/1/2011", periods=365, freq="D")}) df.iloc[-1] = pd.NaT - grouper = Grouper(key="date", freq="AS") + grouper = Grouper(key="date", freq="YS") # Grouper in a list grouping result = df.groupby([grouper]) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py 
b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 8900c5cdbca14..6839fafcdc114 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -56,7 +56,7 @@ def test_to_period_quarterlyish(self, off): prng = rng.to_period() assert prng.freq == "Q-DEC" - @pytest.mark.parametrize("off", ["BA", "AS", "BAS"]) + @pytest.mark.parametrize("off", ["BY", "YS", "BYS"]) def test_to_period_annualish(self, off): rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 6da215715482d..077b4fa5a0696 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -648,7 +648,7 @@ def test_constructor_coverage(self): with pytest.raises(ValueError, match=msg): date_range(periods=10, freq="D") - @pytest.mark.parametrize("freq", ["AS", "W-SUN"]) + @pytest.mark.parametrize("freq", ["YS", "W-SUN"]) def test_constructor_datetime64_tzformat(self, freq): # see GH#6572: ISO 8601 format results in stdlib timezone object idx = date_range( @@ -981,8 +981,8 @@ def test_dti_constructor_years_only(self, tz_naive_fixture): rng3 = date_range("2014", "2020", freq="Y", tz=tz) expected3 = date_range("2014-12-31", "2019-12-31", freq="Y", tz=tz) - rng4 = date_range("2014", "2020", freq="AS", tz=tz) - expected4 = date_range("2014-01-01", "2020-01-01", freq="AS", tz=tz) + rng4 = date_range("2014", "2020", freq="YS", tz=tz) + expected4 = date_range("2014-01-01", "2020-01-01", freq="YS", tz=tz) for rng, expected in [ (rng1, expected1), diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index ededf78621699..a74d31747fbb0 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -243,13 +243,12 @@ def test_date_range_gen_error(self): rng = date_range("1/1/2000 00:00", "1/1/2000 00:18", freq="5min") assert len(rng) == 4 - @pytest.mark.parametrize("freq", ["AS", "YS"]) - def test_begin_year_alias(self, freq): + def test_begin_year_alias(self): # see gh-9313 - rng = date_range("1/1/2013", "7/1/2017", freq=freq) + rng = date_range("1/1/2013", "7/1/2017", freq="YS") exp = DatetimeIndex( ["2013-01-01", "2014-01-01", "2015-01-01", "2016-01-01", "2017-01-01"], - freq=freq, + freq="YS", ) tm.assert_index_equal(rng, exp) @@ -261,12 +260,11 @@ def test_end_year_alias(self): ) tm.assert_index_equal(rng, exp) - @pytest.mark.parametrize("freq", ["BA", "BY"]) - def test_business_end_year_alias(self, freq): + def test_business_end_year_alias(self): # see gh-9313 - rng = date_range("1/1/2013", "7/1/2017", freq=freq) + rng = date_range("1/1/2013", "7/1/2017", freq="BY") exp = DatetimeIndex( - ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-30"], freq=freq + ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-30"], freq="BY" ) tm.assert_index_equal(rng, exp) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 156075e3fafec..a18501a193b60 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -1,5 +1,6 @@ import datetime as dt from datetime import date +import re import dateutil import numpy as np @@ -226,3 +227,40 @@ def test_CBH_deprecated(self): ) tm.assert_index_equal(result, expected) + + 
@pytest.mark.parametrize( + "freq_depr, expected_values, expected_freq", + [ + ( + "2BA", + ["2020-12-31", "2022-12-30"], + "2BY-DEC", + ), + ( + "AS-AUG", + ["2021-08-01", "2022-08-01", "2023-08-01"], + "YS-AUG", + ), + ( + "1BAS-MAY", + ["2021-05-03", "2022-05-02", "2023-05-01"], + "1BYS-MAY", + ), + ], + ) + def test_AS_BA_BAS_deprecated(self, freq_depr, expected_values, expected_freq): + # GH#55479 + freq_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] + msg = f"'{freq_msg}' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range( + dt.datetime(2020, 12, 1), dt.datetime(2023, 12, 1), freq=freq_depr + ) + result = DatetimeIndex( + expected_values, + dtype="datetime64[ns]", + freq=expected_freq, + ) + + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 185134af165f4..0a5287d154adc 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -146,7 +146,7 @@ def test_datetimeindex_accessors5(self): qsfeb = to_offset("QS-FEB") bq = to_offset("BQ") bqs_apr = to_offset("BQS-APR") - as_nov = to_offset("AS-NOV") + as_nov = to_offset("YS-NOV") tests = [ (freq_m.is_month_start(Timestamp("2013-06-01")), 1), diff --git a/pandas/tests/indexes/period/methods/test_to_timestamp.py b/pandas/tests/indexes/period/methods/test_to_timestamp.py index 2394efb353ab6..977ad8b26a369 100644 --- a/pandas/tests/indexes/period/methods/test_to_timestamp.py +++ b/pandas/tests/indexes/period/methods/test_to_timestamp.py @@ -49,7 +49,7 @@ def test_to_timestamp_non_contiguous(self): def test_to_timestamp_freq(self): idx = period_range("2017", periods=12, freq="Y-DEC") result = idx.to_timestamp() - expected = date_range("2017", periods=12, freq="AS-JAN") + expected = date_range("2017", periods=12, freq="YS-JAN") tm.assert_index_equal(result, expected) def test_to_timestamp_pi_nat(self): diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 28d02576156a0..f66f5bf50974e 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1172,7 +1172,7 @@ def test_resample_anchored_intraday(simple_date_range_series, unit): assert len(resampled) == 1 -@pytest.mark.parametrize("freq", ["MS", "BMS", "QS-MAR", "AS-DEC", "AS-JUN"]) +@pytest.mark.parametrize("freq", ["MS", "BMS", "QS-MAR", "YS-DEC", "YS-JUN"]) def test_resample_anchored_monthstart(simple_date_range_series, freq, unit): ts = simple_date_range_series("1/1/2000", "12/31/2002") ts.index = ts.index.as_unit(unit) @@ -1320,7 +1320,7 @@ def test_resample_unequal_times(unit): df = DataFrame({"close": 1}, index=bad_ind) # it works! 
- df.resample("AS").sum() + df.resample("YS").sum() def test_resample_consistency(unit): diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index d214e1b4ae4ae..6ad09f12525b4 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -660,7 +660,7 @@ def test_default_right_closed_label(self, from_freq, to_freq): @pytest.mark.parametrize( "from_freq, to_freq", - [("D", "MS"), ("Q", "AS"), ("ME", "QS"), ("h", "D"), ("min", "h")], + [("D", "MS"), ("Q", "YS"), ("ME", "QS"), ("h", "D"), ("min", "h")], ) def test_default_left_closed_label(self, from_freq, to_freq): idx = date_range(start="8/15/2012", periods=100, freq=from_freq) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 51d0dd298f841..22ff7f8405a40 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -52,7 +52,7 @@ def base_delta_code_pair(request): freqs = ( [f"Q-{month}" for month in MONTHS] - + [f"{annual}-{month}" for annual in ["Y", "BA"] for month in MONTHS] + + [f"{annual}-{month}" for annual in ["Y", "BY"] for month in MONTHS] + ["ME", "BM", "BMS"] + [f"WOM-{count}{day}" for count in range(1, 5) for day in DAYS] + [f"W-{day}" for day in DAYS] @@ -215,7 +215,7 @@ def test_infer_freq_index(freq, expected): "expected,dates", list( { - "AS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"], + "YS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"], "Q-OCT": ["2009-01-31", "2009-04-30", "2009-07-31", "2009-10-31"], "ME": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], "W-SAT": ["2010-12-25", "2011-01-01", "2011-01-08", "2011-01-15"], diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 7f96ea98fa047..9389f78c9e672 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -839,7 +839,7 @@ def test_rule_code(self): "NOV", "DEC", ] - base_lst = ["Y", "AS", "BA", "BAS", "Q", "QS", "BQ", "BQS"] + base_lst = ["Y", "YS", "BY", "BYS", "Q", "QS", "BQ", "BQS"] for base in base_lst: for v in suffix_lst: alias = "-".join([base, v]) @@ -858,7 +858,7 @@ def test_freq_offsets(): class TestReprNames: def test_str_for_named_is_name(self): # look at all the amazing combinations! 
- month_prefixes = ["Y", "AS", "BA", "BAS", "Q", "BQ", "BQS", "QS"] + month_prefixes = ["Y", "YS", "BY", "BYS", "Q", "BQ", "BQS", "QS"] names = [ prefix + "-" + month for prefix in month_prefixes diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 0ed0fe4b87576..db4fdf0d24465 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -59,7 +59,7 @@ # -------------------------------------------------------------------- # Offset related functions -_need_suffix = ["QS", "BQ", "BQS", "YS", "AS", "BY", "BA", "BYS", "BAS"] +_need_suffix = ["QS", "BQ", "BQS", "YS", "BY", "BYS"] for _prefix in _need_suffix: for _m in MONTHS: @@ -345,7 +345,7 @@ def _get_annual_rule(self) -> str | None: if pos_check is None: return None else: - return {"cs": "AS", "bs": "BAS", "ce": "Y", "be": "BA"}.get(pos_check) + return {"cs": "YS", "bs": "BYS", "ce": "Y", "be": "BY"}.get(pos_check) def _get_quarterly_rule(self) -> str | None: if len(self.mdiffs) > 1: From 7e8148f4ff44482b8541959d5e095fef00adad23 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 12 Oct 2023 08:08:25 -0700 Subject: [PATCH 04/19] REF: De-special-case _format_with_header (#55491) * REF: De-special-case _format_with_header * simplify --- pandas/core/indexes/base.py | 16 ++++++++++++---- pandas/core/indexes/category.py | 10 ---------- pandas/core/indexes/datetimelike.py | 1 + pandas/core/indexes/interval.py | 7 ------- 4 files changed, 13 insertions(+), 21 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d749235e2cd2c..515f750f11219 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1387,12 +1387,20 @@ def _format_with_header(self, *, header: list[str_t], na_rep: str_t) -> list[str values = self._values - if is_object_dtype(values.dtype) or is_string_dtype(values.dtype): - values = np.asarray(values) + if ( + is_object_dtype(values.dtype) + or is_string_dtype(values.dtype) + or isinstance(self.dtype, (IntervalDtype, CategoricalDtype)) + ): # TODO: why do we need different justify for these cases? 
- result = trim_front(format_array(values, None, justify="all")) + justify = "all" else: - result = trim_front(format_array(values, None, justify="left")) + justify = "left" + # passing leading_space=False breaks test_format_missing, + # test_index_repr_in_frame_with_nan, but would otherwise make + # trim_front unnecessary + formatted = format_array(values, None, justify=justify) + result = trim_front(formatted) return header + result def _format_native_types( diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 9cf7e861584d9..b307be004ad6e 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -21,7 +21,6 @@ from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, - notna, ) from pandas.core.arrays.categorical import ( @@ -38,8 +37,6 @@ inherit_names, ) -from pandas.io.formats.printing import pprint_thing - if TYPE_CHECKING: from collections.abc import Hashable @@ -356,13 +353,6 @@ def _format_attrs(self): extra = super()._format_attrs() return attrs + extra - def _format_with_header(self, *, header: list[str], na_rep: str) -> list[str]: - result = [ - pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep - for x in self._values - ] - return header + result - # -------------------------------------------------------------------- @property diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 94ad556219b35..f02f7dcb65251 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -216,6 +216,7 @@ def format( def _format_with_header( self, *, header: list[str], na_rep: str, date_format: str | None = None ) -> list[str]: + # TODO: not reached in tests 2023-10-11 # matches base class except for whitespace padding and date_format return header + list( self._format_native_types(na_rep=na_rep, date_format=date_format) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 209ac84869e85..f73e2b5512262 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -842,13 +842,6 @@ def mid(self) -> Index: def length(self) -> Index: return Index(self._data.length, copy=False) - # -------------------------------------------------------------------- - # Rendering Methods - - def _format_with_header(self, *, header: list[str], na_rep: str) -> list[str]: - # matches base class except for whitespace padding - return header + list(self._format_native_types(na_rep=na_rep)) - # -------------------------------------------------------------------- # Set Operations From 7ef617e88e1e51bbfd5d0cf71f57ef4502b91b38 Mon Sep 17 00:00:00 2001 From: shiersansi <143710553+shiersansi@users.noreply.github.com> Date: Thu, 12 Oct 2023 23:10:59 +0800 Subject: [PATCH 05/19] DOC: Add DataFrame.index.levels (#55437) * modified: pandas/core/indexes/multi.py * modified: pandas/core/indexes/multi.py * modified: pandas/core/indexes/multi.py * modified: pandas/core/indexes/multi.py * modified: pandas/core/indexes/multi.py * modified: pandas/core/indexes/multi.py * modified: pandas/core/indexes/multi.py * modified: pandas/core/indexes/multi.py * modified: pandas/core/indexes/multi.py * modified: pandas/core/indexes/multi.py * modified: pandas/core/indexes/multi.py * modified: pandas/core/indexes/multi.py * modified: pandas/core/indexes/multi.py * modified: pandas/core/indexes/multi.py * modified: pandas/core/indexes/multi.py * modified: ../pandas/core/indexes/multi.py * modified: ../pandas/core/indexes/multi.py * 
modified: pandas/core/indexes/multi.py
* modified: pandas/core/indexes/multi.py
* modified: pandas/core/indexes/multi.py
* modified: pandas/core/indexes/multi.py
* modified: pandas/core/indexes/multi.py
* modified: pandas/core/indexes/multi.py
* modified: pandas/core/indexes/multi.py
* modified: pandas/core/indexes/multi.py
* modified: pandas/core/indexes/multi.py
* modified: pandas/core/indexes/multi.py
* modified: pandas/core/indexes/multi.py
* Update pandas/core/indexes/multi.py

---------

Co-authored-by: Marc Garcia
---
 pandas/core/indexes/multi.py | 47 ++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index c5cab225fa7b1..0c3593eca178d 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -843,6 +843,53 @@ def size(self) -> int:
 
     @cache_readonly
     def levels(self) -> FrozenList:
+        """
+        Levels of the MultiIndex.
+
+        Levels refer to the different hierarchical levels or layers in a MultiIndex.
+        In a MultiIndex, each level represents a distinct dimension or category of
+        the index.
+
+        To access the levels, you can use the levels attribute of the MultiIndex,
+        which returns a FrozenList of Index objects. Each Index object represents a
+        level in the MultiIndex and contains the unique values found in that
+        specific level.
+
+        If a MultiIndex is created with levels A, B, C, and the DataFrame using
+        it filters out all rows of level C, MultiIndex.levels will still
+        return A, B, C.
+
+        Examples
+        --------
+        >>> index = pd.MultiIndex.from_product([['mammal'],
+        ...     ('goat', 'human', 'cat', 'dog')], names=['Category', 'Animals'])
+        >>> leg_num = pd.DataFrame(data=(4, 2, 4, 4), index=index, columns=['Legs'])
+        >>> leg_num
+                          Legs
+        Category Animals
+        mammal   goat        4
+                 human       2
+                 cat         4
+                 dog         4
+
+        >>> leg_num.index.levels
+        FrozenList([['mammal'], ['cat', 'dog', 'goat', 'human']])
+
+        MultiIndex levels will not change even if the DataFrame using the MultiIndex
+        does not contain all of them anymore.
+ See how "human" is not in the DataFrame, but it is still in levels: + + >>> large_leg_num = leg_num[leg_num.Legs > 2] + >>> large_leg_num + Legs + Category Animals + mammal goat 4 + cat 4 + dog 4 + + >>> large_leg_num.index.levels + FrozenList([['mammal'], ['cat', 'dog', 'goat', 'human']]) + """ # Use cache_readonly to ensure that self.get_locs doesn't repeatedly # create new IndexEngine # https://github.com/pandas-dev/pandas/issues/31648 From ae177e88472a0f71e4fa8d41e773f6fb7029a8dc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Oct 2023 05:12:37 -1000 Subject: [PATCH 06/19] TST: Close ad-hoc db connections (#55445) * TST: Close ad-hoc db connections * Add runtime import --- pandas/tests/io/test_sql.py | 227 +++++++++++++++++------------------- 1 file changed, 105 insertions(+), 122 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 82fb98615100f..11f95ff104767 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1,20 +1,3 @@ -"""SQL io tests - -The SQL tests are broken down in different classes: - -- `PandasSQLTest`: base class with common methods for all test classes -- Tests for the public API (only tests with sqlite3) - - `_TestSQLApi` base class - - `TestSQLApi`: test the public API with sqlalchemy engine - - `TestSQLiteFallbackApi`: test the public API with a sqlite DBAPI - connection -- Tests for the different SQL flavors (flavor specific type conversions) - - Tests for the sqlalchemy mode: `_TestSQLAlchemy` is the base class with - common methods. The different tested flavors (sqlite3, MySQL, - PostgreSQL) derive from the base class - - Tests for the fallback mode (`TestSQLiteFallback`) - -""" from __future__ import annotations import contextlib @@ -29,6 +12,7 @@ from io import StringIO from pathlib import Path import sqlite3 +from typing import TYPE_CHECKING import uuid import numpy as np @@ -69,13 +53,9 @@ read_sql_table, ) -try: +if TYPE_CHECKING: import sqlalchemy - SQLALCHEMY_INSTALLED = True -except ImportError: - SQLALCHEMY_INSTALLED = False - @pytest.fixture def sql_strings(): @@ -426,13 +406,16 @@ def drop_table( conn.commit() else: with conn.begin() as con: - sql.SQLDatabase(con).drop_table(table_name) + with sql.SQLDatabase(con) as db: + db.drop_table(table_name) def drop_view( view_name: str, conn: sqlite3.Connection | sqlalchemy.engine.Engine | sqlalchemy.engine.Connection, ): + import sqlalchemy + if isinstance(conn, sqlite3.Connection): conn.execute(f"DROP VIEW IF EXISTS {sql._get_valid_sqlite_name(view_name)}") conn.commit() @@ -1151,10 +1134,9 @@ def test_read_sql_iris_parameter(conn, request, sql_strings, flavor): conn = request.getfixturevalue(conn) query = sql_strings["read_parameters"][flavor(conn_name)] params = ("Iris-setosa", 5.1) - pandasSQL = pandasSQL_builder(conn) - - with pandasSQL.run_transaction(): - iris_frame = pandasSQL.read_query(query, params=params) + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + iris_frame = pandasSQL.read_query(query, params=params) check_iris_frame(iris_frame) @@ -1179,9 +1161,9 @@ def test_read_sql_iris_no_parameter_with_percent(conn, request, sql_strings, fla conn = request.getfixturevalue(conn) query = sql_strings["read_no_parameters_with_percent"][flavor(conn_name)] - pandasSQL = pandasSQL_builder(conn) - with pandasSQL.run_transaction(): - iris_frame = pandasSQL.read_query(query, params=None) + with pandasSQL_builder(conn) as pandasSQL: + with 
pandasSQL.run_transaction(): + iris_frame = pandasSQL.read_query(query, params=None) check_iris_frame(iris_frame) @@ -1914,7 +1896,8 @@ def test_warning_case_insensitive_table_name(conn, request, test_frame1): r"sensitivity issues. Consider using lower case table names." ), ): - sql.SQLDatabase(conn).check_case_sensitive("TABLE1", "") + with sql.SQLDatabase(conn) as db: + db.check_case_sensitive("TABLE1", "") # Test that the warning is certainly NOT triggered in a normal case. with tm.assert_produces_warning(None): @@ -1930,10 +1913,10 @@ def test_sqlalchemy_type_mapping(conn, request): df = DataFrame( {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} ) - db = sql.SQLDatabase(conn) - table = sql.SQLTable("test_type", db, frame=df) - # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones - assert isinstance(table.table.c["time"].type, TIMESTAMP) + with sql.SQLDatabase(conn) as db: + table = sql.SQLTable("test_type", db, frame=df) + # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones + assert isinstance(table.table.c["time"].type, TIMESTAMP) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -1961,10 +1944,10 @@ def test_sqlalchemy_integer_mapping(conn, request, integer, expected): # GH35076 Map pandas integer to optimal SQLAlchemy integer type conn = request.getfixturevalue(conn) df = DataFrame([0, 1], columns=["a"], dtype=integer) - db = sql.SQLDatabase(conn) - table = sql.SQLTable("test_type", db, frame=df) + with sql.SQLDatabase(conn) as db: + table = sql.SQLTable("test_type", db, frame=df) - result = str(table.table.c.a.type) + result = str(table.table.c.a.type) assert result == expected @@ -1974,16 +1957,16 @@ def test_sqlalchemy_integer_overload_mapping(conn, request, integer): conn = request.getfixturevalue(conn) # GH35076 Map pandas integer to optimal SQLAlchemy integer type df = DataFrame([0, 1], columns=["a"], dtype=integer) - db = sql.SQLDatabase(conn) - with pytest.raises( - ValueError, match="Unsigned 64 bit integer datatype is not supported" - ): - sql.SQLTable("test_type", db, frame=df) + with sql.SQLDatabase(conn) as db: + with pytest.raises( + ValueError, match="Unsigned 64 bit integer datatype is not supported" + ): + sql.SQLTable("test_type", db, frame=df) -@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="fails without SQLAlchemy") @pytest.mark.parametrize("conn", all_connectable) def test_database_uri_string(conn, request, test_frame1): + pytest.importorskip("sqlalchemy") conn = request.getfixturevalue(conn) # Test read_sql and .to_sql method with a database URI (GH10654) # db_uri = 'sqlite:///:memory:' # raises @@ -2003,9 +1986,9 @@ def test_database_uri_string(conn, request, test_frame1): @td.skip_if_installed("pg8000") -@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="fails without SQLAlchemy") @pytest.mark.parametrize("conn", all_connectable) def test_pg8000_sqlalchemy_passthrough_error(conn, request): + pytest.importorskip("sqlalchemy") conn = request.getfixturevalue(conn) # using driver that will not be installed on CI to trigger error # in sqlalchemy.create_engine -> test passing of this error to user @@ -2076,7 +2059,7 @@ def test_sql_open_close(test_frame3): tm.assert_frame_equal(test_frame3, result) -@pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") +@td.skip_if_installed("sqlalchemy") def test_con_string_import_error(): conn = "mysql://root@localhost/pandas" msg = "Using URI string without sqlalchemy installed" @@ -2084,7 +2067,7 @@ def 
test_con_string_import_error(): sql.read_sql("SELECT * FROM iris", conn) -@pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") +@td.skip_if_installed("sqlalchemy") def test_con_unknown_dbapi2_class_does_not_error_without_sql_alchemy_installed(): class MockSqliteConnection: def __init__(self, *args, **kwargs) -> None: @@ -2167,20 +2150,20 @@ def test_drop_table(conn, request): from sqlalchemy import inspect temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}) - pandasSQL = sql.SQLDatabase(conn) - with pandasSQL.run_transaction(): - assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4 + with sql.SQLDatabase(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4 - insp = inspect(conn) - assert insp.has_table("temp_frame") + insp = inspect(conn) + assert insp.has_table("temp_frame") - with pandasSQL.run_transaction(): - pandasSQL.drop_table("temp_frame") - try: - insp.clear_cache() # needed with SQLAlchemy 2.0, unavailable prior - except AttributeError: - pass - assert not insp.has_table("temp_frame") + with pandasSQL.run_transaction(): + pandasSQL.drop_table("temp_frame") + try: + insp.clear_cache() # needed with SQLAlchemy 2.0, unavailable prior + except AttributeError: + pass + assert not insp.has_table("temp_frame") @pytest.mark.parametrize("conn", all_connectable) @@ -2206,9 +2189,10 @@ def test_roundtrip(conn, request, test_frame1): def test_execute_sql(conn, request): conn = request.getfixturevalue(conn) pandasSQL = pandasSQL_builder(conn) - with pandasSQL.run_transaction(): - iris_results = pandasSQL.execute("SELECT * FROM iris") - row = iris_results.fetchone() + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + iris_results = pandasSQL.execute("SELECT * FROM iris") + row = iris_results.fetchone() tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) @@ -2616,10 +2600,10 @@ def test_to_sql_save_index(conn, request): [(1, 2.1, "line1"), (2, 1.5, "line2")], columns=["A", "B", "C"], index=["A"] ) - pandasSQL = pandasSQL_builder(conn) tbl_name = "test_to_sql_saves_index" - with pandasSQL.run_transaction(): - assert pandasSQL.to_sql(df, tbl_name) == 2 + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(df, tbl_name) == 2 if conn_name in {"sqlite_buildin", "sqlite_str"}: ixs = sql.read_sql_query( @@ -2648,55 +2632,55 @@ def test_transactions(conn, request): conn = request.getfixturevalue(conn) stmt = "CREATE TABLE test_trans (A INT, B TEXT)" - pandasSQL = pandasSQL_builder(conn) if conn_name != "sqlite_buildin": from sqlalchemy import text stmt = text(stmt) - with pandasSQL.run_transaction() as trans: - trans.execute(stmt) + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction() as trans: + trans.execute(stmt) @pytest.mark.parametrize("conn", all_connectable) def test_transaction_rollback(conn, request): conn = request.getfixturevalue(conn) - pandasSQL = pandasSQL_builder(conn) - with pandasSQL.run_transaction() as trans: - stmt = "CREATE TABLE test_trans (A INT, B TEXT)" - if isinstance(pandasSQL, SQLiteDatabase): - trans.execute(stmt) - else: - from sqlalchemy import text + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction() as trans: + stmt = "CREATE TABLE test_trans (A INT, B TEXT)" + if isinstance(pandasSQL, SQLiteDatabase): + trans.execute(stmt) + else: + from sqlalchemy import text - stmt = text(stmt) - trans.execute(stmt) + stmt = 
text(stmt) + trans.execute(stmt) - class DummyException(Exception): - pass + class DummyException(Exception): + pass - # Make sure when transaction is rolled back, no rows get inserted - ins_sql = "INSERT INTO test_trans (A,B) VALUES (1, 'blah')" - if isinstance(pandasSQL, SQLDatabase): - from sqlalchemy import text + # Make sure when transaction is rolled back, no rows get inserted + ins_sql = "INSERT INTO test_trans (A,B) VALUES (1, 'blah')" + if isinstance(pandasSQL, SQLDatabase): + from sqlalchemy import text + + ins_sql = text(ins_sql) + try: + with pandasSQL.run_transaction() as trans: + trans.execute(ins_sql) + raise DummyException("error") + except DummyException: + # ignore raised exception + pass + with pandasSQL.run_transaction(): + res = pandasSQL.read_query("SELECT * FROM test_trans") + assert len(res) == 0 - ins_sql = text(ins_sql) - try: + # Make sure when transaction is committed, rows do get inserted with pandasSQL.run_transaction() as trans: trans.execute(ins_sql) - raise DummyException("error") - except DummyException: - # ignore raised exception - pass - with pandasSQL.run_transaction(): - res = pandasSQL.read_query("SELECT * FROM test_trans") - assert len(res) == 0 - - # Make sure when transaction is committed, rows do get inserted - with pandasSQL.run_transaction() as trans: - trans.execute(ins_sql) - res2 = pandasSQL.read_query("SELECT * FROM test_trans") - assert len(res2) == 1 + res2 = pandasSQL.read_query("SELECT * FROM test_trans") + assert len(res2) == 1 @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -2975,9 +2959,9 @@ def test_invalid_engine(conn, request, test_frame1): conn = request.getfixturevalue(conn) msg = "engine must be one of 'auto', 'sqlalchemy'" - pandasSQL = pandasSQL_builder(conn) - with pytest.raises(ValueError, match=msg): - pandasSQL.to_sql(test_frame1, "test_frame1", engine="bad_engine") + with pandasSQL_builder(conn) as pandasSQL: + with pytest.raises(ValueError, match=msg): + pandasSQL.to_sql(test_frame1, "test_frame1", engine="bad_engine") @pytest.mark.parametrize("conn", all_connectable) @@ -2985,10 +2969,10 @@ def test_to_sql_with_sql_engine(conn, request, test_frame1): """`to_sql` with the `engine` param""" # mostly copied from this class's `_to_sql()` method conn = request.getfixturevalue(conn) - pandasSQL = pandasSQL_builder(conn) - with pandasSQL.run_transaction(): - assert pandasSQL.to_sql(test_frame1, "test_frame1", engine="auto") == 4 - assert pandasSQL.has_table("test_frame1") + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1", engine="auto") == 4 + assert pandasSQL.has_table("test_frame1") num_entries = len(test_frame1) num_rows = count_rows(conn, "test_frame1") @@ -3000,10 +2984,10 @@ def test_options_sqlalchemy(conn, request, test_frame1): # use the set option conn = request.getfixturevalue(conn) with pd.option_context("io.sql.engine", "sqlalchemy"): - pandasSQL = pandasSQL_builder(conn) - with pandasSQL.run_transaction(): - assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4 - assert pandasSQL.has_table("test_frame1") + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4 + assert pandasSQL.has_table("test_frame1") num_entries = len(test_frame1) num_rows = count_rows(conn, "test_frame1") @@ -3015,18 +2999,18 @@ def test_options_auto(conn, request, test_frame1): # use the set option conn = request.getfixturevalue(conn) with 
pd.option_context("io.sql.engine", "auto"): - pandasSQL = pandasSQL_builder(conn) - with pandasSQL.run_transaction(): - assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4 - assert pandasSQL.has_table("test_frame1") + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4 + assert pandasSQL.has_table("test_frame1") num_entries = len(test_frame1) num_rows = count_rows(conn, "test_frame1") assert num_rows == num_entries -@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="fails without SQLAlchemy") def test_options_get_engine(): + pytest.importorskip("sqlalchemy") assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) with pd.option_context("io.sql.engine", "sqlalchemy"): @@ -3463,17 +3447,16 @@ def test_self_join_date_columns(postgresql_psycopg2_engine): def test_create_and_drop_table(sqlite_sqlalchemy_memory_engine): conn = sqlite_sqlalchemy_memory_engine temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}) - pandasSQL = sql.SQLDatabase(conn) - - with pandasSQL.run_transaction(): - assert pandasSQL.to_sql(temp_frame, "drop_test_frame") == 4 + with sql.SQLDatabase(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(temp_frame, "drop_test_frame") == 4 - assert pandasSQL.has_table("drop_test_frame") + assert pandasSQL.has_table("drop_test_frame") - with pandasSQL.run_transaction(): - pandasSQL.drop_table("drop_test_frame") + with pandasSQL.run_transaction(): + pandasSQL.drop_table("drop_test_frame") - assert not pandasSQL.has_table("drop_test_frame") + assert not pandasSQL.has_table("drop_test_frame") def test_sqlite_datetime_date(sqlite_buildin): From 9de2a19b9651336cc14a2830c8460e9ad1e2d505 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 12 Oct 2023 18:31:34 -0400 Subject: [PATCH 07/19] REF: Add tests.groupby.methods (#55312) * REF: Add tests.groupby.methods * Merge cleanup * Refactor * Refactor * Show value of ymin * fixup * Revert * Revert --- pandas/tests/groupby/methods/__init__.py | 0 pandas/tests/groupby/methods/test_corrwith.py | 24 + pandas/tests/groupby/methods/test_describe.py | 221 +++++ .../{ => methods}/test_groupby_shift_diff.py | 0 .../groupby/methods/test_is_monotonic.py | 78 ++ .../methods/test_nlargest_nsmallest.py | 115 +++ .../tests/groupby/{ => methods}/test_nth.py | 0 .../groupby/{ => methods}/test_quantile.py | 0 .../tests/groupby/{ => methods}/test_rank.py | 0 .../groupby/{ => methods}/test_sample.py | 0 .../tests/groupby/{ => methods}/test_size.py | 0 .../tests/groupby/{ => methods}/test_skew.py | 0 .../{ => methods}/test_value_counts.py | 0 pandas/tests/groupby/test_any_all.py | 188 ---- pandas/tests/groupby/test_cumulative.py | 291 ++++++ pandas/tests/groupby/test_function.py | 916 ------------------ pandas/tests/groupby/test_min_max.py | 272 ------ pandas/tests/groupby/test_nunique.py | 190 ---- pandas/tests/groupby/test_reductions.py | 838 ++++++++++++++++ 19 files changed, 1567 insertions(+), 1566 deletions(-) create mode 100644 pandas/tests/groupby/methods/__init__.py create mode 100644 pandas/tests/groupby/methods/test_corrwith.py create mode 100644 pandas/tests/groupby/methods/test_describe.py rename pandas/tests/groupby/{ => methods}/test_groupby_shift_diff.py (100%) create mode 100644 pandas/tests/groupby/methods/test_is_monotonic.py create mode 100644 pandas/tests/groupby/methods/test_nlargest_nsmallest.py rename pandas/tests/groupby/{ => 
methods}/test_nth.py (100%) rename pandas/tests/groupby/{ => methods}/test_quantile.py (100%) rename pandas/tests/groupby/{ => methods}/test_rank.py (100%) rename pandas/tests/groupby/{ => methods}/test_sample.py (100%) rename pandas/tests/groupby/{ => methods}/test_size.py (100%) rename pandas/tests/groupby/{ => methods}/test_skew.py (100%) rename pandas/tests/groupby/{ => methods}/test_value_counts.py (100%) delete mode 100644 pandas/tests/groupby/test_any_all.py create mode 100644 pandas/tests/groupby/test_cumulative.py delete mode 100644 pandas/tests/groupby/test_min_max.py delete mode 100644 pandas/tests/groupby/test_nunique.py create mode 100644 pandas/tests/groupby/test_reductions.py diff --git a/pandas/tests/groupby/methods/__init__.py b/pandas/tests/groupby/methods/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/groupby/methods/test_corrwith.py b/pandas/tests/groupby/methods/test_corrwith.py new file mode 100644 index 0000000000000..53e8bdc4534dc --- /dev/null +++ b/pandas/tests/groupby/methods/test_corrwith.py @@ -0,0 +1,24 @@ +import numpy as np + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +def test_corrwith_with_1_axis(): + # GH 47723 + df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]}) + gb = df.groupby("a") + + msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.corrwith(df, axis=1) + index = Index( + data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)], + name=("a", None), + ) + expected = Series([np.nan] * 6, index=index) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py new file mode 100644 index 0000000000000..f38de8faddb59 --- /dev/null +++ b/pandas/tests/groupby/methods/test_describe.py @@ -0,0 +1,221 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Timestamp, +) +import pandas._testing as tm + + +def test_apply_describe_bug(mframe): + grouped = mframe.groupby(level="first") + grouped.describe() # it works! 
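+
+# Illustrative sketch, not part of the original test file: the tests below
+# assert against the frame that GroupBy.describe() returns, e.g.
+#
+#   >>> df = DataFrame({"key": ["a", "a", "b"], "val": [1.0, 3.0, 5.0]})
+#   >>> df.groupby("key")["val"].describe().loc["a", ["count", "mean"]]
+#   count    2.0
+#   mean     2.0
+#   Name: a, dtype: float64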
+ + +def test_series_describe_multikey(): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False) + tm.assert_series_equal(result["std"], grouped.std(), check_names=False) + tm.assert_series_equal(result["min"], grouped.min(), check_names=False) + + +def test_series_describe_single(): + ts = tm.makeTimeSeries() + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x.describe()) + expected = grouped.describe().stack(future_stack=True) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]]) +def test_series_describe_as_index(as_index, keys): + # GH#49256 + df = DataFrame( + { + "key1": ["one", "two", "two", "three", "two"], + "key2": ["one", "two", "two", "three", "two"], + "foo2": [1, 2, 4, 4, 6], + } + ) + gb = df.groupby(keys, as_index=as_index)["foo2"] + result = gb.describe() + expected = DataFrame( + { + "key1": ["one", "three", "two"], + "count": [1.0, 1.0, 3.0], + "mean": [1.0, 4.0, 4.0], + "std": [np.nan, np.nan, 2.0], + "min": [1.0, 4.0, 2.0], + "25%": [1.0, 4.0, 3.0], + "50%": [1.0, 4.0, 4.0], + "75%": [1.0, 4.0, 5.0], + "max": [1.0, 4.0, 6.0], + } + ) + if len(keys) == 2: + expected.insert(1, "key2", expected["key1"]) + if as_index: + expected = expected.set_index(keys) + tm.assert_frame_equal(result, expected) + + +def test_frame_describe_multikey(tsframe): + grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + desc_groups = [] + for col in tsframe: + group = grouped[col].describe() + # GH 17464 - Remove duplicate MultiIndex levels + group_col = MultiIndex( + levels=[[col], group.columns], + codes=[[0] * len(group.columns), range(len(group.columns))], + ) + group = DataFrame(group.values, columns=group_col, index=group.index) + desc_groups.append(group) + expected = pd.concat(desc_groups, axis=1) + tm.assert_frame_equal(result, expected) + + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) + result = groupedT.describe() + expected = tsframe.describe().T + # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/ + expected.index = MultiIndex( + levels=[[0, 1], expected.index], + codes=[[0, 0, 1, 1], range(len(expected.index))], + ) + tm.assert_frame_equal(result, expected) + + +def test_frame_describe_tupleindex(): + # GH 14848 - regression from 0.19.0 to 0.19.1 + df1 = DataFrame( + { + "x": [1, 2, 3, 4, 5] * 3, + "y": [10, 20, 30, 40, 50] * 3, + "z": [100, 200, 300, 400, 500] * 3, + } + ) + df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 + df2 = df1.rename(columns={"k": "key"}) + msg = "Names should be list-like for a MultiIndex" + with pytest.raises(ValueError, match=msg): + df1.groupby("k").describe() + with pytest.raises(ValueError, match=msg): + df2.groupby("key").describe() + + +def test_frame_describe_unstacked_format(): + # GH 4792 + prices = { + Timestamp("2011-01-06 10:59:05", tz=None): 24990, + Timestamp("2011-01-06 12:43:33", tz=None): 25499, + Timestamp("2011-01-06 12:54:09", tz=None): 25499, + } + volumes = { + Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, + Timestamp("2011-01-06 12:43:33", tz=None): 5000000000, + Timestamp("2011-01-06 12:54:09", tz=None): 100000000, + } + df = DataFrame({"PRICE": prices, "VOLUME": volumes}) + result = 
df.groupby("PRICE").VOLUME.describe() + data = [ + df[df.PRICE == 24990].VOLUME.describe().values.tolist(), + df[df.PRICE == 25499].VOLUME.describe().values.tolist(), + ] + expected = DataFrame( + data, + index=Index([24990, 25499], name="PRICE"), + columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.filterwarnings( + "ignore:" + "indexing past lexsort depth may impact performance:" + "pandas.errors.PerformanceWarning" +) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) +def test_describe_with_duplicate_output_column_names(as_index, keys): + # GH 35314 + df = DataFrame( + { + "a1": [99, 99, 99, 88, 88, 88], + "a2": [99, 99, 99, 88, 88, 88], + "b": [1, 2, 3, 4, 5, 6], + "c": [10, 20, 30, 40, 50, 60], + }, + columns=["a1", "a2", "b", "b"], + copy=False, + ) + if keys == ["a1"]: + df = df.drop(columns="a2") + + expected = ( + DataFrame.from_records( + [ + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ], + ) + .set_index([0, 1]) + .T + ) + expected.columns.names = [None, None] + if len(keys) == 2: + expected.index = MultiIndex( + levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"] + ) + else: + expected.index = Index([88, 99], name="a1") + + if not as_index: + expected = expected.reset_index() + + result = df.groupby(keys, as_index=as_index).describe() + + tm.assert_frame_equal(result, expected) + + +def test_describe_duplicate_columns(): + # GH#50806 + df = DataFrame([[0, 1, 2, 3]]) + df.columns = [0, 1, 2, 0] + gb = df.groupby(df[1]) + result = gb.describe(percentiles=[]) + + columns = ["count", "mean", "std", "min", "50%", "max"] + frames = [ + DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) + for val in (0.0, 2.0, 3.0) + ] + expected = pd.concat(frames, axis=1) + expected.columns = MultiIndex( + levels=[[0, 2], columns], + codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], + ) + expected.index.names = [1] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/methods/test_groupby_shift_diff.py similarity index 100% rename from pandas/tests/groupby/test_groupby_shift_diff.py rename to pandas/tests/groupby/methods/test_groupby_shift_diff.py diff --git a/pandas/tests/groupby/methods/test_is_monotonic.py b/pandas/tests/groupby/methods/test_is_monotonic.py new file mode 100644 index 0000000000000..3428fc90f6e51 --- /dev/null +++ b/pandas/tests/groupby/methods/test_is_monotonic.py @@ -0,0 +1,78 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly increasing (T), strictly decreasing (F), + # abs val increasing (F), non-strictly increasing (T) + ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], + [True, False, True, False], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, 
np.nan, 5, -np.inf, 1, np.nan],
+            [False, False, False, False],
+        ),
+    ],
+)
+def test_is_monotonic_increasing(in_vals, out_vals):
+    # GH 17015
+    source_dict = {
+        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
+        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
+        "C": in_vals,
+    }
+    df = DataFrame(source_dict)
+    result = df.groupby("B").C.is_monotonic_increasing
+    index = Index(list("abcd"), name="B")
+    expected = Series(index=index, data=out_vals, name="C")
+    tm.assert_series_equal(result, expected)
+
+    # Also check result equal to manually taking x.is_monotonic_increasing.
+    expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "in_vals, out_vals",
+    [
+        # Basics: strictly decreasing (T), strictly increasing (F),
+        # abs val decreasing (F), non-strictly decreasing (T)
+        ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
+        # Test with inf vals
+        (
+            [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
+            [True, True, False, True],
+        ),
+        # Test with nan vals; should always be False
+        (
+            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
+            [False, False, False, False],
+        ),
+    ],
+)
+def test_is_monotonic_decreasing(in_vals, out_vals):
+    # GH 17015
+    source_dict = {
+        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
+        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
+        "C": in_vals,
+    }
+
+    df = DataFrame(source_dict)
+    result = df.groupby("B").C.is_monotonic_decreasing
+    index = Index(list("abcd"), name="B")
+    expected = Series(index=index, data=out_vals, name="C")
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/methods/test_nlargest_nsmallest.py b/pandas/tests/groupby/methods/test_nlargest_nsmallest.py
new file mode 100644
index 0000000000000..bf983f04a3f3f
--- /dev/null
+++ b/pandas/tests/groupby/methods/test_nlargest_nsmallest.py
@@ -0,0 +1,115 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    MultiIndex,
+    Series,
+    date_range,
+)
+import pandas._testing as tm
+
+
+def test_nlargest():
+    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
+    b = Series(list("a" * 5 + "b" * 5))
+    gb = a.groupby(b)
+    r = gb.nlargest(3)
+    e = Series(
+        [7, 5, 3, 10, 9, 6],
+        index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]),
+    )
+    tm.assert_series_equal(r, e)
+
+    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
+    gb = a.groupby(b)
+    e = Series(
+        [3, 2, 1, 3, 3, 2],
+        index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]),
+    )
+    tm.assert_series_equal(gb.nlargest(3, keep="last"), e)
+
+
+def test_nlargest_mi_grouper():
+    # see gh-21411
+    npr = np.random.default_rng(2)
+
+    dts = date_range("20180101", periods=10)
+    iterables = [dts, ["one", "two"]]
+
+    idx = MultiIndex.from_product(iterables, names=["first", "second"])
+    s = Series(npr.standard_normal(20), index=idx)
+
+    result = s.groupby("first").nlargest(1)
+
+    exp_idx = MultiIndex.from_tuples(
+        [
+            (dts[0], dts[0], "one"),
+            (dts[1], dts[1], "one"),
+            (dts[2], dts[2], "one"),
+            (dts[3], dts[3], "two"),
+            (dts[4], dts[4], "one"),
+            (dts[5], dts[5], "one"),
+            (dts[6], dts[6], "one"),
+            (dts[7], dts[7], "one"),
+            (dts[8], dts[8], "one"),
+            (dts[9], dts[9], "one"),
+        ],
+        names=["first", "first", "second"],
+    )
+
+    exp_values = [
+        0.18905338179353307,
+        -0.41306354339189344,
+        1.799707382720902,
+        0.7738065867276614,
+        0.28121066979764925,
+        0.9775674511260357,
+        -0.3288239040579627,
+
0.45495807124085547, + 0.5452887139646817, + 0.12682784711186987, + ] + + expected = Series(exp_values, index=exp_idx) + tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3) + + +def test_nsmallest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list("a" * 5 + "b" * 5)) + gb = a.groupby(b) + r = gb.nsmallest(3) + e = Series( + [1, 2, 3, 0, 4, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]), + ) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series( + [0, 1, 1, 0, 1, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]), + ) + tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) + + +@pytest.mark.parametrize( + "data, groups", + [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])], +) +@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES]) +@pytest.mark.parametrize("method", ["nlargest", "nsmallest"]) +def test_nlargest_and_smallest_noop(data, groups, dtype, method): + # GH 15272, GH 16345, GH 29129 + # Test nlargest/smallest when it results in a noop, + # i.e. input is sorted and group size <= n + if dtype is not None: + data = np.array(data, dtype=dtype) + if method == "nlargest": + data = list(reversed(data)) + ser = Series(data, name="a") + result = getattr(ser.groupby(groups), method)(n=2) + expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups + expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/methods/test_nth.py similarity index 100% rename from pandas/tests/groupby/test_nth.py rename to pandas/tests/groupby/methods/test_nth.py diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py similarity index 100% rename from pandas/tests/groupby/test_quantile.py rename to pandas/tests/groupby/methods/test_quantile.py diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/methods/test_rank.py similarity index 100% rename from pandas/tests/groupby/test_rank.py rename to pandas/tests/groupby/methods/test_rank.py diff --git a/pandas/tests/groupby/test_sample.py b/pandas/tests/groupby/methods/test_sample.py similarity index 100% rename from pandas/tests/groupby/test_sample.py rename to pandas/tests/groupby/methods/test_sample.py diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/methods/test_size.py similarity index 100% rename from pandas/tests/groupby/test_size.py rename to pandas/tests/groupby/methods/test_size.py diff --git a/pandas/tests/groupby/test_skew.py b/pandas/tests/groupby/methods/test_skew.py similarity index 100% rename from pandas/tests/groupby/test_skew.py rename to pandas/tests/groupby/methods/test_skew.py diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py similarity index 100% rename from pandas/tests/groupby/test_value_counts.py rename to pandas/tests/groupby/methods/test_value_counts.py diff --git a/pandas/tests/groupby/test_any_all.py b/pandas/tests/groupby/test_any_all.py deleted file mode 100644 index 57a83335be849..0000000000000 --- a/pandas/tests/groupby/test_any_all.py +++ /dev/null @@ -1,188 +0,0 @@ -import builtins - -import numpy as np -import pytest - -import pandas as pd -from pandas import ( - DataFrame, - Index, - Series, - isna, -) -import pandas._testing as tm - - -@pytest.mark.parametrize("agg_func", ["any", "all"]) 
-@pytest.mark.parametrize( - "vals", - [ - ["foo", "bar", "baz"], - ["foo", "", ""], - ["", "", ""], - [1, 2, 3], - [1, 0, 0], - [0, 0, 0], - [1.0, 2.0, 3.0], - [1.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [True, True, True], - [True, False, False], - [False, False, False], - [np.nan, np.nan, np.nan], - ], -) -def test_groupby_bool_aggs(skipna, agg_func, vals): - df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2}) - - # Figure out expectation using Python builtin - exp = getattr(builtins, agg_func)(vals) - - # edge case for missing data with skipna and 'any' - if skipna and all(isna(vals)) and agg_func == "any": - exp = False - - expected = DataFrame( - [exp] * 2, columns=["val"], index=Index(["a", "b"], name="key") - ) - result = getattr(df.groupby("key"), agg_func)(skipna=skipna) - tm.assert_frame_equal(result, expected) - - -def test_any(): - df = DataFrame( - [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], - columns=["A", "B", "C"], - ) - expected = DataFrame( - [[True, True], [False, True]], columns=["B", "C"], index=[1, 3] - ) - expected.index.name = "A" - result = df.groupby("A").any() - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_bool_aggs_dup_column_labels(bool_agg_func): - # GH#21668 - df = DataFrame([[True, True]], columns=["a", "a"]) - grp_by = df.groupby([0]) - result = getattr(grp_by, bool_agg_func)() - - expected = df.set_axis(np.array([0])) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -@pytest.mark.parametrize( - "data", - [ - [False, False, False], - [True, True, True], - [pd.NA, pd.NA, pd.NA], - [False, pd.NA, False], - [True, pd.NA, True], - [True, pd.NA, False], - ], -) -def test_masked_kleene_logic(bool_agg_func, skipna, data): - # GH#37506 - ser = Series(data, dtype="boolean") - - # The result should match aggregating on the whole series. 
Correctness - # there is verified in test_reductions.py::test_any_all_boolean_kleene_logic - expected_data = getattr(ser, bool_agg_func)(skipna=skipna) - expected = Series(expected_data, index=np.array([0]), dtype="boolean") - - result = ser.groupby([0, 0, 0]).agg(bool_agg_func, skipna=skipna) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype1,dtype2,exp_col1,exp_col2", - [ - ( - "float", - "Float64", - np.array([True], dtype=bool), - pd.array([pd.NA], dtype="boolean"), - ), - ( - "Int64", - "float", - pd.array([pd.NA], dtype="boolean"), - np.array([True], dtype=bool), - ), - ( - "Int64", - "Int64", - pd.array([pd.NA], dtype="boolean"), - pd.array([pd.NA], dtype="boolean"), - ), - ( - "Float64", - "boolean", - pd.array([pd.NA], dtype="boolean"), - pd.array([pd.NA], dtype="boolean"), - ), - ], -) -def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2): - # GH#37506 - data = [1.0, np.nan] - df = DataFrame( - {"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)} - ) - result = df.groupby([1, 1]).agg("all", skipna=False) - - expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=np.array([1])) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) -def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series): - # GH#40585 - obj = frame_or_series([pd.NA, 1], dtype=dtype) - expected_res = True - if not skipna and bool_agg_func == "all": - expected_res = pd.NA - expected = frame_or_series([expected_res], index=np.array([1]), dtype="boolean") - - result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna) - tm.assert_equal(result, expected) - - -@pytest.mark.parametrize( - "bool_agg_func,data,expected_res", - [ - ("any", [pd.NA, np.nan], False), - ("any", [pd.NA, 1, np.nan], True), - ("all", [pd.NA, pd.NaT], True), - ("all", [pd.NA, False, pd.NaT], False), - ], -) -def test_object_type_missing_vals(bool_agg_func, data, expected_res, frame_or_series): - # GH#37501 - obj = frame_or_series(data, dtype=object) - result = obj.groupby([1] * len(data)).agg(bool_agg_func) - expected = frame_or_series([expected_res], index=np.array([1]), dtype="bool") - tm.assert_equal(result, expected) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_object_NA_raises_with_skipna_false(bool_agg_func): - # GH#37501 - ser = Series([pd.NA], dtype=object) - with pytest.raises(TypeError, match="boolean value of NA is ambiguous"): - ser.groupby([1]).agg(bool_agg_func, skipna=False) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_empty(frame_or_series, bool_agg_func): - # GH 45231 - kwargs = {"columns": ["a"]} if frame_or_series is DataFrame else {"name": "a"} - obj = frame_or_series(**kwargs, dtype=object) - result = getattr(obj.groupby(obj.index), bool_agg_func)() - expected = frame_or_series(**kwargs, dtype=bool) - tm.assert_equal(result, expected) diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py new file mode 100644 index 0000000000000..eecb82cd5050b --- /dev/null +++ b/pandas/tests/groupby/test_cumulative.py @@ -0,0 +1,291 @@ +import numpy as np +import pytest + +from pandas.errors import UnsupportedFunctionCall +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +@pytest.fixture( + params=[np.int32, np.int64, np.float32, 
np.float64, "Int64", "Float64"], + ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"], +) +def dtypes_for_minmax(request): + """ + Fixture of dtypes with min and max values used for testing + cummin and cummax + """ + dtype = request.param + + np_type = dtype + if dtype == "Int64": + np_type = np.int64 + elif dtype == "Float64": + np_type = np.float64 + + min_val = ( + np.iinfo(np_type).min + if np.dtype(np_type).kind == "i" + else np.finfo(np_type).min + ) + max_val = ( + np.iinfo(np_type).max + if np.dtype(np_type).kind == "i" + else np.finfo(np_type).max + ) + + return (dtype, min_val, max_val) + + +def test_groupby_cumprod(): + # GH 4095 + df = DataFrame({"key": ["b"] * 10, "value": 2}) + + actual = df.groupby("key")["value"].cumprod() + expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) + expected.name = "value" + tm.assert_series_equal(actual, expected) + + df = DataFrame({"key": ["b"] * 100, "value": 2}) + df["value"] = df["value"].astype(float) + actual = df.groupby("key")["value"].cumprod() + expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) + expected.name = "value" + tm.assert_series_equal(actual, expected) + + +def test_groupby_cumprod_overflow(): + # GH#37493 if we overflow we return garbage consistent with numpy + df = DataFrame({"key": ["b"] * 4, "value": 100_000}) + actual = df.groupby("key")["value"].cumprod() + expected = Series( + [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920], + name="value", + ) + tm.assert_series_equal(actual, expected) + + numpy_result = df.groupby("key", group_keys=False)["value"].apply( + lambda x: x.cumprod() + ) + numpy_result.name = "value" + tm.assert_series_equal(actual, numpy_result) + + +def test_groupby_cumprod_nan_influences_other_columns(): + # GH#48064 + df = DataFrame( + { + "a": 1, + "b": [1, np.nan, 2], + "c": [1, 2, 3.0], + } + ) + result = df.groupby("a").cumprod(numeric_only=True, skipna=False) + expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]}) + tm.assert_frame_equal(result, expected) + + +def test_cummin(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + min_val = dtypes_for_minmax[1] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] + + df = base_df.astype(dtype) + + expected = DataFrame({"B": expected_mins}).astype(dtype) + result = df.groupby("A").cummin() + tm.assert_frame_equal(result, expected) + result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test w/ min value for dtype + df.loc[[2, 6], "B"] = min_val + df.loc[[1, 5], "B"] = min_val + 1 + expected.loc[[2, 3, 6, 7], "B"] = min_val + expected.loc[[1, 5], "B"] = min_val + 1 # should not be rounded to min_val + result = df.groupby("A").cummin() + tm.assert_frame_equal(result, expected, check_exact=True) + expected = ( + df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() + ) + tm.assert_frame_equal(result, expected, check_exact=True) + + # Test nan in some values + # Explicit cast to float to avoid implicit cast when setting nan + base_df = base_df.astype({"B": "float"}) + base_df.loc[[0, 2, 4, 6], "B"] = np.nan + expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) + result = base_df.groupby("A").cummin() + tm.assert_frame_equal(result, expected) + expected = ( + base_df.groupby("A", group_keys=False).B.apply(lambda x: 
x.cummin()).to_frame() + ) + tm.assert_frame_equal(result, expected) + + # GH 15561 + df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) + expected = Series(pd.to_datetime("2001"), index=[0], name="b") + + result = df.groupby("a")["b"].cummin() + tm.assert_series_equal(expected, result) + + # GH 15635 + df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]}) + result = df.groupby("a").b.cummin() + expected = Series([1, 2, 1], name="b") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"]) +def test_cummin_max_all_nan_column(method, dtype): + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) + base_df["B"] = base_df["B"].astype(dtype) + grouped = base_df.groupby("A") + + expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype) + result = getattr(grouped, method)() + tm.assert_frame_equal(expected, result) + + result = getattr(grouped["B"], method)().to_frame() + tm.assert_frame_equal(expected, result) + + +def test_cummax(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + max_val = dtypes_for_minmax[2] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] + + df = base_df.astype(dtype) + + expected = DataFrame({"B": expected_maxs}).astype(dtype) + result = df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test w/ max value for dtype + df.loc[[2, 6], "B"] = max_val + expected.loc[[2, 3, 6, 7], "B"] = max_val + result = df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + expected = ( + df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + ) + tm.assert_frame_equal(result, expected) + + # Test nan in some values + # Explicit cast to float to avoid implicit cast when setting nan + base_df = base_df.astype({"B": "float"}) + base_df.loc[[0, 2, 4, 6], "B"] = np.nan + expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) + result = base_df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + expected = ( + base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + ) + tm.assert_frame_equal(result, expected) + + # GH 15561 + df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) + expected = Series(pd.to_datetime("2001"), index=[0], name="b") + + result = df.groupby("a")["b"].cummax() + tm.assert_series_equal(expected, result) + + # GH 15635 + df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]}) + result = df.groupby("a").b.cummax() + expected = Series([2, 1, 2], name="b") + tm.assert_series_equal(result, expected) + + +def test_cummax_i8_at_implementation_bound(): + # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT + # for int64 dtype GH#46382 + ser = Series([pd.NaT._value + n for n in range(5)]) + df = DataFrame({"A": 1, "B": ser, "C": ser.view("M8[ns]")}) + gb = df.groupby("A") + + res = gb.cummax() + exp = df[["B", "C"]] + tm.assert_frame_equal(res, exp) + + +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"]) +@pytest.mark.parametrize( + "groups,expected_data", + [ + ([1, 1, 1], [1, None, None]), + ([1, 2, 3], [1, None, 2]), + ([1, 3, 3], [1, None, None]), + ], +) +def test_cummin_max_skipna(method, dtype, groups, 
expected_data): + # GH-34047 + df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) + orig = df.copy() + gb = df.groupby(groups)["a"] + + result = getattr(gb, method)(skipna=False) + expected = Series(expected_data, dtype=dtype, name="a") + + # check we didn't accidentally alter df + tm.assert_frame_equal(df, orig) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +def test_cummin_max_skipna_multiple_cols(method): + # Ensure missing value in "a" doesn't cause "b" to be nan-filled + df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]}) + gb = df.groupby([1, 1, 1])[["a", "b"]] + + result = getattr(gb, method)(skipna=False) + expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]}) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", ["cumprod", "cumsum"]) +def test_numpy_compat(func): + # see gh-12811 + df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) + g = df.groupby("A") + + msg = "numpy operations are not valid with groupby" + + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(g, func)(1, 2, 3) + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(g, func)(foo=1) + + +@td.skip_if_32bit +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize( + "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)] +) +def test_nullable_int_not_cast_as_float(method, dtype, val): + data = [val, pd.NA] + df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype) + grouped = df.groupby("grp") + + result = grouped.transform(method) + expected = DataFrame({"b": data}, dtype=dtype) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 08372541988d0..4876267c72f12 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1,12 +1,10 @@ import builtins -from io import StringIO import re import numpy as np import pytest from pandas._libs import lib -from pandas.errors import UnsupportedFunctionCall import pandas as pd from pandas import ( @@ -22,37 +20,6 @@ from pandas.util import _test_decorators as td -@pytest.fixture( - params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"], - ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"], -) -def dtypes_for_minmax(request): - """ - Fixture of dtypes with min and max values used for testing - cummin and cummax - """ - dtype = request.param - - np_type = dtype - if dtype == "Int64": - np_type = np.int64 - elif dtype == "Float64": - np_type = np.float64 - - min_val = ( - np.iinfo(np_type).min - if np.dtype(np_type).kind == "i" - else np.finfo(np_type).min - ) - max_val = ( - np.iinfo(np_type).max - if np.dtype(np_type).kind == "i" - else np.finfo(np_type).max - ) - - return (dtype, min_val, max_val) - - def test_intercept_builtin_sum(): s = Series([1.0, 2.0, np.nan, 3.0]) grouped = s.groupby([0, 1, 2, 2]) @@ -372,39 +339,6 @@ def test_cython_api2(): tm.assert_frame_equal(result, expected) -def test_cython_median(): - arr = np.random.default_rng(2).standard_normal(1000) - arr[::2] = np.nan - df = DataFrame(arr) - - labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) - labels[::17] = np.nan - - result = df.groupby(labels).median() - msg = "using DataFrameGroupBy.median" - with tm.assert_produces_warning(FutureWarning, match=msg): - exp = df.groupby(labels).agg(np.nanmedian) - 
tm.assert_frame_equal(result, exp) - - df = DataFrame(np.random.default_rng(2).standard_normal((1000, 5))) - msg = "using DataFrameGroupBy.median" - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = df.groupby(labels).agg(np.median) - xp = df.groupby(labels).median() - tm.assert_frame_equal(rs, xp) - - -def test_median_empty_bins(observed): - df = DataFrame(np.random.default_rng(2).integers(0, 44, 500)) - - grps = range(0, 55, 5) - bins = pd.cut(df[0], grps) - - result = df.groupby(bins, observed=observed).median() - expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"] ) @@ -478,105 +412,6 @@ def test_groupby_non_arithmetic_agg_int_like_precision(i): assert res.iloc[0].b == data["expected"] -@pytest.mark.parametrize( - "func, values", - [ - ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}), - ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}), - ], -) -@pytest.mark.parametrize("numeric_only", [True, False]) -def test_idxmin_idxmax_returns_int_types(func, values, numeric_only): - # GH 25444 - df = DataFrame( - { - "name": ["A", "A", "B", "B"], - "c_int": [1, 2, 3, 4], - "c_float": [4.02, 3.03, 2.04, 1.05], - "c_date": ["2019", "2018", "2016", "2017"], - } - ) - df["c_date"] = pd.to_datetime(df["c_date"]) - df["c_date_tz"] = df["c_date"].dt.tz_localize("US/Pacific") - df["c_timedelta"] = df["c_date"] - df["c_date"].iloc[0] - df["c_period"] = df["c_date"].dt.to_period("W") - df["c_Integer"] = df["c_int"].astype("Int64") - df["c_Floating"] = df["c_float"].astype("Float64") - - result = getattr(df.groupby("name"), func)(numeric_only=numeric_only) - - expected = DataFrame(values, index=Index(["A", "B"], name="name")) - if numeric_only: - expected = expected.drop(columns=["c_date"]) - else: - expected["c_date_tz"] = expected["c_date"] - expected["c_timedelta"] = expected["c_date"] - expected["c_period"] = expected["c_date"] - expected["c_Integer"] = expected["c_int"] - expected["c_Floating"] = expected["c_float"] - - tm.assert_frame_equal(result, expected) - - -def test_idxmin_idxmax_axis1(): - df = DataFrame( - np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] - ) - df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] - - gb = df.groupby("A") - - warn_msg = "DataFrameGroupBy.idxmax with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - res = gb.idxmax(axis=1) - - alt = df.iloc[:, 1:].idxmax(axis=1) - indexer = res.index.get_level_values(1) - - tm.assert_series_equal(alt[indexer], res.droplevel("A")) - - df["E"] = date_range("2016-01-01", periods=10) - gb2 = df.groupby("A") - - msg = "'>' not supported between instances of 'Timestamp' and 'float'" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - gb2.idxmax(axis=1) - - -@pytest.mark.parametrize( - "func, values, expected_values, warn", - [ - ("idxmin", [0, 1, 2], [0, 2], None), - ("idxmax", [0, 1, 2], [1, 2], None), - ("idxmin", [0, np.nan, 2], [np.nan, 2], FutureWarning), - ("idxmax", [0, np.nan, 2], [np.nan, 2], FutureWarning), - ("idxmin", [1, 0, np.nan], [1, np.nan], FutureWarning), - ("idxmax", [1, 0, np.nan], [0, np.nan], FutureWarning), - ], -) -@pytest.mark.parametrize("test_series", [True, False]) -def test_idxmin_idxmax_skipna_false(func, values, expected_values, warn, test_series): - # GH#54234 - df = 
DataFrame( - { - "a": [1, 1, 2], - "b": values, - } - ) - gb = df.groupby("a") - index = Index([1, 2], name="a") - expected = DataFrame({"b": expected_values}, index=index) - if test_series: - gb = gb["b"] - expected = expected["b"] - klass = "Series" if test_series else "DataFrame" - msg = f"The behavior of {klass}GroupBy.{func} with all-NA values" - with tm.assert_produces_warning(warn, match=msg): - result = getattr(gb, func)(skipna=False) - tm.assert_equal(result, expected) - - @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_axis1_numeric_only(request, groupby_func, numeric_only): if groupby_func in ("idxmax", "idxmin"): @@ -658,54 +493,6 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only): tm.assert_equal(result, expected) -def test_groupby_cumprod(): - # GH 4095 - df = DataFrame({"key": ["b"] * 10, "value": 2}) - - actual = df.groupby("key")["value"].cumprod() - expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) - expected.name = "value" - tm.assert_series_equal(actual, expected) - - df = DataFrame({"key": ["b"] * 100, "value": 2}) - df["value"] = df["value"].astype(float) - actual = df.groupby("key")["value"].cumprod() - expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) - expected.name = "value" - tm.assert_series_equal(actual, expected) - - -def test_groupby_cumprod_overflow(): - # GH#37493 if we overflow we return garbage consistent with numpy - df = DataFrame({"key": ["b"] * 4, "value": 100_000}) - actual = df.groupby("key")["value"].cumprod() - expected = Series( - [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920], - name="value", - ) - tm.assert_series_equal(actual, expected) - - numpy_result = df.groupby("key", group_keys=False)["value"].apply( - lambda x: x.cumprod() - ) - numpy_result.name = "value" - tm.assert_series_equal(actual, numpy_result) - - -def test_groupby_cumprod_nan_influences_other_columns(): - # GH#48064 - df = DataFrame( - { - "a": 1, - "b": [1, np.nan, 2], - "c": [1, 2, 3.0], - } - ) - result = df.groupby("a").cumprod(numeric_only=True, skipna=False) - expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]}) - tm.assert_frame_equal(result, expected) - - def scipy_sem(*args, **kwargs): from scipy.stats import sem @@ -741,627 +528,12 @@ def test_ops_general(op, targop): tm.assert_frame_equal(result, expected) -def test_max_nan_bug(): - raw = """,Date,app,File --04-23,2013-04-23 00:00:00,,log080001.log --05-06,2013-05-06 00:00:00,,log.log --05-07,2013-05-07 00:00:00,OE,xlsx""" - - with tm.assert_produces_warning(UserWarning, match="Could not infer format"): - df = pd.read_csv(StringIO(raw), parse_dates=[0]) - gb = df.groupby("Date") - r = gb[["File"]].max() - e = gb["File"].max().to_frame() - tm.assert_frame_equal(r, e) - assert not r["File"].isna().any() - - -def test_nlargest(): - a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list("a" * 5 + "b" * 5)) - gb = a.groupby(b) - r = gb.nlargest(3) - e = Series( - [7, 5, 3, 10, 9, 6], - index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]), - ) - tm.assert_series_equal(r, e) - - a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) - gb = a.groupby(b) - e = Series( - [3, 2, 1, 3, 3, 2], - index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]), - ) - tm.assert_series_equal(gb.nlargest(3, keep="last"), e) - - -def test_nlargest_mi_grouper(): - # see gh-21411 - npr = np.random.default_rng(2) - - dts = date_range("20180101", periods=10) - iterables = [dts, 
["one", "two"]] - - idx = MultiIndex.from_product(iterables, names=["first", "second"]) - s = Series(npr.standard_normal(20), index=idx) - - result = s.groupby("first").nlargest(1) - - exp_idx = MultiIndex.from_tuples( - [ - (dts[0], dts[0], "one"), - (dts[1], dts[1], "one"), - (dts[2], dts[2], "one"), - (dts[3], dts[3], "two"), - (dts[4], dts[4], "one"), - (dts[5], dts[5], "one"), - (dts[6], dts[6], "one"), - (dts[7], dts[7], "one"), - (dts[8], dts[8], "one"), - (dts[9], dts[9], "one"), - ], - names=["first", "first", "second"], - ) - - exp_values = [ - 0.18905338179353307, - -0.41306354339189344, - 1.799707382720902, - 0.7738065867276614, - 0.28121066979764925, - 0.9775674511260357, - -0.3288239040579627, - 0.45495807124085547, - 0.5452887139646817, - 0.12682784711186987, - ] - - expected = Series(exp_values, index=exp_idx) - tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3) - - -def test_nsmallest(): - a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list("a" * 5 + "b" * 5)) - gb = a.groupby(b) - r = gb.nsmallest(3) - e = Series( - [1, 2, 3, 0, 4, 6], - index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]), - ) - tm.assert_series_equal(r, e) - - a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) - gb = a.groupby(b) - e = Series( - [0, 1, 1, 0, 1, 2], - index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]), - ) - tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) - - -@pytest.mark.parametrize( - "data, groups", - [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])], -) -@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES]) -@pytest.mark.parametrize("method", ["nlargest", "nsmallest"]) -def test_nlargest_and_smallest_noop(data, groups, dtype, method): - # GH 15272, GH 16345, GH 29129 - # Test nlargest/smallest when it results in a noop, - # i.e. 
input is sorted and group size <= n - if dtype is not None: - data = np.array(data, dtype=dtype) - if method == "nlargest": - data = list(reversed(data)) - ser = Series(data, name="a") - result = getattr(ser.groupby(groups), method)(n=2) - expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups - expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("func", ["cumprod", "cumsum"]) -def test_numpy_compat(func): - # see gh-12811 - df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) - g = df.groupby("A") - - msg = "numpy operations are not valid with groupby" - - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(g, func)(1, 2, 3) - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(g, func)(foo=1) - - -def test_cummin(dtypes_for_minmax): - dtype = dtypes_for_minmax[0] - min_val = dtypes_for_minmax[1] - - # GH 15048 - base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) - expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] - - df = base_df.astype(dtype) - - expected = DataFrame({"B": expected_mins}).astype(dtype) - result = df.groupby("A").cummin() - tm.assert_frame_equal(result, expected) - result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() - tm.assert_frame_equal(result, expected) - - # Test w/ min value for dtype - df.loc[[2, 6], "B"] = min_val - df.loc[[1, 5], "B"] = min_val + 1 - expected.loc[[2, 3, 6, 7], "B"] = min_val - expected.loc[[1, 5], "B"] = min_val + 1 # should not be rounded to min_val - result = df.groupby("A").cummin() - tm.assert_frame_equal(result, expected, check_exact=True) - expected = ( - df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() - ) - tm.assert_frame_equal(result, expected, check_exact=True) - - # Test nan in some values - # Explicit cast to float to avoid implicit cast when setting nan - base_df = base_df.astype({"B": "float"}) - base_df.loc[[0, 2, 4, 6], "B"] = np.nan - expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) - result = base_df.groupby("A").cummin() - tm.assert_frame_equal(result, expected) - expected = ( - base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() - ) - tm.assert_frame_equal(result, expected) - - # GH 15561 - df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) - expected = Series(pd.to_datetime("2001"), index=[0], name="b") - - result = df.groupby("a")["b"].cummin() - tm.assert_series_equal(expected, result) - - # GH 15635 - df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]}) - result = df.groupby("a").b.cummin() - expected = Series([1, 2, 1], name="b") - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"]) -def test_cummin_max_all_nan_column(method, dtype): - base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) - base_df["B"] = base_df["B"].astype(dtype) - grouped = base_df.groupby("A") - - expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype) - result = getattr(grouped, method)() - tm.assert_frame_equal(expected, result) - - result = getattr(grouped["B"], method)().to_frame() - tm.assert_frame_equal(expected, result) - - -def test_cummax(dtypes_for_minmax): - dtype = dtypes_for_minmax[0] - max_val = dtypes_for_minmax[2] - - # GH 15048 - base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 
2, 2, 3, 2, 1]}) - expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] - - df = base_df.astype(dtype) - - expected = DataFrame({"B": expected_maxs}).astype(dtype) - result = df.groupby("A").cummax() - tm.assert_frame_equal(result, expected) - result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() - tm.assert_frame_equal(result, expected) - - # Test w/ max value for dtype - df.loc[[2, 6], "B"] = max_val - expected.loc[[2, 3, 6, 7], "B"] = max_val - result = df.groupby("A").cummax() - tm.assert_frame_equal(result, expected) - expected = ( - df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() - ) - tm.assert_frame_equal(result, expected) - - # Test nan in some values - # Explicit cast to float to avoid implicit cast when setting nan - base_df = base_df.astype({"B": "float"}) - base_df.loc[[0, 2, 4, 6], "B"] = np.nan - expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) - result = base_df.groupby("A").cummax() - tm.assert_frame_equal(result, expected) - expected = ( - base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() - ) - tm.assert_frame_equal(result, expected) - - # GH 15561 - df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) - expected = Series(pd.to_datetime("2001"), index=[0], name="b") - - result = df.groupby("a")["b"].cummax() - tm.assert_series_equal(expected, result) - - # GH 15635 - df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]}) - result = df.groupby("a").b.cummax() - expected = Series([2, 1, 2], name="b") - tm.assert_series_equal(result, expected) - - -def test_cummax_i8_at_implementation_bound(): - # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT - # for int64 dtype GH#46382 - ser = Series([pd.NaT._value + n for n in range(5)]) - df = DataFrame({"A": 1, "B": ser, "C": ser.view("M8[ns]")}) - gb = df.groupby("A") - - res = gb.cummax() - exp = df[["B", "C"]] - tm.assert_frame_equal(res, exp) - - -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"]) -@pytest.mark.parametrize( - "groups,expected_data", - [ - ([1, 1, 1], [1, None, None]), - ([1, 2, 3], [1, None, 2]), - ([1, 3, 3], [1, None, None]), - ], -) -def test_cummin_max_skipna(method, dtype, groups, expected_data): - # GH-34047 - df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) - orig = df.copy() - gb = df.groupby(groups)["a"] - - result = getattr(gb, method)(skipna=False) - expected = Series(expected_data, dtype=dtype, name="a") - - # check we didn't accidentally alter df - tm.assert_frame_equal(df, orig) - - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -def test_cummin_max_skipna_multiple_cols(method): - # Ensure missing value in "a" doesn't cause "b" to be nan-filled - df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]}) - gb = df.groupby([1, 1, 1])[["a", "b"]] - - result = getattr(gb, method)(skipna=False) - expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]}) - - tm.assert_frame_equal(result, expected) - - -@td.skip_if_32bit -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -@pytest.mark.parametrize( - "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)] -) -def test_nullable_int_not_cast_as_float(method, dtype, val): - data = [val, pd.NA] - df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype) - grouped = df.groupby("grp") - - result = grouped.transform(method) - expected = DataFrame({"b": data}, 
dtype=dtype)
-
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize(
-    "in_vals, out_vals",
-    [
-        # Basics: strictly increasing (T), strictly decreasing (F),
-        # abs val increasing (F), non-strictly increasing (T)
-        ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
-        # Test with inf vals
-        (
-            [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
-            [True, False, True, False],
-        ),
-        # Test with nan vals; should always be False
-        (
-            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
-            [False, False, False, False],
-        ),
-    ],
-)
-def test_is_monotonic_increasing(in_vals, out_vals):
-    # GH 17015
-    source_dict = {
-        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
-        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
-        "C": in_vals,
-    }
-    df = DataFrame(source_dict)
-    result = df.groupby("B").C.is_monotonic_increasing
-    index = Index(list("abcd"), name="B")
-    expected = Series(index=index, data=out_vals, name="C")
-    tm.assert_series_equal(result, expected)
-
-    # Also check result equal to manually taking x.is_monotonic_increasing.
-    expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
-    tm.assert_series_equal(result, expected)
-
-
-@pytest.mark.parametrize(
-    "in_vals, out_vals",
-    [
-        # Basics: strictly decreasing (T), strictly increasing (F),
-        # abs val decreasing (F), non-strictly increasing (T)
-        ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
-        # Test with inf vals
-        (
-            [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
-            [True, True, False, True],
-        ),
-        # Test with nan vals; should always be False
-        (
-            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
-            [False, False, False, False],
-        ),
-    ],
-)
-def test_is_monotonic_decreasing(in_vals, out_vals):
-    # GH 17015
-    source_dict = {
-        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
-        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
-        "C": in_vals,
-    }
-
-    df = DataFrame(source_dict)
-    result = df.groupby("B").C.is_monotonic_decreasing
-    index = Index(list("abcd"), name="B")
-    expected = Series(index=index, data=out_vals, name="C")
-    tm.assert_series_equal(result, expected)
-
-
-# describe
-# --------------------------------
-
-
-def test_apply_describe_bug(mframe):
-    grouped = mframe.groupby(level="first")
-    grouped.describe()  # it works!
- - -def test_series_describe_multikey(): - ts = tm.makeTimeSeries() - grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False) - tm.assert_series_equal(result["std"], grouped.std(), check_names=False) - tm.assert_series_equal(result["min"], grouped.min(), check_names=False) - - -def test_series_describe_single(): - ts = tm.makeTimeSeries() - grouped = ts.groupby(lambda x: x.month) - result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe().stack(future_stack=True) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]]) -def test_series_describe_as_index(as_index, keys): - # GH#49256 - df = DataFrame( - { - "key1": ["one", "two", "two", "three", "two"], - "key2": ["one", "two", "two", "three", "two"], - "foo2": [1, 2, 4, 4, 6], - } - ) - gb = df.groupby(keys, as_index=as_index)["foo2"] - result = gb.describe() - expected = DataFrame( - { - "key1": ["one", "three", "two"], - "count": [1.0, 1.0, 3.0], - "mean": [1.0, 4.0, 4.0], - "std": [np.nan, np.nan, 2.0], - "min": [1.0, 4.0, 2.0], - "25%": [1.0, 4.0, 3.0], - "50%": [1.0, 4.0, 4.0], - "75%": [1.0, 4.0, 5.0], - "max": [1.0, 4.0, 6.0], - } - ) - if len(keys) == 2: - expected.insert(1, "key2", expected["key1"]) - if as_index: - expected = expected.set_index(keys) - tm.assert_frame_equal(result, expected) - - def test_series_index_name(df): grouped = df.loc[:, ["C"]].groupby(df["A"]) result = grouped.agg(lambda x: x.mean()) assert result.index.name == "A" -def test_frame_describe_multikey(tsframe): - grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - desc_groups = [] - for col in tsframe: - group = grouped[col].describe() - # GH 17464 - Remove duplicate MultiIndex levels - group_col = MultiIndex( - levels=[[col], group.columns], - codes=[[0] * len(group.columns), range(len(group.columns))], - ) - group = DataFrame(group.values, columns=group_col, index=group.index) - desc_groups.append(group) - expected = pd.concat(desc_groups, axis=1) - tm.assert_frame_equal(result, expected) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) - result = groupedT.describe() - expected = tsframe.describe().T - # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/ - expected.index = MultiIndex( - levels=[[0, 1], expected.index], - codes=[[0, 0, 1, 1], range(len(expected.index))], - ) - tm.assert_frame_equal(result, expected) - - -def test_frame_describe_tupleindex(): - # GH 14848 - regression from 0.19.0 to 0.19.1 - df1 = DataFrame( - { - "x": [1, 2, 3, 4, 5] * 3, - "y": [10, 20, 30, 40, 50] * 3, - "z": [100, 200, 300, 400, 500] * 3, - } - ) - df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 - df2 = df1.rename(columns={"k": "key"}) - msg = "Names should be list-like for a MultiIndex" - with pytest.raises(ValueError, match=msg): - df1.groupby("k").describe() - with pytest.raises(ValueError, match=msg): - df2.groupby("key").describe() - - -def test_frame_describe_unstacked_format(): - # GH 4792 - prices = { - Timestamp("2011-01-06 10:59:05", tz=None): 24990, - Timestamp("2011-01-06 12:43:33", tz=None): 25499, - Timestamp("2011-01-06 12:54:09", tz=None): 25499, - } - volumes = { - Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, - Timestamp("2011-01-06 12:43:33", tz=None): 
5000000000, - Timestamp("2011-01-06 12:54:09", tz=None): 100000000, - } - df = DataFrame({"PRICE": prices, "VOLUME": volumes}) - result = df.groupby("PRICE").VOLUME.describe() - data = [ - df[df.PRICE == 24990].VOLUME.describe().values.tolist(), - df[df.PRICE == 25499].VOLUME.describe().values.tolist(), - ] - expected = DataFrame( - data, - index=Index([24990, 25499], name="PRICE"), - columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.filterwarnings( - "ignore:" - "indexing past lexsort depth may impact performance:" - "pandas.errors.PerformanceWarning" -) -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) -def test_describe_with_duplicate_output_column_names(as_index, keys): - # GH 35314 - df = DataFrame( - { - "a1": [99, 99, 99, 88, 88, 88], - "a2": [99, 99, 99, 88, 88, 88], - "b": [1, 2, 3, 4, 5, 6], - "c": [10, 20, 30, 40, 50, 60], - }, - columns=["a1", "a2", "b", "b"], - copy=False, - ) - if keys == ["a1"]: - df = df.drop(columns="a2") - - expected = ( - DataFrame.from_records( - [ - ("b", "count", 3.0, 3.0), - ("b", "mean", 5.0, 2.0), - ("b", "std", 1.0, 1.0), - ("b", "min", 4.0, 1.0), - ("b", "25%", 4.5, 1.5), - ("b", "50%", 5.0, 2.0), - ("b", "75%", 5.5, 2.5), - ("b", "max", 6.0, 3.0), - ("b", "count", 3.0, 3.0), - ("b", "mean", 5.0, 2.0), - ("b", "std", 1.0, 1.0), - ("b", "min", 4.0, 1.0), - ("b", "25%", 4.5, 1.5), - ("b", "50%", 5.0, 2.0), - ("b", "75%", 5.5, 2.5), - ("b", "max", 6.0, 3.0), - ], - ) - .set_index([0, 1]) - .T - ) - expected.columns.names = [None, None] - if len(keys) == 2: - expected.index = MultiIndex( - levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"] - ) - else: - expected.index = Index([88, 99], name="a1") - - if not as_index: - expected = expected.reset_index() - - result = df.groupby(keys, as_index=as_index).describe() - - tm.assert_frame_equal(result, expected) - - -def test_describe_duplicate_columns(): - # GH#50806 - df = DataFrame([[0, 1, 2, 3]]) - df.columns = [0, 1, 2, 0] - gb = df.groupby(df[1]) - result = gb.describe(percentiles=[]) - - columns = ["count", "mean", "std", "min", "50%", "max"] - frames = [ - DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) - for val in (0.0, 2.0, 3.0) - ] - expected = pd.concat(frames, axis=1) - expected.columns = MultiIndex( - levels=[[0, 2], columns], - codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], - ) - expected.index.names = [1] - tm.assert_frame_equal(result, expected) - - -def test_groupby_mean_no_overflow(): - # Regression test for (#22487) - df = DataFrame( - { - "user": ["A", "A", "A", "A", "A"], - "connections": [4970, 4749, 4719, 4704, 18446744073699999744], - } - ) - assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 - - @pytest.mark.parametrize( "values", [ @@ -1393,78 +565,6 @@ def test_apply_to_nullable_integer_returns_float(values, function): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("min_count", [0, 10]) -def test_groupby_sum_mincount_boolean(min_count): - b = True - a = False - na = np.nan - dfg = pd.array([b, b, na, na, a, a, b], dtype="boolean") - - df = DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": dfg}) - result = df.groupby("A").sum(min_count=min_count) - if min_count == 0: - expected = DataFrame( - {"B": pd.array([3, 0, 0], dtype="Int64")}, - index=Index([1, 2, 3], name="A"), - ) - tm.assert_frame_equal(result, expected) - else: - expected = 
DataFrame( - {"B": pd.array([pd.NA] * 3, dtype="Int64")}, - index=Index([1, 2, 3], name="A"), - ) - tm.assert_frame_equal(result, expected) - - -def test_groupby_sum_below_mincount_nullable_integer(): - # https://github.com/pandas-dev/pandas/issues/32861 - df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") - grouped = df.groupby("a") - idx = Index([0, 1, 2], name="a", dtype="Int64") - - result = grouped["b"].sum(min_count=2) - expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") - tm.assert_series_equal(result, expected) - - result = grouped.sum(min_count=2) - expected = DataFrame({"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx) - tm.assert_frame_equal(result, expected) - - -def test_mean_on_timedelta(): - # GH 17382 - df = DataFrame({"time": pd.to_timedelta(range(10)), "cat": ["A", "B"] * 5}) - result = df.groupby("cat")["time"].mean() - expected = Series( - pd.to_timedelta([4, 5]), name="time", index=Index(["A", "B"], name="cat") - ) - tm.assert_series_equal(result, expected) - - -def test_groupby_sum_timedelta_with_nat(): - # GH#42659 - df = DataFrame( - { - "a": [1, 1, 2, 2], - "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT], - } - ) - td3 = pd.Timedelta(days=3) - - gb = df.groupby("a") - - res = gb.sum() - expected = DataFrame({"b": [td3, td3]}, index=Index([1, 2], name="a")) - tm.assert_frame_equal(res, expected) - - res = gb["b"].sum() - tm.assert_series_equal(res, expected["b"]) - - res = gb["b"].sum(min_count=2) - expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index) - tm.assert_series_equal(res, expected) - - @pytest.mark.parametrize( "kernel, has_arg", [ @@ -1706,22 +806,6 @@ def test_groupby_empty_dataset(dtype, kwargs): tm.assert_frame_equal(result, expected) -def test_corrwith_with_1_axis(): - # GH 47723 - df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]}) - gb = df.groupby("a") - - msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.corrwith(df, axis=1) - index = Index( - data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)], - name=("a", None), - ) - expected = Series([np.nan] * 6, index=index) - tm.assert_series_equal(result, expected) - - def test_multiindex_group_all_columns_when_empty(groupby_func): # GH 32464 df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"]) diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py deleted file mode 100644 index 30c7e1df1e691..0000000000000 --- a/pandas/tests/groupby/test_min_max.py +++ /dev/null @@ -1,272 +0,0 @@ -import numpy as np -import pytest - -from pandas._libs.tslibs import iNaT - -import pandas as pd -from pandas import ( - DataFrame, - Index, - Series, -) -import pandas._testing as tm - - -def test_max_min_non_numeric(): - # #2700 - aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]}) - - result = aa.groupby("nn").max() - assert "ss" in result - - result = aa.groupby("nn").max(numeric_only=False) - assert "ss" in result - - result = aa.groupby("nn").min() - assert "ss" in result - - result = aa.groupby("nn").min(numeric_only=False) - assert "ss" in result - - -def test_max_min_object_multiple_columns(using_array_manager): - # GH#41111 case where the aggregation is valid for some columns but not - # others; we split object blocks column-wise, consistent with - # DataFrame._reduce - - df = DataFrame( - { - "A": [1, 1, 2, 2, 3], - "B": [1, "foo", 2, "bar", 
False],
-            "C": ["a", "b", "c", "d", "e"],
-        }
-    )
-    df._consolidate_inplace()  # should already be consolidated, but double-check
-    if not using_array_manager:
-        assert len(df._mgr.blocks) == 2
-
-    gb = df.groupby("A")
-
-    result = gb[["C"]].max()
-    # "max" is valid for column "C" but not for "B"
-    ei = Index([1, 2, 3], name="A")
-    expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
-    tm.assert_frame_equal(result, expected)
-
-    result = gb[["C"]].min()
-    # "min" is valid for column "C" but not for "B"
-    ei = Index([1, 2, 3], name="A")
-    expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)
-    tm.assert_frame_equal(result, expected)
-
-
-def test_min_date_with_nans():
-    # GH26321
-    dates = pd.to_datetime(
-        Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d"
-    ).dt.date
-    df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates})
-
-    result = df.groupby("b", as_index=False)["c"].min()["c"]
-    expected = pd.to_datetime(
-        Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d"
-    ).dt.date
-    tm.assert_series_equal(result, expected)
-
-    result = df.groupby("b")["c"].min()
-    expected.index.name = "b"
-    tm.assert_series_equal(result, expected)
-
-
-def test_max_inat():
-    # GH#40767 don't interpret iNaT as NaN
-    ser = Series([1, iNaT])
-    key = np.array([1, 1], dtype=np.int64)
-    gb = ser.groupby(key)
-
-    result = gb.max(min_count=2)
-    expected = Series({1: 1}, dtype=np.int64)
-    tm.assert_series_equal(result, expected, check_exact=True)
-
-    result = gb.min(min_count=2)
-    expected = Series({1: iNaT}, dtype=np.int64)
-    tm.assert_series_equal(result, expected, check_exact=True)
-
-    # not enough entries -> gets masked to NaN
-    result = gb.min(min_count=3)
-    expected = Series({1: np.nan})
-    tm.assert_series_equal(result, expected, check_exact=True)
-
-
-def test_max_inat_not_all_na():
-    # GH#40767 don't interpret iNaT as NaN
-
-    # make sure we don't round iNaT+1 to iNaT
-    ser = Series([1, iNaT, 2, iNaT + 1])
-    gb = ser.groupby([1, 2, 3, 3])
-    result = gb.min(min_count=2)
-
-    # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e.
is lossy - expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1}) - expected.index = expected.index.astype(int) - tm.assert_series_equal(result, expected, check_exact=True) - - -@pytest.mark.parametrize("func", ["min", "max"]) -def test_groupby_aggregate_period_column(func): - # GH 31471 - groups = [1, 2] - periods = pd.period_range("2020", periods=2, freq="Y") - df = DataFrame({"a": groups, "b": periods}) - - result = getattr(df.groupby("a")["b"], func)() - idx = Index([1, 2], name="a") - expected = Series(periods, index=idx, name="b") - - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("func", ["min", "max"]) -def test_groupby_aggregate_period_frame(func): - # GH 31471 - groups = [1, 2] - periods = pd.period_range("2020", periods=2, freq="Y") - df = DataFrame({"a": groups, "b": periods}) - - result = getattr(df.groupby("a"), func)() - idx = Index([1, 2], name="a") - expected = DataFrame({"b": periods}, index=idx) - - tm.assert_frame_equal(result, expected) - - -def test_aggregate_numeric_object_dtype(): - # https://github.com/pandas-dev/pandas/issues/39329 - # simplified case: multiple object columns where one is all-NaN - # -> gets split as the all-NaN is inferred as float - df = DataFrame( - {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4}, - ).astype(object) - result = df.groupby("key").min() - expected = ( - DataFrame( - {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}, - ) - .set_index("key") - .astype(object) - ) - tm.assert_frame_equal(result, expected) - - # same but with numbers - df = DataFrame( - {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)}, - ).astype(object) - result = df.groupby("key").min() - expected = ( - DataFrame({"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}) - .set_index("key") - .astype(object) - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("func", ["min", "max"]) -def test_aggregate_categorical_lost_index(func: str): - # GH: 28641 groupby drops index, when grouping over categorical column with min/max - ds = Series(["b"], dtype="category").cat.as_ordered() - df = DataFrame({"A": [1997], "B": ds}) - result = df.groupby("A").agg({"B": func}) - expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A")) - - # ordered categorical dtype should be preserved - expected["B"] = expected["B"].astype(ds.dtype) - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"]) -def test_groupby_min_max_nullable(dtype): - if dtype == "Int64": - # GH#41743 avoid precision loss - ts = 1618556707013635762 - elif dtype == "boolean": - ts = 0 - else: - ts = 4.0 - - df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]}) - df["ts"] = df["ts"].astype(dtype) - - gb = df.groupby("id") - - result = gb.min() - expected = df.iloc[:1].set_index("id") - tm.assert_frame_equal(result, expected) - - res_max = gb.max() - expected_max = df.iloc[1:].set_index("id") - tm.assert_frame_equal(res_max, expected_max) - - result2 = gb.min(min_count=3) - expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype) - tm.assert_frame_equal(result2, expected2) - - res_max2 = gb.max(min_count=3) - tm.assert_frame_equal(res_max2, expected2) - - # Case with NA values - df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]}) - df2["ts"] = df2["ts"].astype(dtype) - gb2 = df2.groupby("id") - - result3 = gb2.min() - tm.assert_frame_equal(result3, expected) - - res_max3 = gb2.max() - 
tm.assert_frame_equal(res_max3, expected_max) - - result4 = gb2.min(min_count=100) - tm.assert_frame_equal(result4, expected2) - - res_max4 = gb2.max(min_count=100) - tm.assert_frame_equal(res_max4, expected2) - - -def test_min_max_nullable_uint64_empty_group(): - # don't raise NotImplementedError from libgroupby - cat = pd.Categorical([0] * 10, categories=[0, 1]) - df = DataFrame({"A": cat, "B": pd.array(np.arange(10, dtype=np.uint64))}) - gb = df.groupby("A", observed=False) - - res = gb.min() - - idx = pd.CategoricalIndex([0, 1], dtype=cat.dtype, name="A") - expected = DataFrame({"B": pd.array([0, pd.NA], dtype="UInt64")}, index=idx) - tm.assert_frame_equal(res, expected) - - res = gb.max() - expected.iloc[0, 0] = 9 - tm.assert_frame_equal(res, expected) - - -@pytest.mark.parametrize("func", ["first", "last", "min", "max"]) -def test_groupby_min_max_categorical(func): - # GH: 52151 - df = DataFrame( - { - "col1": pd.Categorical(["A"], categories=list("AB"), ordered=True), - "col2": pd.Categorical([1], categories=[1, 2], ordered=True), - "value": 0.1, - } - ) - result = getattr(df.groupby("col1", observed=False), func)() - - idx = pd.CategoricalIndex(data=["A", "B"], name="col1", ordered=True) - expected = DataFrame( - { - "col2": pd.Categorical([1, None], categories=[1, 2], ordered=True), - "value": [0.1, None], - }, - index=idx, - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_nunique.py b/pandas/tests/groupby/test_nunique.py deleted file mode 100644 index 9c9e32d9ce226..0000000000000 --- a/pandas/tests/groupby/test_nunique.py +++ /dev/null @@ -1,190 +0,0 @@ -import datetime as dt -from string import ascii_lowercase - -import numpy as np -import pytest - -import pandas as pd -from pandas import ( - DataFrame, - MultiIndex, - NaT, - Series, - Timestamp, - date_range, -) -import pandas._testing as tm - - -@pytest.mark.slow -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("dropna", [False, True]) -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize("with_nan", [True, False]) -@pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]]) -def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys): - n = 100 - m = 10 - days = date_range("2015-08-23", periods=10) - df = DataFrame( - { - "jim": np.random.default_rng(2).choice(list(ascii_lowercase), n), - "joe": np.random.default_rng(2).choice(days, n), - "julie": np.random.default_rng(2).integers(0, m, n), - } - ) - if with_nan: - df = df.astype({"julie": float}) # Explicit cast to avoid implicit cast below - df.loc[1::17, "jim"] = None - df.loc[3::37, "joe"] = None - df.loc[7::19, "julie"] = None - df.loc[8::19, "julie"] = None - df.loc[9::19, "julie"] = None - original_df = df.copy() - gr = df.groupby(keys, as_index=as_index, sort=sort) - left = gr["julie"].nunique(dropna=dropna) - - gr = df.groupby(keys, as_index=as_index, sort=sort) - right = gr["julie"].apply(Series.nunique, dropna=dropna) - if not as_index: - right = right.reset_index(drop=True) - - if as_index: - tm.assert_series_equal(left, right, check_names=False) - else: - tm.assert_frame_equal(left, right, check_names=False) - tm.assert_frame_equal(df, original_df) - - -def test_nunique(): - df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")}) - - expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]}) - result = df.groupby("A", as_index=False).nunique() - tm.assert_frame_equal(result, expected) - - # as_index - expected.index = list("abc") 
- expected.index.name = "A" - expected = expected.drop(columns="A") - result = df.groupby("A").nunique() - tm.assert_frame_equal(result, expected) - - # with na - result = df.replace({"x": None}).groupby("A").nunique(dropna=False) - tm.assert_frame_equal(result, expected) - - # dropna - expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc")) - expected.index.name = "A" - result = df.replace({"x": None}).groupby("A").nunique() - tm.assert_frame_equal(result, expected) - - -def test_nunique_with_object(): - # GH 11077 - data = DataFrame( - [ - [100, 1, "Alice"], - [200, 2, "Bob"], - [300, 3, "Charlie"], - [-400, 4, "Dan"], - [500, 5, "Edith"], - ], - columns=["amount", "id", "name"], - ) - - result = data.groupby(["id", "amount"])["name"].nunique() - index = MultiIndex.from_arrays([data.id, data.amount]) - expected = Series([1] * 5, name="name", index=index) - tm.assert_series_equal(result, expected) - - -def test_nunique_with_empty_series(): - # GH 12553 - data = Series(name="name", dtype=object) - result = data.groupby(level=0).nunique() - expected = Series(name="name", dtype="int64") - tm.assert_series_equal(result, expected) - - -def test_nunique_with_timegrouper(): - # GH 13453 - test = DataFrame( - { - "time": [ - Timestamp("2016-06-28 09:35:35"), - Timestamp("2016-06-28 16:09:30"), - Timestamp("2016-06-28 16:46:28"), - ], - "data": ["1", "2", "3"], - } - ).set_index("time") - result = test.groupby(pd.Grouper(freq="h"))["data"].nunique() - expected = test.groupby(pd.Grouper(freq="h"))["data"].apply(Series.nunique) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "key, data, dropna, expected", - [ - ( - ["x", "x", "x"], - [Timestamp("2019-01-01"), NaT, Timestamp("2019-01-01")], - True, - Series([1], index=pd.Index(["x"], name="key"), name="data"), - ), - ( - ["x", "x", "x"], - [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], - True, - Series([1], index=pd.Index(["x"], name="key"), name="data"), - ), - ( - ["x", "x", "x", "y", "y"], - [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], - False, - Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"), - ), - ( - ["x", "x", "x", "x", "y"], - [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], - False, - Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"), - ), - ], -) -def test_nunique_with_NaT(key, data, dropna, expected): - # GH 27951 - df = DataFrame({"key": key, "data": data}) - result = df.groupby(["key"])["data"].nunique(dropna=dropna) - tm.assert_series_equal(result, expected) - - -def test_nunique_preserves_column_level_names(): - # GH 23222 - test = DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) - result = test.groupby([0, 0, 0]).nunique() - expected = DataFrame([2], index=np.array([0]), columns=test.columns) - tm.assert_frame_equal(result, expected) - - -def test_nunique_transform_with_datetime(): - # GH 35109 - transform with nunique on datetimes results in integers - df = DataFrame(date_range("2008-12-31", "2009-01-02"), columns=["date"]) - result = df.groupby([0, 0, 1])["date"].transform("nunique") - expected = Series([2, 2, 1], name="date") - tm.assert_series_equal(result, expected) - - -def test_empty_categorical(observed): - # GH#21334 - cat = Series([1]).astype("category") - ser = cat[:0] - gb = ser.groupby(ser, observed=observed) - result = gb.nunique() - if observed: - expected = Series([], index=cat[:0], dtype="int64") - else: - expected = Series([0], index=cat, dtype="int64") - 
tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py new file mode 100644 index 0000000000000..fdfb211ac2269 --- /dev/null +++ b/pandas/tests/groupby/test_reductions.py @@ -0,0 +1,838 @@ +import builtins +import datetime as dt +from io import StringIO +from string import ascii_lowercase + +import numpy as np +import pytest + +from pandas._libs.tslibs import iNaT + +import pandas as pd +from pandas import ( + DataFrame, + MultiIndex, + Series, + Timestamp, + date_range, + isna, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("agg_func", ["any", "all"]) +@pytest.mark.parametrize( + "vals", + [ + ["foo", "bar", "baz"], + ["foo", "", ""], + ["", "", ""], + [1, 2, 3], + [1, 0, 0], + [0, 0, 0], + [1.0, 2.0, 3.0], + [1.0, 0.0, 0.0], + [0.0, 0.0, 0.0], + [True, True, True], + [True, False, False], + [False, False, False], + [np.nan, np.nan, np.nan], + ], +) +def test_groupby_bool_aggs(skipna, agg_func, vals): + df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2}) + + # Figure out expectation using Python builtin + exp = getattr(builtins, agg_func)(vals) + + # edge case for missing data with skipna and 'any' + if skipna and all(isna(vals)) and agg_func == "any": + exp = False + + expected = DataFrame( + [exp] * 2, columns=["val"], index=pd.Index(["a", "b"], name="key") + ) + result = getattr(df.groupby("key"), agg_func)(skipna=skipna) + tm.assert_frame_equal(result, expected) + + +def test_any(): + df = DataFrame( + [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], + columns=["A", "B", "C"], + ) + expected = DataFrame( + [[True, True], [False, True]], columns=["B", "C"], index=[1, 3] + ) + expected.index.name = "A" + result = df.groupby("A").any() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_bool_aggs_dup_column_labels(bool_agg_func): + # GH#21668 + df = DataFrame([[True, True]], columns=["a", "a"]) + grp_by = df.groupby([0]) + result = getattr(grp_by, bool_agg_func)() + + expected = df.set_axis(np.array([0])) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +@pytest.mark.parametrize( + "data", + [ + [False, False, False], + [True, True, True], + [pd.NA, pd.NA, pd.NA], + [False, pd.NA, False], + [True, pd.NA, True], + [True, pd.NA, False], + ], +) +def test_masked_kleene_logic(bool_agg_func, skipna, data): + # GH#37506 + ser = Series(data, dtype="boolean") + + # The result should match aggregating on the whole series. 
Correctness + # there is verified in test_reductions.py::test_any_all_boolean_kleene_logic + expected_data = getattr(ser, bool_agg_func)(skipna=skipna) + expected = Series(expected_data, index=np.array([0]), dtype="boolean") + + result = ser.groupby([0, 0, 0]).agg(bool_agg_func, skipna=skipna) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype1,dtype2,exp_col1,exp_col2", + [ + ( + "float", + "Float64", + np.array([True], dtype=bool), + pd.array([pd.NA], dtype="boolean"), + ), + ( + "Int64", + "float", + pd.array([pd.NA], dtype="boolean"), + np.array([True], dtype=bool), + ), + ( + "Int64", + "Int64", + pd.array([pd.NA], dtype="boolean"), + pd.array([pd.NA], dtype="boolean"), + ), + ( + "Float64", + "boolean", + pd.array([pd.NA], dtype="boolean"), + pd.array([pd.NA], dtype="boolean"), + ), + ], +) +def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2): + # GH#37506 + data = [1.0, np.nan] + df = DataFrame( + {"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)} + ) + result = df.groupby([1, 1]).agg("all", skipna=False) + + expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=np.array([1])) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series): + # GH#40585 + obj = frame_or_series([pd.NA, 1], dtype=dtype) + expected_res = True + if not skipna and bool_agg_func == "all": + expected_res = pd.NA + expected = frame_or_series([expected_res], index=np.array([1]), dtype="boolean") + + result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "bool_agg_func,data,expected_res", + [ + ("any", [pd.NA, np.nan], False), + ("any", [pd.NA, 1, np.nan], True), + ("all", [pd.NA, pd.NaT], True), + ("all", [pd.NA, False, pd.NaT], False), + ], +) +def test_object_type_missing_vals(bool_agg_func, data, expected_res, frame_or_series): + # GH#37501 + obj = frame_or_series(data, dtype=object) + result = obj.groupby([1] * len(data)).agg(bool_agg_func) + expected = frame_or_series([expected_res], index=np.array([1]), dtype="bool") + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_object_NA_raises_with_skipna_false(bool_agg_func): + # GH#37501 + ser = Series([pd.NA], dtype=object) + with pytest.raises(TypeError, match="boolean value of NA is ambiguous"): + ser.groupby([1]).agg(bool_agg_func, skipna=False) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_empty(frame_or_series, bool_agg_func): + # GH 45231 + kwargs = {"columns": ["a"]} if frame_or_series is DataFrame else {"name": "a"} + obj = frame_or_series(**kwargs, dtype=object) + result = getattr(obj.groupby(obj.index), bool_agg_func)() + expected = frame_or_series(**kwargs, dtype=bool) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "func, values", + [ + ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}), + ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}), + ], +) +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_idxmin_idxmax_returns_int_types(func, values, numeric_only): + # GH 25444 + df = DataFrame( + { + "name": ["A", "A", "B", "B"], + "c_int": [1, 2, 3, 4], + "c_float": [4.02, 3.03, 2.04, 1.05], + "c_date": ["2019", "2018", "2016", "2017"], + } + ) + df["c_date"] 
= pd.to_datetime(df["c_date"]) + df["c_date_tz"] = df["c_date"].dt.tz_localize("US/Pacific") + df["c_timedelta"] = df["c_date"] - df["c_date"].iloc[0] + df["c_period"] = df["c_date"].dt.to_period("W") + df["c_Integer"] = df["c_int"].astype("Int64") + df["c_Floating"] = df["c_float"].astype("Float64") + + result = getattr(df.groupby("name"), func)(numeric_only=numeric_only) + + expected = DataFrame(values, index=pd.Index(["A", "B"], name="name")) + if numeric_only: + expected = expected.drop(columns=["c_date"]) + else: + expected["c_date_tz"] = expected["c_date"] + expected["c_timedelta"] = expected["c_date"] + expected["c_period"] = expected["c_date"] + expected["c_Integer"] = expected["c_int"] + expected["c_Floating"] = expected["c_float"] + + tm.assert_frame_equal(result, expected) + + +def test_idxmin_idxmax_axis1(): + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] + ) + df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] + + gb = df.groupby("A") + + warn_msg = "DataFrameGroupBy.idxmax with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + res = gb.idxmax(axis=1) + + alt = df.iloc[:, 1:].idxmax(axis=1) + indexer = res.index.get_level_values(1) + + tm.assert_series_equal(alt[indexer], res.droplevel("A")) + + df["E"] = date_range("2016-01-01", periods=10) + gb2 = df.groupby("A") + + msg = "'>' not supported between instances of 'Timestamp' and 'float'" + with pytest.raises(TypeError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + gb2.idxmax(axis=1) + + +def test_groupby_mean_no_overflow(): + # Regression test for (#22487) + df = DataFrame( + { + "user": ["A", "A", "A", "A", "A"], + "connections": [4970, 4749, 4719, 4704, 18446744073699999744], + } + ) + assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 + + +def test_mean_on_timedelta(): + # GH 17382 + df = DataFrame({"time": pd.to_timedelta(range(10)), "cat": ["A", "B"] * 5}) + result = df.groupby("cat")["time"].mean() + expected = Series( + pd.to_timedelta([4, 5]), name="time", index=pd.Index(["A", "B"], name="cat") + ) + tm.assert_series_equal(result, expected) + + +def test_cython_median(): + arr = np.random.default_rng(2).standard_normal(1000) + arr[::2] = np.nan + df = DataFrame(arr) + + labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) + labels[::17] = np.nan + + result = df.groupby(labels).median() + msg = "using DataFrameGroupBy.median" + with tm.assert_produces_warning(FutureWarning, match=msg): + exp = df.groupby(labels).agg(np.nanmedian) + tm.assert_frame_equal(result, exp) + + df = DataFrame(np.random.default_rng(2).standard_normal((1000, 5))) + msg = "using DataFrameGroupBy.median" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = df.groupby(labels).agg(np.median) + xp = df.groupby(labels).median() + tm.assert_frame_equal(rs, xp) + + +def test_median_empty_bins(observed): + df = DataFrame(np.random.default_rng(2).integers(0, 44, 500)) + + grps = range(0, 55, 5) + bins = pd.cut(df[0], grps) + + result = df.groupby(bins, observed=observed).median() + expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) + tm.assert_frame_equal(result, expected) + + +def test_max_min_non_numeric(): + # #2700 + aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]}) + + result = aa.groupby("nn").max() + assert "ss" in result + + result = aa.groupby("nn").max(numeric_only=False) + assert "ss" in result + + result = 
aa.groupby("nn").min() + assert "ss" in result + + result = aa.groupby("nn").min(numeric_only=False) + assert "ss" in result + + +def test_max_min_object_multiple_columns(using_array_manager): + # GH#41111 case where the aggregation is valid for some columns but not + # others; we split object blocks column-wise, consistent with + # DataFrame._reduce + + df = DataFrame( + { + "A": [1, 1, 2, 2, 3], + "B": [1, "foo", 2, "bar", False], + "C": ["a", "b", "c", "d", "e"], + } + ) + df._consolidate_inplace() # should already be consolidate, but double-check + if not using_array_manager: + assert len(df._mgr.blocks) == 2 + + gb = df.groupby("A") + + result = gb[["C"]].max() + # "max" is valid for column "C" but not for "B" + ei = pd.Index([1, 2, 3], name="A") + expected = DataFrame({"C": ["b", "d", "e"]}, index=ei) + tm.assert_frame_equal(result, expected) + + result = gb[["C"]].min() + # "min" is valid for column "C" but not for "B" + ei = pd.Index([1, 2, 3], name="A") + expected = DataFrame({"C": ["a", "c", "e"]}, index=ei) + tm.assert_frame_equal(result, expected) + + +def test_min_date_with_nans(): + # GH26321 + dates = pd.to_datetime( + Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d" + ).dt.date + df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates}) + + result = df.groupby("b", as_index=False)["c"].min()["c"] + expected = pd.to_datetime( + Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d" + ).dt.date + tm.assert_series_equal(result, expected) + + result = df.groupby("b")["c"].min() + expected.index.name = "b" + tm.assert_series_equal(result, expected) + + +def test_max_inat(): + # GH#40767 dont interpret iNaT as NaN + ser = Series([1, iNaT]) + key = np.array([1, 1], dtype=np.int64) + gb = ser.groupby(key) + + result = gb.max(min_count=2) + expected = Series({1: 1}, dtype=np.int64) + tm.assert_series_equal(result, expected, check_exact=True) + + result = gb.min(min_count=2) + expected = Series({1: iNaT}, dtype=np.int64) + tm.assert_series_equal(result, expected, check_exact=True) + + # not enough entries -> gets masked to NaN + result = gb.min(min_count=3) + expected = Series({1: np.nan}) + tm.assert_series_equal(result, expected, check_exact=True) + + +def test_max_inat_not_all_na(): + # GH#40767 dont interpret iNaT as NaN + + # make sure we dont round iNaT+1 to iNaT + ser = Series([1, iNaT, 2, iNaT + 1]) + gb = ser.groupby([1, 2, 3, 3]) + result = gb.min(min_count=2) + + # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. 
is lossy + expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1}) + expected.index = expected.index.astype(int) + tm.assert_series_equal(result, expected, check_exact=True) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_groupby_aggregate_period_column(func): + # GH 31471 + groups = [1, 2] + periods = pd.period_range("2020", periods=2, freq="Y") + df = DataFrame({"a": groups, "b": periods}) + + result = getattr(df.groupby("a")["b"], func)() + idx = pd.Index([1, 2], name="a") + expected = Series(periods, index=idx, name="b") + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_groupby_aggregate_period_frame(func): + # GH 31471 + groups = [1, 2] + periods = pd.period_range("2020", periods=2, freq="Y") + df = DataFrame({"a": groups, "b": periods}) + + result = getattr(df.groupby("a"), func)() + idx = pd.Index([1, 2], name="a") + expected = DataFrame({"b": periods}, index=idx) + + tm.assert_frame_equal(result, expected) + + +def test_aggregate_numeric_object_dtype(): + # https://github.com/pandas-dev/pandas/issues/39329 + # simplified case: multiple object columns where one is all-NaN + # -> gets split as the all-NaN is inferred as float + df = DataFrame( + {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4}, + ).astype(object) + result = df.groupby("key").min() + expected = ( + DataFrame( + {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}, + ) + .set_index("key") + .astype(object) + ) + tm.assert_frame_equal(result, expected) + + # same but with numbers + df = DataFrame( + {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)}, + ).astype(object) + result = df.groupby("key").min() + expected = ( + DataFrame({"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}) + .set_index("key") + .astype(object) + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_aggregate_categorical_lost_index(func: str): + # GH: 28641 groupby drops index, when grouping over categorical column with min/max + ds = Series(["b"], dtype="category").cat.as_ordered() + df = DataFrame({"A": [1997], "B": ds}) + result = df.groupby("A").agg({"B": func}) + expected = DataFrame({"B": ["b"]}, index=pd.Index([1997], name="A")) + + # ordered categorical dtype should be preserved + expected["B"] = expected["B"].astype(ds.dtype) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"]) +def test_groupby_min_max_nullable(dtype): + if dtype == "Int64": + # GH#41743 avoid precision loss + ts = 1618556707013635762 + elif dtype == "boolean": + ts = 0 + else: + ts = 4.0 + + df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]}) + df["ts"] = df["ts"].astype(dtype) + + gb = df.groupby("id") + + result = gb.min() + expected = df.iloc[:1].set_index("id") + tm.assert_frame_equal(result, expected) + + res_max = gb.max() + expected_max = df.iloc[1:].set_index("id") + tm.assert_frame_equal(res_max, expected_max) + + result2 = gb.min(min_count=3) + expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype) + tm.assert_frame_equal(result2, expected2) + + res_max2 = gb.max(min_count=3) + tm.assert_frame_equal(res_max2, expected2) + + # Case with NA values + df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]}) + df2["ts"] = df2["ts"].astype(dtype) + gb2 = df2.groupby("id") + + result3 = gb2.min() + tm.assert_frame_equal(result3, expected) + + res_max3 = gb2.max() + 
tm.assert_frame_equal(res_max3, expected_max) + + result4 = gb2.min(min_count=100) + tm.assert_frame_equal(result4, expected2) + + res_max4 = gb2.max(min_count=100) + tm.assert_frame_equal(res_max4, expected2) + + +def test_min_max_nullable_uint64_empty_group(): + # don't raise NotImplementedError from libgroupby + cat = pd.Categorical([0] * 10, categories=[0, 1]) + df = DataFrame({"A": cat, "B": pd.array(np.arange(10, dtype=np.uint64))}) + gb = df.groupby("A", observed=False) + + res = gb.min() + + idx = pd.CategoricalIndex([0, 1], dtype=cat.dtype, name="A") + expected = DataFrame({"B": pd.array([0, pd.NA], dtype="UInt64")}, index=idx) + tm.assert_frame_equal(res, expected) + + res = gb.max() + expected.iloc[0, 0] = 9 + tm.assert_frame_equal(res, expected) + + +@pytest.mark.parametrize("func", ["first", "last", "min", "max"]) +def test_groupby_min_max_categorical(func): + # GH: 52151 + df = DataFrame( + { + "col1": pd.Categorical(["A"], categories=list("AB"), ordered=True), + "col2": pd.Categorical([1], categories=[1, 2], ordered=True), + "value": 0.1, + } + ) + result = getattr(df.groupby("col1", observed=False), func)() + + idx = pd.CategoricalIndex(data=["A", "B"], name="col1", ordered=True) + expected = DataFrame( + { + "col2": pd.Categorical([1, None], categories=[1, 2], ordered=True), + "value": [0.1, None], + }, + index=idx, + ) + tm.assert_frame_equal(result, expected) + + +def test_max_nan_bug(): + raw = """,Date,app,File +-04-23,2013-04-23 00:00:00,,log080001.log +-05-06,2013-05-06 00:00:00,,log.log +-05-07,2013-05-07 00:00:00,OE,xlsx""" + + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + df = pd.read_csv(StringIO(raw), parse_dates=[0]) + gb = df.groupby("Date") + r = gb[["File"]].max() + e = gb["File"].max().to_frame() + tm.assert_frame_equal(r, e) + assert not r["File"].isna().any() + + +@pytest.mark.slow +@pytest.mark.parametrize("sort", [False, True]) +@pytest.mark.parametrize("dropna", [False, True]) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("with_nan", [True, False]) +@pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]]) +def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys): + n = 100 + m = 10 + days = date_range("2015-08-23", periods=10) + df = DataFrame( + { + "jim": np.random.default_rng(2).choice(list(ascii_lowercase), n), + "joe": np.random.default_rng(2).choice(days, n), + "julie": np.random.default_rng(2).integers(0, m, n), + } + ) + if with_nan: + df = df.astype({"julie": float}) # Explicit cast to avoid implicit cast below + df.loc[1::17, "jim"] = None + df.loc[3::37, "joe"] = None + df.loc[7::19, "julie"] = None + df.loc[8::19, "julie"] = None + df.loc[9::19, "julie"] = None + original_df = df.copy() + gr = df.groupby(keys, as_index=as_index, sort=sort) + left = gr["julie"].nunique(dropna=dropna) + + gr = df.groupby(keys, as_index=as_index, sort=sort) + right = gr["julie"].apply(Series.nunique, dropna=dropna) + if not as_index: + right = right.reset_index(drop=True) + + if as_index: + tm.assert_series_equal(left, right, check_names=False) + else: + tm.assert_frame_equal(left, right, check_names=False) + tm.assert_frame_equal(df, original_df) + + +def test_nunique(): + df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")}) + + expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]}) + result = df.groupby("A", as_index=False).nunique() + tm.assert_frame_equal(result, expected) + + # as_index + expected.index = list("abc") + 
expected.index.name = "A" + expected = expected.drop(columns="A") + result = df.groupby("A").nunique() + tm.assert_frame_equal(result, expected) + + # with na + result = df.replace({"x": None}).groupby("A").nunique(dropna=False) + tm.assert_frame_equal(result, expected) + + # dropna + expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc")) + expected.index.name = "A" + result = df.replace({"x": None}).groupby("A").nunique() + tm.assert_frame_equal(result, expected) + + +def test_nunique_with_object(): + # GH 11077 + data = DataFrame( + [ + [100, 1, "Alice"], + [200, 2, "Bob"], + [300, 3, "Charlie"], + [-400, 4, "Dan"], + [500, 5, "Edith"], + ], + columns=["amount", "id", "name"], + ) + + result = data.groupby(["id", "amount"])["name"].nunique() + index = MultiIndex.from_arrays([data.id, data.amount]) + expected = Series([1] * 5, name="name", index=index) + tm.assert_series_equal(result, expected) + + +def test_nunique_with_empty_series(): + # GH 12553 + data = Series(name="name", dtype=object) + result = data.groupby(level=0).nunique() + expected = Series(name="name", dtype="int64") + tm.assert_series_equal(result, expected) + + +def test_nunique_with_timegrouper(): + # GH 13453 + test = DataFrame( + { + "time": [ + Timestamp("2016-06-28 09:35:35"), + Timestamp("2016-06-28 16:09:30"), + Timestamp("2016-06-28 16:46:28"), + ], + "data": ["1", "2", "3"], + } + ).set_index("time") + result = test.groupby(pd.Grouper(freq="h"))["data"].nunique() + expected = test.groupby(pd.Grouper(freq="h"))["data"].apply(Series.nunique) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "key, data, dropna, expected", + [ + ( + ["x", "x", "x"], + [Timestamp("2019-01-01"), pd.NaT, Timestamp("2019-01-01")], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x"], + [dt.date(2019, 1, 1), pd.NaT, dt.date(2019, 1, 1)], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "y", "y"], + [ + dt.date(2019, 1, 1), + pd.NaT, + dt.date(2019, 1, 1), + pd.NaT, + dt.date(2019, 1, 1), + ], + False, + Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "x", "y"], + [ + dt.date(2019, 1, 1), + pd.NaT, + dt.date(2019, 1, 1), + pd.NaT, + dt.date(2019, 1, 1), + ], + False, + Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ], +) +def test_nunique_with_NaT(key, data, dropna, expected): + # GH 27951 + df = DataFrame({"key": key, "data": data}) + result = df.groupby(["key"])["data"].nunique(dropna=dropna) + tm.assert_series_equal(result, expected) + + +def test_nunique_preserves_column_level_names(): + # GH 23222 + test = DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) + result = test.groupby([0, 0, 0]).nunique() + expected = DataFrame([2], index=np.array([0]), columns=test.columns) + tm.assert_frame_equal(result, expected) + + +def test_nunique_transform_with_datetime(): + # GH 35109 - transform with nunique on datetimes results in integers + df = DataFrame(date_range("2008-12-31", "2009-01-02"), columns=["date"]) + result = df.groupby([0, 0, 1])["date"].transform("nunique") + expected = Series([2, 2, 1], name="date") + tm.assert_series_equal(result, expected) + + +def test_empty_categorical(observed): + # GH#21334 + cat = Series([1]).astype("category") + ser = cat[:0] + gb = ser.groupby(ser, observed=observed) + result = gb.nunique() + if observed: + expected = Series([], index=cat[:0], dtype="int64") + else: + expected = 
Series([0], index=cat, dtype="int64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("min_count", [0, 10]) +def test_groupby_sum_mincount_boolean(min_count): + b = True + a = False + na = np.nan + dfg = pd.array([b, b, na, na, a, a, b], dtype="boolean") + + df = DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": dfg}) + result = df.groupby("A").sum(min_count=min_count) + if min_count == 0: + expected = DataFrame( + {"B": pd.array([3, 0, 0], dtype="Int64")}, + index=pd.Index([1, 2, 3], name="A"), + ) + tm.assert_frame_equal(result, expected) + else: + expected = DataFrame( + {"B": pd.array([pd.NA] * 3, dtype="Int64")}, + index=pd.Index([1, 2, 3], name="A"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_sum_below_mincount_nullable_integer(): + # https://github.com/pandas-dev/pandas/issues/32861 + df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") + grouped = df.groupby("a") + idx = pd.Index([0, 1, 2], name="a", dtype="Int64") + + result = grouped["b"].sum(min_count=2) + expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") + tm.assert_series_equal(result, expected) + + result = grouped.sum(min_count=2) + expected = DataFrame({"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx) + tm.assert_frame_equal(result, expected) + + +def test_groupby_sum_timedelta_with_nat(): + # GH#42659 + df = DataFrame( + { + "a": [1, 1, 2, 2], + "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT], + } + ) + td3 = pd.Timedelta(days=3) + + gb = df.groupby("a") + + res = gb.sum() + expected = DataFrame({"b": [td3, td3]}, index=pd.Index([1, 2], name="a")) + tm.assert_frame_equal(res, expected) + + res = gb["b"].sum() + tm.assert_series_equal(res, expected["b"]) + + res = gb["b"].sum(min_count=2) + expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index) + tm.assert_series_equal(res, expected) From fe07fd5a201c551df679053fbe068302a1337a4a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 12 Oct 2023 18:32:58 -0400 Subject: [PATCH 08/19] BLD: Pin numpy to < 2 (#55488) * BLD: Pin numpy to < 2 * typo * Update action.yml * Update unit-tests.yml --- .github/actions/build_pandas/action.yml | 4 ++-- .github/workflows/unit-tests.yml | 2 +- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311-pyarrownightly.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-39-minimum_versions.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/actions-pypy-39.yaml | 2 +- ci/deps/circle-310-arm64.yaml | 2 +- environment.yml | 2 +- pyproject.toml | 8 ++++---- requirements-dev.txt | 2 +- 13 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml index 73d7723e2fb49..3ee10efaaf96f 100644 --- a/.github/actions/build_pandas/action.yml +++ b/.github/actions/build_pandas/action.yml @@ -25,8 +25,8 @@ runs: - name: Build Pandas run: | if [[ ${{ inputs.editable }} == "true" ]]; then - pip install -e . --no-build-isolation -v + pip install -e . --no-build-isolation -v --no-deps else - pip install . --no-build-isolation -v + pip install . 
--no-build-isolation -v --no-deps fi shell: bash -el {0} diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 96010a4a0227d..53a1f5b95374d 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -348,7 +348,7 @@ jobs: python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] python -m pip install python-dateutil pytz tzdata "cython<3.0.3" hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 - python -m pip install -ve . --no-build-isolation --no-index + python -m pip install -ve . --no-build-isolation --no-index --no-deps python -m pip list - name: Run Tests diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index ebd1556b8a5f5..180d425b07d82 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 4d0406814c873..c8a5d8cfb7640 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -21,7 +21,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index c259286a5359c..1a770d74043bf 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -19,7 +19,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz - pip diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index f6df5a6e894a7..3d1df9e3bcd59 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 586768765325e..691bdf25d4d96 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -22,7 +22,7 @@ dependencies: # required dependencies - python-dateutil=2.8.2 - - numpy=1.22.4 + - numpy=1.22.4, <2 - pytz=2020.1 # optional dependencies diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 3751651a2a2f2..199ce4dea1ac0 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index db0723cd3b8fa..a2f4d6395783a 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -21,7 +21,7 @@ dependencies: - hypothesis>=6.46.1 # required - - numpy + - numpy<2 - python-dateutil - pytz - pip: diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 65f72278a0291..c9f46b98273b3 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/environment.yml b/environment.yml index db6138c34f37c..a9648f3298198 100644 --- a/environment.yml +++ b/environment.yml @@ -21,7 +21,7 @@ dependencies: # required dependencies - 
python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/pyproject.toml b/pyproject.toml index a8388a9ff52de..a5aaa72289209 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ requires = [ # we don't want to force users to compile with 1.25 though # (Ideally, in the future, though, oldest-supported-numpy can be dropped when our min numpy is 1.25.x) "oldest-supported-numpy>=2022.8.16; python_version<'3.12'", - "numpy>=1.26.0; python_version>='3.12'", + "numpy>=1.26.0,<2; python_version>='3.12'", "versioneer[toml]" ] @@ -29,9 +29,9 @@ authors = [ license = {file = 'LICENSE'} requires-python = '>=3.9' dependencies = [ - "numpy>=1.22.4; python_version<'3.11'", - "numpy>=1.23.2; python_version=='3.11'", - "numpy>=1.26.0; python_version>='3.12'", + "numpy>=1.22.4,<2; python_version<'3.11'", + "numpy>=1.23.2,<2; python_version=='3.11'", + "numpy>=1.26.0,<2; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", "tzdata>=2022.1" diff --git a/requirements-dev.txt b/requirements-dev.txt index 98339f45a5052..6e1a6058dce0e 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -12,7 +12,7 @@ pytest-xdist>=2.2.0 pytest-asyncio>=0.17.0 coverage python-dateutil -numpy +numpy<2 pytz beautifulsoup4>=4.11.1 blosc From 579b8268474c5c89133ae7106a323dd94e1e33fa Mon Sep 17 00:00:00 2001 From: Paras Gupta Date: Fri, 13 Oct 2023 21:57:36 +0530 Subject: [PATCH 09/19] BUG: categorical dtype equality for level in different type (#55486) * BUG: categorical dtype equality for level in different type * BUG: categorical dtype equality for level in different type - Comments#1 --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/dtypes/dtypes.py | 2 +- pandas/tests/dtypes/test_dtypes.py | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index eec82ae26afcc..ef1a34c4c27c6 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -285,6 +285,7 @@ Bug fixes Categorical ^^^^^^^^^^^ - :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`) +- Bug in :meth:`CategoricalDtype.__eq__` returning false for unordered categorical data with mixed types (:issue:`55468`) - Datetimelike diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 196c95c3673e9..9f85f82df666f 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -456,7 +456,7 @@ def __eq__(self, other: object) -> bool: # With object-dtype we need a comparison that identifies # e.g. 
int(2) as distinct from float(2) - return hash(self) == hash(other) + return set(left) == set(right) def __repr__(self) -> str_type: if self.categories is None: diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 1f9c371c50ad4..27994708d2bdb 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -917,6 +917,24 @@ def test_equal_but_different(self): assert c1 is not c2 assert c1 != c2 + def test_equal_but_different_mixed_dtypes(self): + c1 = CategoricalDtype([1, 2, "3"]) + c2 = CategoricalDtype(["3", 1, 2]) + assert c1 is not c2 + assert c1 == c2 + + def test_equal_empty_ordered(self): + c1 = CategoricalDtype([], ordered=True) + c2 = CategoricalDtype([], ordered=True) + assert c1 is not c2 + assert c1 == c2 + + def test_equal_empty_unordered(self): + c1 = CategoricalDtype([]) + c2 = CategoricalDtype([]) + assert c1 is not c2 + assert c1 == c2 + @pytest.mark.parametrize("v1, v2", [([1, 2, 3], [1, 2, 3]), ([1, 2, 3], [3, 2, 1])]) def test_order_hashes_different(self, v1, v2): c1 = CategoricalDtype(v1, ordered=False) From c1d36044bd80e987105518a8986b016f7b54d584 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 13 Oct 2023 12:32:41 -0400 Subject: [PATCH 10/19] Update get_utc declarations (#55507) * updated parameter definitions * revert memview -> ndarray declaration --- pandas/_libs/tslibs/tzconversion.pyx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index a96779aa33255..9c8865fbdf428 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -425,7 +425,11 @@ timedelta-like} return result.base # .base to get underlying ndarray -cdef Py_ssize_t bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n): +cdef Py_ssize_t bisect_right_i8( + const int64_t *data, + int64_t val, + Py_ssize_t n +) noexcept: # Caller is responsible for checking n > 0 # This looks very similar to local_search_right in the ndarray.searchsorted # implementation. 
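+    # i.e. return the index of the first element of data strictly greater
+    # than val, given data sorted ascending. For reference, a rough
+    # pure-Python sketch of such a right bisection (not the exact body):
+    #
+    #     lo, hi = 0, n
+    #     while lo < hi:
+    #         mid = (lo + hi) // 2
+    #         if val < data[mid]:
+    #             hi = mid
+    #         else:
+    #             lo = mid + 1
+    #     return lo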
@@ -463,7 +467,7 @@ cdef str _render_tstamp(int64_t val, NPY_DATETIMEUNIT creso): cdef _get_utc_bounds( ndarray[int64_t] vals, - int64_t* tdata, + const int64_t* tdata, Py_ssize_t ntrans, const int64_t[::1] deltas, NPY_DATETIMEUNIT creso, From e1368cfd506a782f386a0c3afe02b32c31e2856e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 13 Oct 2023 09:35:01 -0700 Subject: [PATCH 11/19] BUG: tslibs uncaught overflows (#55503) * BUG: tslibs uncaught overflows * GH refs * windows/32bit builds --- doc/source/whatsnew/v2.2.0.rst | 8 +++--- pandas/_libs/tslibs/offsets.pyx | 7 ++++- pandas/_libs/tslibs/period.pyx | 7 ++--- pandas/_libs/tslibs/timedeltas.pyx | 26 ++++++++++++++----- pandas/tests/scalar/period/test_period.py | 20 ++++++++++++++ .../scalar/timedelta/test_constructors.py | 10 +++++++ .../tests/scalar/timestamp/test_arithmetic.py | 9 ++----- pandas/tests/tseries/offsets/test_ticks.py | 9 +++++++ 8 files changed, 75 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ef1a34c4c27c6..e495c3c204de5 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -291,12 +291,14 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`) -- +- Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) + Timedelta ^^^^^^^^^ +- Bug in :class:`Timedelta` construction raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in rendering (``__repr__``) of :class:`TimedeltaIndex` and :class:`Series` with timedelta64 values with non-nanosecond resolution entries that are all multiples of 24 hours failing to use the compact representation used in the nanosecond cases (:issue:`55405`) -- Timezones ^^^^^^^^^ @@ -353,7 +355,7 @@ I/O Period ^^^^^^ -- +- Bug in :class:`Period` addition silently wrapping around instead of raising ``OverflowError`` (:issue:`55503`) - Plotting diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 6a6f30de8dade..6c5cdde20da5f 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -961,7 +961,12 @@ cdef class Tick(SingleConstructorOffset): @property def delta(self): - return self.n * Timedelta(self._nanos_inc) + try: + return self.n * Timedelta(self._nanos_inc) + except OverflowError as err: + # GH#55503 as_unit will raise a more useful OutOfBoundsTimedelta + Timedelta(self).as_unit("ns") + raise AssertionError("This should not be reached.") @property def nanos(self) -> int64_t: diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index cacfe43b236d8..d305f27dd1090 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1814,7 +1814,7 @@ cdef class _Period(PeriodMixin): def _add_timedeltalike_scalar(self, other) -> "Period": cdef: - int64_t inc + int64_t inc, ordinal if not self._dtype._is_tick_like(): raise IncompatibleFrequency("Input cannot be converted to " @@ -1832,8 +1832,8 @@ cdef class _Period(PeriodMixin): except ValueError as err: raise IncompatibleFrequency("Input cannot be converted to " f"Period(freq={self.freqstr})") from err - # TODO: overflow-check 
here - ordinal = self.ordinal + inc + with cython.overflowcheck(True): + ordinal = self.ordinal + inc return Period(ordinal=ordinal, freq=self.freq) def _add_offset(self, other) -> "Period": @@ -1846,6 +1846,7 @@ cdef class _Period(PeriodMixin): ordinal = self.ordinal + other.n return Period(ordinal=ordinal, freq=self.freq) + @cython.overflowcheck(True) def __add__(self, other): if not is_period_object(self): # cython semantics; this is analogous to a call to __radd__ diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index e5d81bd5928b9..a573d9a8ed0c0 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1784,7 +1784,7 @@ class Timedelta(_Timedelta): ) # GH43764, convert any input to nanoseconds first and then - # create the timestamp. This ensures that any potential + # create the timedelta. This ensures that any potential # nanosecond contributions from kwargs parsed as floats # are taken into consideration. seconds = int(( @@ -1797,12 +1797,24 @@ class Timedelta(_Timedelta): ) * 1_000_000_000 ) - value = np.timedelta64( - int(kwargs.get("nanoseconds", 0)) - + int(kwargs.get("microseconds", 0) * 1_000) - + int(kwargs.get("milliseconds", 0) * 1_000_000) - + seconds - ) + ns = kwargs.get("nanoseconds", 0) + us = kwargs.get("microseconds", 0) + ms = kwargs.get("milliseconds", 0) + try: + value = np.timedelta64( + int(ns) + + int(us * 1_000) + + int(ms * 1_000_000) + + seconds + ) + except OverflowError as err: + # GH#55503 + msg = ( + f"seconds={seconds}, milliseconds={ms}, " + f"microseconds={us}, nanoseconds={ns}" + ) + raise OutOfBoundsTimedelta(msg) from err + if unit in {"Y", "y", "M"}: raise ValueError( "Units 'M', 'Y', and 'y' are no longer supported, as they do not " diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 6c27881e44b56..dc2938ec345f3 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -1182,6 +1182,26 @@ def test_comparison_numpy_zerodim_arr(self, zerodim_arr, expected): class TestArithmetic: + def test_add_overflow_raises(self): + # GH#55503 + per = Timestamp.max.to_period("ns") + + msg = "|".join( + [ + "Python int too large to convert to C long", + # windows, 32bit linux builds + "int too big to convert", + ] + ) + with pytest.raises(OverflowError, match=msg): + per + 1 + + msg = "value too large" + with pytest.raises(OverflowError, match=msg): + per + Timedelta(1) + with pytest.raises(OverflowError, match=msg): + per + offsets.Nano(1) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "m"]) def test_add_sub_td64_nat(self, unit): # GH#47196 diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 0d876fbb9bde8..858ab29d79c5e 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -15,6 +15,16 @@ ) +def test_construct_from_kwargs_overflow(): + # GH#55503 + msg = "seconds=86400000000000000000, milliseconds=0, microseconds=0, nanoseconds=0" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + Timedelta(days=10**6) + msg = "seconds=60000000000000000000, milliseconds=0, microseconds=0, nanoseconds=0" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + Timedelta(minutes=10**9) + + def test_construct_with_weeks_unit_overflow(): # GH#47268 don't silently wrap around with pytest.raises(OutOfBoundsTimedelta, match="without overflow"): diff --git 
a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index f5c9c576abc24..9c24d364841d1 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -40,17 +40,12 @@ def test_overflow_offset_raises(self): stamp = Timestamp("2017-01-13 00:00:00").as_unit("ns") offset_overflow = 20169940 * offsets.Day(1) - msg = ( - "the add operation between " - r"\<-?\d+ \* Days\> and \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} " - "will overflow" - ) lmsg2 = r"Cannot cast -?20169940 days \+?00:00:00 to unit='ns' without overflow" with pytest.raises(OutOfBoundsTimedelta, match=lmsg2): stamp + offset_overflow - with pytest.raises(OverflowError, match=msg): + with pytest.raises(OutOfBoundsTimedelta, match=lmsg2): offset_overflow + stamp with pytest.raises(OutOfBoundsTimedelta, match=lmsg2): @@ -68,7 +63,7 @@ def test_overflow_offset_raises(self): with pytest.raises(OutOfBoundsTimedelta, match=lmsg3): stamp + offset_overflow - with pytest.raises(OverflowError, match=msg): + with pytest.raises(OutOfBoundsTimedelta, match=lmsg3): offset_overflow + stamp with pytest.raises(OutOfBoundsTimedelta, match=lmsg3): diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index 69953955ebbce..abf187ace7cb3 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -15,6 +15,7 @@ import pytest from pandas._libs.tslibs.offsets import delta_to_tick +from pandas.errors import OutOfBoundsTimedelta from pandas import ( Timedelta, @@ -237,6 +238,14 @@ def test_tick_addition(kls, expected): assert result == expected +def test_tick_delta_overflow(): + # GH#55503 raise OutOfBoundsTimedelta, not OverflowError + tick = offsets.Day(10**9) + msg = "Cannot cast 1000000000 days 00:00:00 to unit='ns' without overflow" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + tick.delta + + @pytest.mark.parametrize("cls", tick_classes) def test_tick_division(cls): off = cls(10) From abba4e2df1ba1651f89ca4b02f8eeeffc17375c2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 13 Oct 2023 09:41:22 -0700 Subject: [PATCH 12/19] REF: share Index/Block get_values_for_csv (#55485) * REF: share Index/Block get_values_for_csv * mypy fixup --- pandas/core/indexes/base.py | 154 +++++++++++++++--- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/datetimes.py | 1 - pandas/core/indexes/multi.py | 6 +- pandas/core/indexes/period.py | 2 +- pandas/core/indexes/timedeltas.py | 1 - pandas/core/internals/array_manager.py | 2 +- pandas/core/internals/blocks.py | 93 +---------- pandas/io/formats/csvs.py | 15 +- pandas/io/formats/format.py | 2 +- .../tests/indexes/datetimes/test_formats.py | 16 +- pandas/tests/indexes/interval/test_formats.py | 2 +- pandas/tests/indexes/period/test_formats.py | 10 +- 13 files changed, 158 insertions(+), 148 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 515f750f11219..f53c5606db4d3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -30,6 +30,7 @@ algos as libalgos, index as libindex, lib, + writers, ) from pandas._libs.internals import BlockValuesRefs import pandas._libs.join as libjoin @@ -97,7 +98,6 @@ is_bool_dtype, is_ea_or_datetimelike_dtype, is_float, - is_float_dtype, is_hashable, is_integer, is_iterator, @@ -119,6 +119,7 @@ ExtensionDtype, IntervalDtype, PeriodDtype, + SparseDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ 
-151,7 +152,9 @@ ArrowExtensionArray, BaseMaskedArray, Categorical, + DatetimeArray, ExtensionArray, + TimedeltaArray, ) from pandas.core.arrays.string_ import StringArray from pandas.core.base import ( @@ -199,7 +202,10 @@ MultiIndex, Series, ) - from pandas.core.arrays import PeriodArray + from pandas.core.arrays import ( + IntervalArray, + PeriodArray, + ) __all__ = ["Index"] @@ -1403,7 +1409,7 @@ def _format_with_header(self, *, header: list[str_t], na_rep: str_t) -> list[str result = trim_front(formatted) return header + result - def _format_native_types( + def _get_values_for_csv( self, *, na_rep: str_t = "", @@ -1412,30 +1418,14 @@ def _format_native_types( date_format=None, quoting=None, ) -> npt.NDArray[np.object_]: - """ - Actually format specific types of the index. - """ - from pandas.io.formats.format import FloatArrayFormatter - - if is_float_dtype(self.dtype) and not isinstance(self.dtype, ExtensionDtype): - formatter = FloatArrayFormatter( - self._values, - na_rep=na_rep, - float_format=float_format, - decimal=decimal, - quoting=quoting, - fixed_width=False, - ) - return formatter.get_result_as_array() - - mask = isna(self) - if self.dtype != object and not quoting: - values = np.asarray(self).astype(str) - else: - values = np.array(self, dtype=object, copy=True) - - values[mask] = na_rep - return values + return get_values_for_csv( + self._values, + na_rep=na_rep, + decimal=decimal, + float_format=float_format, + date_format=date_format, + quoting=quoting, + ) def _summary(self, name=None) -> str_t: """ @@ -7629,3 +7619,113 @@ def _maybe_try_sort(result: Index | ArrayLike, sort: bool | None): stacklevel=find_stack_level(), ) return result + + +def get_values_for_csv( + values: ArrayLike, + *, + date_format, + na_rep: str = "nan", + quoting=None, + float_format=None, + decimal: str = ".", +) -> npt.NDArray[np.object_]: + """ + Convert to types which can be consumed by the standard library's + csv.writer.writerows. 
+    """
+    if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm":
+        # GH#40754 Convert categorical datetimes to datetime array
+        values = algos.take_nd(
+            values.categories._values,
+            ensure_platform_int(values._codes),
+            fill_value=na_rep,
+        )
+
+    values = ensure_wrapped_if_datetimelike(values)
+
+    if isinstance(values, (DatetimeArray, TimedeltaArray)):
+        if values.ndim == 1:
+            result = values._format_native_types(na_rep=na_rep, date_format=date_format)
+            result = result.astype(object, copy=False)
+            return result
+
+        # GH#21734 Process every column separately, they might have different formats
+        results_converted = []
+        for i in range(len(values)):
+            result = values[i, :]._format_native_types(
+                na_rep=na_rep, date_format=date_format
+            )
+            results_converted.append(result.astype(object, copy=False))
+        return np.vstack(results_converted)
+
+    elif isinstance(values.dtype, PeriodDtype):
+        # TODO: tests that get here in column path
+        values = cast("PeriodArray", values)
+        res = values._format_native_types(na_rep=na_rep, date_format=date_format)
+        return res
+
+    elif isinstance(values.dtype, IntervalDtype):
+        # TODO: tests that get here in column path
+        values = cast("IntervalArray", values)
+        mask = values.isna()
+        if not quoting:
+            result = np.asarray(values).astype(str)
+        else:
+            result = np.array(values, dtype=object, copy=True)
+
+        result[mask] = na_rep
+        return result
+
+    elif values.dtype.kind == "f" and not isinstance(values.dtype, SparseDtype):
+        # see GH#13418: no special formatting is desired at the
+        # output (important for appropriate 'quoting' behaviour),
+        # so do not pass it through the FloatArrayFormatter
+        if float_format is None and decimal == ".":
+            mask = isna(values)
+
+            if not quoting:
+                values = values.astype(str)
+            else:
+                values = np.array(values, dtype="object")
+
+            values[mask] = na_rep
+            values = values.astype(object, copy=False)
+            return values
+
+        from pandas.io.formats.format import FloatArrayFormatter
+
+        formatter = FloatArrayFormatter(
+            values,
+            na_rep=na_rep,
+            float_format=float_format,
+            decimal=decimal,
+            quoting=quoting,
+            fixed_width=False,
+        )
+        res = formatter.get_result_as_array()
+        res = res.astype(object, copy=False)
+        return res
+
+    elif isinstance(values, ExtensionArray):
+        mask = isna(values)
+
+        new_values = np.asarray(values.astype(object))
+        new_values[mask] = na_rep
+        return new_values
+
+    else:
+        mask = isna(values)
+        itemsize = writers.word_len(na_rep)
+
+        if values.dtype != _dtype_obj and not quoting and itemsize:
+            values = values.astype(str)
+            if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize:
+                # enlarge for the na_rep
+                values = values.astype(f"<U{itemsize}")
+        else:
+            values = np.array(values, dtype="object")
+
+        values[mask] = na_rep
+        values = values.astype(object, copy=False)
+        return values
diff --git a/pandas/core/indexes/multi.py
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
-    def _format_native_types(
+    def _get_values_for_csv(
         self, *, na_rep: str = "nan", **kwargs
     ) -> npt.NDArray[np.object_]:
         new_levels = []
@@ -1392,7 +1392,7 @@ def _format_native_types(
 
         # go through the levels and format them
         for level, level_codes in zip(self.levels, self.codes):
-            level_strs = level._format_native_types(na_rep=na_rep, **kwargs)
+            level_strs = level._get_values_for_csv(na_rep=na_rep, **kwargs)
             # add nan values, if there are any
             mask = level_codes == -1
             if mask.any():
@@ -1408,7 +1408,7 @@ def _format_native_types(
 
         if len(new_levels) == 1:
             # a single-level multi-index
-            return Index(new_levels[0].take(new_codes[0]))._format_native_types()
+            return Index(new_levels[0].take(new_codes[0]))._get_values_for_csv()
         else:
             # reconstruct the multi-index
             mi = MultiIndex(
diff --git a/pandas/core/indexes/period.py
index b1023febe813d..09b41d9c32ec2 100644
--- a/pandas/core/indexes/period.py
+++
b/pandas/core/indexes/period.py @@ -80,7 +80,7 @@ def _new_PeriodIndex(cls, **d): PeriodArray, wrap=True, ) -@inherit_names(["is_leap_year", "_format_native_types"], PeriodArray) +@inherit_names(["is_leap_year"], PeriodArray) class PeriodIndex(DatetimeIndexOpsMixin): """ Immutable ndarray holding ordinal values indicating regular periods in time. diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 498fe56a7ae7f..b1d8d0efb60e8 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -48,7 +48,6 @@ "sum", "std", "median", - "_format_native_types", ], TimedeltaArray, ) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 99af4f51661b1..9987908f407b3 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -68,6 +68,7 @@ Index, ensure_index, ) +from pandas.core.indexes.base import get_values_for_csv from pandas.core.internals.base import ( DataManager, SingleDataManager, @@ -79,7 +80,6 @@ ensure_block_shape, external_values, extract_pandas_array, - get_values_for_csv, maybe_coerce_values, new_block, ) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f0c14eec81c3c..330effe0f0a9f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -24,7 +24,6 @@ NaT, internals as libinternals, lib, - writers, ) from pandas._libs.internals import ( BlockPlacement, @@ -61,7 +60,6 @@ np_can_hold_element, ) from pandas.core.dtypes.common import ( - ensure_platform_int, is_1d_only_ea_dtype, is_float_dtype, is_integer_dtype, @@ -75,7 +73,6 @@ IntervalDtype, NumpyEADtype, PeriodDtype, - SparseDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -122,6 +119,7 @@ extract_array, ) from pandas.core.indexers import check_setitem_lengths +from pandas.core.indexes.base import get_values_for_csv if TYPE_CHECKING: from collections.abc import ( @@ -2602,95 +2600,6 @@ def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: return values -def get_values_for_csv( - values: ArrayLike, - *, - date_format, - na_rep: str = "nan", - quoting=None, - float_format=None, - decimal: str = ".", -) -> npt.NDArray[np.object_]: - """convert to our native types format""" - if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm": - # GH#40754 Convert categorical datetimes to datetime array - values = algos.take_nd( - values.categories._values, - ensure_platform_int(values._codes), - fill_value=na_rep, - ) - - values = ensure_wrapped_if_datetimelike(values) - - if isinstance(values, (DatetimeArray, TimedeltaArray)): - if values.ndim == 1: - result = values._format_native_types(na_rep=na_rep, date_format=date_format) - result = result.astype(object, copy=False) - return result - - # GH#21734 Process every column separately, they might have different formats - results_converted = [] - for i in range(len(values)): - result = values[i, :]._format_native_types( - na_rep=na_rep, date_format=date_format - ) - results_converted.append(result.astype(object, copy=False)) - return np.vstack(results_converted) - - elif values.dtype.kind == "f" and not isinstance(values.dtype, SparseDtype): - # see GH#13418: no special formatting is desired at the - # output (important for appropriate 'quoting' behaviour), - # so do not pass it through the FloatArrayFormatter - if float_format is None and decimal == ".": - mask = isna(values) - - if not quoting: - values = values.astype(str) - else: - 
values = np.array(values, dtype="object")
-
-            values[mask] = na_rep
-            values = values.astype(object, copy=False)
-            return values
-
-        from pandas.io.formats.format import FloatArrayFormatter
-
-        formatter = FloatArrayFormatter(
-            values,
-            na_rep=na_rep,
-            float_format=float_format,
-            decimal=decimal,
-            quoting=quoting,
-            fixed_width=False,
-        )
-        res = formatter.get_result_as_array()
-        res = res.astype(object, copy=False)
-        return res
-
-    elif isinstance(values, ExtensionArray):
-        mask = isna(values)
-
-        new_values = np.asarray(values.astype(object))
-        new_values[mask] = na_rep
-        return new_values
-
-    else:
-        mask = isna(values)
-        itemsize = writers.word_len(na_rep)
-
-        if values.dtype != _dtype_obj and not quoting and itemsize:
-            values = values.astype(str)
-            if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize:
-                # enlarge for the na_rep
-                values = values.astype(f"<U{itemsize}")
-        else:
-            values = np.array(values, dtype="object")
-
-        values[mask] = na_rep
-        values = values.astype(object, copy=False)
-        return values
-
-
 def external_values(values: ArrayLike) -> ArrayLike:
     """
     The array that Series.values returns (public attribute).
diff --git a/pandas/io/formats/csvs.py
index 717dae6eea97c..50503e862ef43 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -44,6 +44,7 @@
     IndexLabel,
     StorageOptions,
     WriteBuffer,
+    npt,
 )
 
 from pandas.io.formats.format import DataFrameFormatter
@@ -53,7 +54,7 @@
 
 
 class CSVFormatter:
-    cols: np.ndarray
+    cols: npt.NDArray[np.object_]
 
     def __init__(
         self,
@@ -149,7 +150,9 @@ def _initialize_quotechar(self, quotechar: str | None) -> str | None:
     def has_mi_columns(self) -> bool:
         return bool(isinstance(self.obj.columns, ABCMultiIndex))
 
-    def _initialize_columns(self, cols: Iterable[Hashable] | None) -> np.ndarray:
+    def _initialize_columns(
+        self, cols: Iterable[Hashable] | None
+    ) -> npt.NDArray[np.object_]:
         # validate mi options
         if self.has_mi_columns:
             if cols is not None:
@@ -158,7 +161,7 @@ def _initialize_columns(self, cols: Iterable[Hashable] | None) -> np.ndarray:
 
         if cols is not None:
             if isinstance(cols, ABCIndex):
-                cols = cols._format_native_types(**self._number_format)
+                cols = cols._get_values_for_csv(**self._number_format)
             else:
                 cols = list(cols)
             self.obj = self.obj.loc[:, cols]
@@ -166,7 +169,7 @@ def _initialize_columns(self, cols: Iterable[Hashable] | None) -> np.ndarray:
         # update columns to include possible multiplicity of dupes
         # and make sure cols is just a list of labels
         new_cols = self.obj.columns
-        return new_cols._format_native_types(**self._number_format)
+        return new_cols._get_values_for_csv(**self._number_format)
 
     def _initialize_chunksize(self, chunksize: int | None) -> int:
         if chunksize is None:
@@ -223,7 +226,7 @@ def write_cols(self) -> SequenceNotStr[Hashable]:
                 )
             return self.header
         else:
-            # self.cols is an ndarray derived from Index._format_native_types,
+            # self.cols is an ndarray derived from Index._get_values_for_csv,
             # so its entries are strings, i.e. hashable
             return cast(SequenceNotStr[Hashable], self.cols)
 
@@ -317,7 +320,7 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
         res = df._get_values_for_csv(**self._number_format)
         data = list(res._iter_column_arrays())
 
-        ix = self.data_index[slicer]._format_native_types(**self._number_format)
+        ix = self.data_index[slicer]._get_values_for_csv(**self._number_format)
         libwriters.write_csv_rows(
             data,
             ix,
diff --git a/pandas/io/formats/format.py
index bb976b3a0208e..c87b8261916f2 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -1105,7 +1105,7 @@ def format_array(
     the leading space to pad between columns.
 
     When formatting an Index subclass
-    (e.g. IntervalIndex._format_native_types), we don't want the
+    (e.g. 
IntervalIndex._format_native_types), we don't want the + (e.g. IntervalIndex._get_values_for_csv), we don't want the leading space since it should be left-aligned. fallback_formatter diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 9fb5db9e034ee..caeb7fcb86f49 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -13,38 +13,38 @@ import pandas._testing as tm -def test_format_native_types(): +def test_get_values_for_csv(): index = pd.date_range(freq="1D", periods=3, start="2017-01-01") # First, with no arguments. expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype=object) - result = index._format_native_types() + result = index._get_values_for_csv() tm.assert_numpy_array_equal(result, expected) # No NaN values, so na_rep has no effect - result = index._format_native_types(na_rep="pandas") + result = index._get_values_for_csv(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) # Make sure date formatting works expected = np.array(["01-2017-01", "01-2017-02", "01-2017-03"], dtype=object) - result = index._format_native_types(date_format="%m-%Y-%d") + result = index._get_values_for_csv(date_format="%m-%Y-%d") tm.assert_numpy_array_equal(result, expected) # NULL object handling should work index = DatetimeIndex(["2017-01-01", pd.NaT, "2017-01-03"]) expected = np.array(["2017-01-01", "NaT", "2017-01-03"], dtype=object) - result = index._format_native_types() + result = index._get_values_for_csv(na_rep="NaT") tm.assert_numpy_array_equal(result, expected) expected = np.array(["2017-01-01", "pandas", "2017-01-03"], dtype=object) - result = index._format_native_types(na_rep="pandas") + result = index._get_values_for_csv(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) - result = index._format_native_types(date_format="%Y-%m-%d %H:%M:%S.%f") + result = index._get_values_for_csv(na_rep="NaT", date_format="%Y-%m-%d %H:%M:%S.%f") expected = np.array( ["2017-01-01 00:00:00.000000", "NaT", "2017-01-03 00:00:00.000000"], dtype=object, @@ -52,7 +52,7 @@ def test_format_native_types(): tm.assert_numpy_array_equal(result, expected) # invalid format - result = index._format_native_types(date_format="foo") + result = index._get_values_for_csv(na_rep="NaT", date_format="foo") expected = np.array(["foo", "NaT", "foo"], dtype=object) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index acb330c190d6f..5b509edc9ff88 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -104,7 +104,7 @@ def test_repr_floats(self): def test_to_native_types(self, tuples, closed, expected_data): # GH 28210 index = IntervalIndex.from_tuples(tuples, closed=closed) - result = index._format_native_types(na_rep="NaN") + result = index._get_values_for_csv(na_rep="NaN") expected = np.array(expected_data) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 9441f56a75f03..7245c6a7116fc 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -15,29 +15,29 @@ def test_to_native_types(): # First, with no arguments. 
expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype=object) - result = index._format_native_types() + result = index._get_values_for_csv() tm.assert_numpy_array_equal(result, expected) # No NaN values, so na_rep has no effect - result = index._format_native_types(na_rep="pandas") + result = index._get_values_for_csv(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) # Make sure date formatting works expected = np.array(["01-2017-01", "01-2017-02", "01-2017-03"], dtype=object) - result = index._format_native_types(date_format="%m-%Y-%d") + result = index._get_values_for_csv(date_format="%m-%Y-%d") tm.assert_numpy_array_equal(result, expected) # NULL object handling should work index = PeriodIndex(["2017-01-01", pd.NaT, "2017-01-03"], freq="D") expected = np.array(["2017-01-01", "NaT", "2017-01-03"], dtype=object) - result = index._format_native_types() + result = index._get_values_for_csv(na_rep="NaT") tm.assert_numpy_array_equal(result, expected) expected = np.array(["2017-01-01", "pandas", "2017-01-03"], dtype=object) - result = index._format_native_types(na_rep="pandas") + result = index._get_values_for_csv(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) From 2f3b0eda7231f33131c5fabf065483e61ac1ea59 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 13 Oct 2023 09:57:40 -0700 Subject: [PATCH 13/19] DEPR: Index.format (#55439) * REF: implement Index._format_flat, _format_multi * de-duplicate, change keyword * DEPR: Index.format * add formatter kwd * Post-merge fixup --- doc/source/whatsnew/v2.2.0.rst | 2 + pandas/core/indexes/base.py | 33 +++++++++ pandas/core/indexes/datetimelike.py | 11 +++ pandas/core/indexes/multi.py | 72 +++++++++++++++++++ pandas/io/formats/excel.py | 8 +-- pandas/io/formats/format.py | 23 +++--- pandas/io/formats/html.py | 14 ++-- pandas/io/formats/style_render.py | 4 +- .../tests/indexes/base_class/test_formats.py | 9 ++- .../tests/indexes/categorical/test_formats.py | 5 +- .../indexes/datetimes/test_datetimelike.py | 5 +- .../tests/indexes/datetimes/test_formats.py | 8 ++- pandas/tests/indexes/multi/test_formats.py | 20 ++++-- pandas/tests/indexes/period/test_period.py | 7 +- pandas/tests/indexes/ranges/test_range.py | 11 ++- pandas/tests/indexes/test_base.py | 12 +++- pandas/tests/indexes/test_old_base.py | 15 ++-- pandas/tests/io/excel/test_writers.py | 2 +- pandas/tests/io/formats/test_format.py | 57 ++++++++++----- pandas/tests/series/test_repr.py | 4 +- 20 files changed, 259 insertions(+), 63 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index e495c3c204de5..93ca2541d7ecd 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -234,6 +234,7 @@ For example: Other Deprecations ^^^^^^^^^^^^^^^^^^ - Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) +- Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict`. 
(:issue:`54229`) @@ -261,6 +262,7 @@ Other Deprecations - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecating downcasting the results of :meth:`DataFrame.fillna`, :meth:`Series.fillna`, :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, :meth:`Series.bfill` in object-dtype cases. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54261`) +- .. --------------------------------------------------------------------------- .. _whatsnew_220.performance: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f53c5606db4d3..252e88d7c7d51 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1375,6 +1375,14 @@ def format( """ Render a string representation of the Index. """ + warnings.warn( + # GH#55413 + f"{type(self).__name__}.format is deprecated and will be removed " + "in a future version. Convert using index.astype(str) or " + "index.map(formatter) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) header = [] if name: header.append( @@ -1388,6 +1396,31 @@ def format( return self._format_with_header(header=header, na_rep=na_rep) + _default_na_rep = "NaN" + + @final + def _format_flat( + self, + *, + include_name: bool, + formatter: Callable | None = None, + ) -> list[str_t]: + """ + Render a string representation of the Index. + """ + header = [] + if include_name: + header.append( + pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) + if self.name is not None + else "" + ) + + if formatter is not None: + return header + list(self.map(formatter)) + + return self._format_with_header(header=header, na_rep=self._default_na_rep) + def _format_with_header(self, *, header: list[str_t], na_rep: str_t) -> list[str_t]: from pandas.io.formats.format import format_array diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 9ab3f26f88152..a3e6c50b21642 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -14,6 +14,7 @@ cast, final, ) +import warnings import numpy as np @@ -42,6 +43,7 @@ cache_readonly, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_integer, @@ -187,6 +189,7 @@ def _convert_tolerance(self, tolerance, target): # -------------------------------------------------------------------- # Rendering Methods + _default_na_rep = "NaT" def format( self, @@ -198,6 +201,14 @@ def format( """ Render a string representation of the Index. """ + warnings.warn( + # GH#55413 + f"{type(self).__name__}.format is deprecated and will be removed " + "in a future version. Convert using index.astype(str) or " + "index.map(formatter) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) header = [] if name: header.append( diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index d88eb3f18a3bd..86693f241ddb1 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1430,6 +1430,15 @@ def format( sparsify=None, adjoin: bool = True, ) -> list: + warnings.warn( + # GH#55413 + f"{type(self).__name__}.format is deprecated and will be removed " + "in a future version. 
Convert using index.astype(str) or " + "index.map(formatter) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if name is not None: names = name @@ -1492,6 +1501,69 @@ def format( else: return result_levels + def _format_multi( + self, + *, + include_names: bool, + sparsify: bool | None | lib.NoDefault, + formatter: Callable | None = None, + ) -> list: + if len(self) == 0: + return [] + + stringified_levels = [] + for lev, level_codes in zip(self.levels, self.codes): + na = _get_na_rep(lev.dtype) + + if len(lev) > 0: + taken = formatted = lev.take(level_codes) + formatted = taken._format_flat(include_name=False, formatter=formatter) + + # we have some NA + mask = level_codes == -1 + if mask.any(): + formatted = np.array(formatted, dtype=object) + formatted[mask] = na + formatted = formatted.tolist() + + else: + # weird all NA case + formatted = [ + pprint_thing(na if isna(x) else x, escape_chars=("\t", "\r", "\n")) + for x in algos.take_nd(lev._values, level_codes) + ] + stringified_levels.append(formatted) + + result_levels = [] + for lev, lev_name in zip(stringified_levels, self.names): + level = [] + + if include_names: + level.append( + pprint_thing(lev_name, escape_chars=("\t", "\r", "\n")) + if lev_name is not None + else "" + ) + + level.extend(np.array(lev, dtype=object)) + result_levels.append(level) + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + if sparsify: + sentinel: Literal[""] | bool | lib.NoDefault = "" + # GH3547 use value of sparsify as sentinel if it's "Falsey" + assert isinstance(sparsify, bool) or sparsify is lib.no_default + if sparsify is lib.no_default: + sentinel = sparsify + # little bit of a kludge job for #1217 + result_levels = sparsify_labels( + result_levels, start=int(include_names), sentinel=sentinel + ) + + return result_levels + # -------------------------------------------------------------------- # Names Methods diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index b344d9849f16c..684cd4340cd2b 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -623,8 +623,8 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: return columns = self.columns - level_strs = columns.format( - sparsify=self.merge_cells, adjoin=False, names=False + level_strs = columns._format_multi( + sparsify=self.merge_cells, include_names=False ) level_lengths = get_level_lengths(level_strs) coloffset = 0 @@ -813,8 +813,8 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: if self.merge_cells: # Format hierarchical rows as merged cells. 
- level_strs = self.df.index.format( - sparsify=True, adjoin=False, names=False + level_strs = self.df.index._format_multi( + sparsify=True, include_names=False ) level_lengths = get_level_lengths(level_strs) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index c87b8261916f2..1a7e4d7a80e13 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -313,8 +313,14 @@ def to_string(self) -> str: if len(series) == 0: return f"{type(self.series).__name__}([], {footer})" - have_header = _has_names(series.index) - fmt_index = self.tr_series.index.format(name=True) + index = series.index + have_header = _has_names(index) + if isinstance(index, MultiIndex): + fmt_index = index._format_multi(include_names=True, sparsify=None) + adj = printing.get_adjustment() + fmt_index = adj.adjoin(2, *fmt_index).split("\n") + else: + fmt_index = index._format_flat(include_name=True) fmt_values = self._get_formatted_values() if self.is_truncated_vertically: @@ -777,7 +783,7 @@ def _get_formatted_column_labels(self, frame: DataFrame) -> list[list[str]]: columns = frame.columns if isinstance(columns, MultiIndex): - fmt_columns = columns.format(sparsify=False, adjoin=False) + fmt_columns = columns._format_multi(sparsify=False, include_names=False) fmt_columns = list(zip(*fmt_columns)) dtypes = self.frame.dtypes._values @@ -802,7 +808,7 @@ def space_format(x, y): str_columns = [list(x) for x in zip(*str_columns)] else: - fmt_columns = columns.format() + fmt_columns = columns._format_flat(include_name=False) dtypes = self.frame.dtypes need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) str_columns = [ @@ -821,14 +827,15 @@ def _get_formatted_index(self, frame: DataFrame) -> list[str]: fmt = self._get_formatter("__index__") if isinstance(index, MultiIndex): - fmt_index = index.format( + fmt_index = index._format_multi( sparsify=self.sparsify, - adjoin=False, - names=self.show_row_idx_names, + include_names=self.show_row_idx_names, formatter=fmt, ) else: - fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)] + fmt_index = [ + index._format_flat(include_name=self.show_row_idx_names, formatter=fmt) + ] fmt_index = [ tuple( diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index b1a3504d46b27..794ce77b3b45e 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -282,7 +282,7 @@ def _write_col_header(self, indent: int) -> None: sentinel = lib.no_default else: sentinel = False - levels = self.columns.format(sparsify=sentinel, adjoin=False, names=False) + levels = self.columns._format_multi(sparsify=sentinel, include_names=False) level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 for lnum, (records, values) in enumerate(zip(level_lengths, levels)): @@ -437,7 +437,8 @@ def _write_regular_rows( if fmt is not None: index_values = self.fmt.tr_frame.index.map(fmt) else: - index_values = self.fmt.tr_frame.index.format() + # only reached with non-Multi index + index_values = self.fmt.tr_frame.index._format_flat(include_name=False) row: list[str] = [] for i in range(nrows): @@ -480,13 +481,13 @@ def _write_hierarchical_rows( nrows = len(frame) assert isinstance(frame.index, MultiIndex) - idx_values = frame.index.format(sparsify=False, adjoin=False, names=False) + idx_values = frame.index._format_multi(sparsify=False, include_names=False) idx_values = list(zip(*idx_values)) if self.fmt.sparsify: # GH3547 sentinel = lib.no_default - levels = frame.index.format(sparsify=sentinel, 
adjoin=False, names=False) + levels = frame.index._format_multi(sparsify=sentinel, include_names=False) level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 @@ -579,7 +580,7 @@ def _write_hierarchical_rows( ) idx_values = list( - zip(*frame.index.format(sparsify=False, adjoin=False, names=False)) + zip(*frame.index._format_multi(sparsify=False, include_names=False)) ) row = [] row.extend(idx_values[i]) @@ -606,7 +607,8 @@ def _get_formatted_values(self) -> dict[int, list[str]]: return {i: self.fmt.format_col(i) for i in range(self.ncols)} def _get_columns_formatted_values(self) -> list[str]: - return self.columns.format() + # only reached with non-Multi Index + return self.columns._format_flat(include_name=False) def write_style(self) -> None: # We use the "scoped" attribute here so that the desired diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 829ed4a33f6a4..416b263ba8497 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -1652,9 +1652,9 @@ def _get_level_lengths( Result is a dictionary of (level, initial_position): span """ if isinstance(index, MultiIndex): - levels = index.format(sparsify=lib.no_default, adjoin=False) + levels = index._format_multi(sparsify=lib.no_default, include_names=False) else: - levels = index.format() + levels = index._format_flat(include_name=False) if hidden_elements is None: hidden_elements = [] diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index 9053d45dee623..20f94010f56f8 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -4,6 +4,7 @@ import pandas._config.config as cf from pandas import Index +import pandas._testing as tm class TestIndexRendering: @@ -133,7 +134,9 @@ def test_summary_bug(self): def test_index_repr_bool_nan(self): # GH32146 arr = Index([True, False, np.nan], dtype=object) - exp1 = arr.format() + msg = "Index.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + exp1 = arr.format() out1 = ["True", "False", "NaN"] assert out1 == exp1 @@ -145,4 +148,6 @@ def test_format_different_scalar_lengths(self): # GH#35439 idx = Index(["aaaaaaaaa", "b"]) expected = ["aaaaaaaaa", "b"] - assert idx.format() == expected + msg = r"Index\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert idx.format() == expected diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index 7dbcaaa8d4ba6..ea3e4ce213e67 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -4,6 +4,7 @@ import pandas._config.config as cf from pandas import CategoricalIndex +import pandas._testing as tm class TestCategoricalIndexRepr: @@ -11,7 +12,9 @@ def test_format_different_scalar_lengths(self): # GH#35439 idx = CategoricalIndex(["aaaaaaaaa", "b"]) expected = ["aaaaaaaaa", "b"] - assert idx.format() == expected + msg = r"CategoricalIndex\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert idx.format() == expected def test_string_categorical_index_repr(self): # short diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index a6bee20d3d3ec..a012a2985b41c 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ 
b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -1,5 +1,6 @@ """ generic tests from the Datetimelike class """ from pandas import date_range +import pandas._testing as tm class TestDatetimeIndex: @@ -7,4 +8,6 @@ def test_format(self): # GH35439 idx = date_range("20130101", periods=5) expected = [f"{x:%Y-%m-%d}" for x in idx] - assert idx.format() == expected + msg = r"DatetimeIndex\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert idx.format() == expected diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index caeb7fcb86f49..6f75ac1b569c0 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -285,13 +285,17 @@ def test_format_with_name_time_info(self): # bug I fixed 12/20/2011 dates = pd.date_range("2011-01-01 04:00:00", periods=10, name="something") - formatted = dates.format(name=True) + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dates.format(name=True) assert formatted[0] == "something" def test_format_datetime_with_time(self): dti = DatetimeIndex([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)]) - result = dti.format() + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = dti.format() expected = ["2012-02-07 00:00:00", "2012-02-07 23:00:00"] assert len(result) == 2 assert result == expected diff --git a/pandas/tests/indexes/multi/test_formats.py b/pandas/tests/indexes/multi/test_formats.py index 011f61fac90e8..bbe94824eefa1 100644 --- a/pandas/tests/indexes/multi/test_formats.py +++ b/pandas/tests/indexes/multi/test_formats.py @@ -6,24 +6,31 @@ Index, MultiIndex, ) +import pandas._testing as tm def test_format(idx): - idx.format() - idx[:0].format() + msg = "MultiIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx.format() + idx[:0].format() def test_format_integer_names(): index = MultiIndex( levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1] ) - index.format(names=True) + msg = "MultiIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + index.format(names=True) def test_format_sparse_config(idx): # GH1538 + msg = "MultiIndex.format is deprecated" with pd.option_context("display.multi_sparse", False): - result = idx.format() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = idx.format() assert result[1] == "foo two" @@ -37,8 +44,9 @@ def test_format_sparse_display(): [0, 0, 0, 0, 0, 0], ], ) - - result = index.format() + msg = "MultiIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = index.format() assert result[3] == "1 0 0 0" diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 22bb63d67f57f..1bb8d66332cd0 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -275,8 +275,11 @@ def test_map(self): def test_format_empty(self): # GH35712 empty_idx = PeriodIndex([], freq="Y") - assert empty_idx.format() == [] - assert empty_idx.format(name=True) == [""] + msg = r"PeriodIndex\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert empty_idx.format() == [] + with tm.assert_produces_warning(FutureWarning, match=msg): + assert empty_idx.format(name=True) == [""] def 
test_period_index_frequency_ME_error_message(self): msg = "Invalid frequency: 2ME" diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 132704434829e..95756b04bca69 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -240,7 +240,9 @@ def test_cache(self): pass assert idx._cache == {} - idx.format() + msg = "RangeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx.format() assert idx._cache == {} df = pd.DataFrame({"a": range(10)}, index=idx) @@ -566,8 +568,11 @@ def test_engineless_lookup(self): def test_format_empty(self): # GH35712 empty_idx = RangeIndex(0) - assert empty_idx.format() == [] - assert empty_idx.format(name=True) == [""] + msg = r"RangeIndex\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert empty_idx.format() == [] + with tm.assert_produces_warning(FutureWarning, match=msg): + assert empty_idx.format(name=True) == [""] @pytest.mark.parametrize( "ri", diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 6afab569797f2..04ab2020b4c7a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -666,13 +666,17 @@ def test_format_bug(self): # include us since the default for Timestamp shows these but Index # formatting does not we are skipping) now = datetime.now() + msg = r"Index\.format is deprecated" + if not str(now).endswith("000"): index = Index([now]) - formatted = index.format() + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = index.format() expected = [str(index[0])] assert formatted == expected - Index([]).format() + with tm.assert_produces_warning(FutureWarning, match=msg): + Index([]).format() @pytest.mark.parametrize("vals", [[1, 2.0 + 3.0j, 4.0], ["a", "b", "c"]]) def test_format_missing(self, vals, nulls_fixture): @@ -682,7 +686,9 @@ def test_format_missing(self, vals, nulls_fixture): index = Index(vals, dtype=object) # TODO: case with complex dtype? 
- formatted = index.format() + msg = r"Index\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = index.format() null_repr = "NaN" if isinstance(nulls_fixture, float) else str(nulls_fixture) expected = [str(index[0]), str(index[1]), str(index[2]), null_repr] diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 79dc423f12a85..32adbc693390b 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -559,15 +559,20 @@ def test_format(self, simple_index): pytest.skip("Tested elsewhere.") idx = simple_index expected = [str(x) for x in idx] - assert idx.format() == expected + msg = r"Index\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert idx.format() == expected def test_format_empty(self, simple_index): # GH35712 if isinstance(simple_index, (PeriodIndex, RangeIndex)): pytest.skip("Tested elsewhere") empty_idx = type(simple_index)([]) - assert empty_idx.format() == [] - assert empty_idx.format(name=True) == [""] + msg = r"Index\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert empty_idx.format() == [] + with tm.assert_produces_warning(FutureWarning, match=msg): + assert empty_idx.format(name=True) == [""] def test_fillna(self, index): # GH 11343 @@ -955,7 +960,9 @@ def test_format(self, simple_index): idx = simple_index max_width = max(len(str(x)) for x in idx) expected = [str(x).ljust(max_width) for x in idx] - assert idx.format() == expected + msg = r"Index\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert idx.format() == expected def test_insert_non_na(self, simple_index): # GH#43921 inserting an element that we know we can hold should diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index e4ce969daab53..18af18ade85f4 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -809,7 +809,7 @@ def test_to_excel_multiindex_cols(self, merge_cells, frame, path): reader, sheet_name="test1", header=header, index_col=[0, 1] ) if not merge_cells: - fm = frame.columns.format(sparsify=False, adjoin=False, names=False) + fm = frame.columns._format_multi(sparsify=False, include_names=False) frame.columns = [".".join(map(str, q)) for q in zip(*fm)] tm.assert_frame_equal(frame, df) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 57f1f082708ae..d18c333d79244 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3333,7 +3333,9 @@ def test_period_format_and_strftime_default(self): per = pd.PeriodIndex([datetime(2003, 1, 1, 12), None], freq="h") # Default formatting - formatted = per.format() + msg = "PeriodIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = per.format() assert formatted[0] == "2003-01-01 12:00" # default: minutes not shown assert formatted[1] == "NaT" # format is equivalent to strftime(None)... 
@@ -3342,35 +3344,40 @@ def test_period_format_and_strftime_default(self): # Same test with nanoseconds freq per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="ns") - formatted = per.format() + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = per.format() assert (formatted == per.strftime(None)).all() assert formatted[0] == "2003-01-01 12:01:01.123456789" assert formatted[1] == "2003-01-01 12:01:01.123456790" def test_period_custom(self): # GH#46252 custom formatting directives %l (ms) and %u (us) + msg = "PeriodIndex.format is deprecated" # 3 digits per = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="ms") - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") assert formatted[0] == "03 12:01:01 (ms=123 us=123000 ns=123000000)" assert formatted[1] == "03 12:01:01 (ms=124 us=124000 ns=124000000)" # 6 digits per = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="us") - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456000)" assert formatted[1] == "03 12:01:01 (ms=123 us=123457 ns=123457000)" # 9 digits per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="ns") - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456789)" assert formatted[1] == "03 12:01:01 (ms=123 us=123456 ns=123456790)" def test_period_tz(self): # Formatting periods created from a datetime with timezone. 
- + msg = r"PeriodIndex\.format is deprecated" # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) @@ -3378,13 +3385,15 @@ def test_period_tz(self): # Since tz is currently set as utc, we'll see 2012 with tm.assert_produces_warning(UserWarning, match="will drop timezone"): per = dt.to_period(freq="h") - assert per.format()[0] == "2012-12-31 23:00" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert per.format()[0] == "2012-12-31 23:00" # If tz is currently set as paris before conversion, we'll see 2013 dt = dt.tz_convert("Europe/Paris") with tm.assert_produces_warning(UserWarning, match="will drop timezone"): per = dt.to_period(freq="h") - assert per.format()[0] == "2013-01-01 00:00" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert per.format()[0] == "2013-01-01 00:00" @pytest.mark.parametrize( "locale_str", @@ -3411,7 +3420,9 @@ def test_period_non_ascii_fmt(self, locale_str): # Index per = pd.period_range("2003-01-01 01:00:00", periods=2, freq="12h") - formatted = per.format(date_format="%y é") + msg = "PeriodIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = per.format(date_format="%y é") assert formatted[0] == "03 é" assert formatted[1] == "03 é" @@ -3443,33 +3454,45 @@ def test_period_custom_locale_directive(self, locale_str): # Index per = pd.period_range("2003-01-01 01:00:00", periods=2, freq="12h") - formatted = per.format(date_format="%y %I:%M:%S%p") + msg = "PeriodIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = per.format(date_format="%y %I:%M:%S%p") assert formatted[0] == f"03 01:00:00{am_local}" assert formatted[1] == f"03 01:00:00{pm_local}" class TestDatetimeIndexFormat: def test_datetime(self): - formatted = pd.to_datetime([datetime(2003, 1, 1, 12), NaT]).format() + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = pd.to_datetime([datetime(2003, 1, 1, 12), NaT]).format() assert formatted[0] == "2003-01-01 12:00:00" assert formatted[1] == "NaT" def test_date(self): - formatted = pd.to_datetime([datetime(2003, 1, 1), NaT]).format() + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = pd.to_datetime([datetime(2003, 1, 1), NaT]).format() assert formatted[0] == "2003-01-01" assert formatted[1] == "NaT" def test_date_tz(self): - formatted = pd.to_datetime([datetime(2013, 1, 1)], utc=True).format() + dti = pd.to_datetime([datetime(2013, 1, 1)], utc=True) + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dti.format() assert formatted[0] == "2013-01-01 00:00:00+00:00" - formatted = pd.to_datetime([datetime(2013, 1, 1), NaT], utc=True).format() + dti = pd.to_datetime([datetime(2013, 1, 1), NaT], utc=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dti.format() assert formatted[0] == "2013-01-01 00:00:00+00:00" def test_date_explicit_date_format(self): - formatted = pd.to_datetime([datetime(2003, 2, 1), NaT]).format( - date_format="%m-%d-%Y", na_rep="UT" - ) + dti = pd.to_datetime([datetime(2003, 2, 1), NaT]) + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dti.format(date_format="%m-%d-%Y", na_rep="UT") assert formatted[0] == "02-01-2003" assert formatted[1] == "UT" 
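As a quick illustration of the migration this deprecation asks for, independent of the test changes above (a minimal sketch, not part of the patch; the strftime pattern is only an example):

    import pandas as pd

    idx = pd.date_range("2017-01-01", periods=3)

    # before: idx.format()  -- now emits a FutureWarning
    labels = idx.astype(str).tolist()

    # before: idx.format(formatter=f)  -- use Index.map instead
    custom = idx.map(lambda ts: ts.strftime("%m-%Y-%d")).tolist()
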
diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 86474a38d29fb..86addb9dadfad 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -258,7 +258,9 @@ def test_index_repr_in_frame_with_nan(self): def test_format_pre_1900_dates(self): rng = date_range("1/1/1850", "1/1/1950", freq="Y-DEC") - rng.format() + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + rng.format() ts = Series(1, index=rng) repr(ts) From b5b8be04b2a63fc02e89650a740779e718d7a99b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 13 Oct 2023 11:41:56 -0700 Subject: [PATCH 14/19] REF: de-duplicate some Timedelta helpers (#55501) * REF: implement disallow_ambiguous_unit * REF: implement get_unit_for_round --- pandas/_libs/tslibs/timedeltas.pxd | 1 + pandas/_libs/tslibs/timedeltas.pyi | 2 ++ pandas/_libs/tslibs/timedeltas.pyx | 27 ++++++++++++++++++--------- pandas/_libs/tslibs/timestamps.pyx | 7 ++----- pandas/core/arrays/datetimelike.py | 6 ++---- pandas/core/indexes/timedeltas.py | 7 ++----- pandas/core/tools/timedeltas.py | 8 ++------ 7 files changed, 29 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pxd b/pandas/_libs/tslibs/timedeltas.pxd index fb6e29a8932a1..f3473e46b6699 100644 --- a/pandas/_libs/tslibs/timedeltas.pxd +++ b/pandas/_libs/tslibs/timedeltas.pxd @@ -4,6 +4,7 @@ from numpy cimport int64_t from .np_datetime cimport NPY_DATETIMEUNIT +cpdef int64_t get_unit_for_round(freq, NPY_DATETIMEUNIT creso) except? -1 # Exposed for tslib, not intended for outside use. cpdef int64_t delta_to_nanoseconds( delta, NPY_DATETIMEUNIT reso=*, bint round_ok=* diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index 6d993722ce1d4..181703c5f55b2 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -68,6 +68,8 @@ UnitChoices: TypeAlias = Literal[ ] _S = TypeVar("_S", bound=timedelta) +def get_unit_for_round(freq, creso: int) -> int: ... +def disallow_ambiguous_unit(unit: str | None) -> None: ... def ints_to_pytimedelta( arr: npt.NDArray[np.timedelta64], box: bool = ..., diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index a573d9a8ed0c0..5e124b89eab5e 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -827,6 +827,14 @@ def _binary_op_method_timedeltalike(op, name): # ---------------------------------------------------------------------- # Timedelta Construction +cpdef disallow_ambiguous_unit(unit): + if unit in {"Y", "y", "M"}: + raise ValueError( + "Units 'M', 'Y', and 'y' are no longer supported, as they do not " + "represent unambiguous timedelta values durations." + ) + + cdef int64_t parse_iso_format_string(str ts) except? -1: """ Extracts and cleanses the appropriate values from a match object with @@ -1815,11 +1823,7 @@ class Timedelta(_Timedelta): ) raise OutOfBoundsTimedelta(msg) from err - if unit in {"Y", "y", "M"}: - raise ValueError( - "Units 'M', 'Y', and 'y' are no longer supported, as they do not " - "represent unambiguous timedelta values durations." 
- ) + disallow_ambiguous_unit(unit) # GH 30543 if pd.Timedelta already passed, return it # check that only value is passed @@ -1932,10 +1936,7 @@ class Timedelta(_Timedelta): int64_t result, unit ndarray[int64_t] arr - from pandas._libs.tslibs.offsets import to_offset - - to_offset(freq).nanos # raises on non-fixed freq - unit = delta_to_nanoseconds(to_offset(freq), self._creso) + unit = get_unit_for_round(freq, self._creso) arr = np.array([self._value], dtype="i8") try: @@ -2292,3 +2293,11 @@ cdef bint _should_cast_to_timedelta(object obj): return ( is_any_td_scalar(obj) or obj is None or obj is NaT or isinstance(obj, str) ) + + +cpdef int64_t get_unit_for_round(freq, NPY_DATETIMEUNIT creso) except? -1: + from pandas._libs.tslibs.offsets import to_offset + + freq = to_offset(freq) + freq.nanos # raises on non-fixed freq + return delta_to_nanoseconds(freq, creso) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index edd061fd8cdf1..66b1cec63e9e9 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -107,7 +107,7 @@ from pandas._libs.tslibs.np_datetime import ( from pandas._libs.tslibs.offsets cimport to_offset from pandas._libs.tslibs.timedeltas cimport ( _Timedelta, - delta_to_nanoseconds, + get_unit_for_round, is_any_td_scalar, ) @@ -1896,8 +1896,7 @@ class Timestamp(_Timestamp): int64_t nanos freq = to_offset(freq, is_period=False) - freq.nanos # raises on non-fixed freq - nanos = delta_to_nanoseconds(freq, self._creso) + nanos = get_unit_for_round(freq, self._creso) if nanos == 0: if freq.nanos == 0: raise ValueError("Division by zero in rounding") @@ -1905,8 +1904,6 @@ class Timestamp(_Timestamp): # e.g. self.unit == "s" and sub-second freq return self - # TODO: problem if nanos==0 - if self.tz is not None: value = self.tz_localize(None)._value else: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index fd51303ebd55f..22ffaceeff1bb 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -36,7 +36,6 @@ Timedelta, Timestamp, astype_overflowsafe, - delta_to_nanoseconds, get_unit_from_dtype, iNaT, ints_to_pydatetime, @@ -49,6 +48,7 @@ round_nsint64, ) from pandas._libs.tslibs.np_datetime import compare_mismatched_resolutions +from pandas._libs.tslibs.timedeltas import get_unit_for_round from pandas._libs.tslibs.timestamps import integer_op_not_supported from pandas._typing import ( ArrayLike, @@ -2129,9 +2129,7 @@ def _round(self, freq, mode, ambiguous, nonexistent): values = self.view("i8") values = cast(np.ndarray, values) - offset = to_offset(freq) - offset.nanos # raises on non-fixed frequencies - nanos = delta_to_nanoseconds(offset, self._creso) + nanos = get_unit_for_round(freq, self._creso) if nanos == 0: # GH 52761 return self.copy() diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index b1d8d0efb60e8..9d8ef5b0a69cd 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -13,6 +13,7 @@ Timedelta, to_offset, ) +from pandas._libs.tslibs.timedeltas import disallow_ambiguous_unit from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( @@ -170,11 +171,7 @@ def __new__( if is_scalar(data): cls._raise_scalar_data_error(data) - if unit in {"Y", "y", "M"}: - raise ValueError( - "Units 'M', 'Y', and 'y' are no longer supported, as they do not " - "represent unambiguous timedelta values durations." 
-        )
 
     if arg is None:
         return arg
     elif isinstance(arg, ABCSeries):

From 0021d241c6aa1b8db91151361b48d7864201fd01 Mon Sep 17 00:00:00 2001
From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com>
Date: Sat, 14 Oct 2023 09:58:45 +0200
Subject: [PATCH 15/19] DEPR: rename ‘BM’/‘CBM’ to ‘BME’/‘CBME’ for offsets
 (#55496)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* rename BM, CBM to BME, CBME for offsets
* fix tests
* correct docs
* add tests, add notes to 2.2.0 whatsnew
---
 doc/source/user_guide/timeseries.rst             | 16 ++++++++--------
 doc/source/whatsnew/v2.2.0.rst                   |  6 +++++-
 pandas/_libs/tslibs/dtypes.pyx                   |  4 +++-
 pandas/_libs/tslibs/offsets.pyx                  | 10 +++++-----
 pandas/core/generic.py                           |  4 ++--
 pandas/core/resample.py                          |  4 ++--
 pandas/tests/frame/methods/test_asfreq.py        | 10 +++++-----
 .../indexes/datetimes/methods/test_shift.py      |  2 +-
 .../indexes/datetimes/methods/test_to_period.py  |  2 +-
 pandas/tests/indexes/datetimes/test_datetime.py  | 14 ++++++++++++++
 pandas/tests/indexes/datetimes/test_misc.py      |  2 +-
 .../tests/indexes/multi/test_get_level_values.py |  2 +-
 pandas/tests/plotting/test_datetimelike.py       |  2 +-
 pandas/tests/resample/test_datetime_index.py     | 15 +++++++++++++--
 .../tests/tseries/frequencies/test_inference.py  |  2 +-
 pandas/tests/tseries/offsets/test_offsets.py     |  8 ++++----
 pandas/tseries/frequencies.py                    |  2 +-
 17 files changed, 68 insertions(+), 37 deletions(-)

diff --git a/doc/source/user_guide/timeseries.rst
index 9f3077e266e98..7ffbd9b2d740a 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -461,7 +461,7 @@ of those specified will not be generated:
 
 .. ipython:: python
 
-   pd.date_range(start, end, freq="BM")
+   pd.date_range(start, end, freq="BME")
 
    pd.date_range(start, end, freq="W")
 
@@ -557,7 +557,7 @@ intelligent functionality like selection, slicing, etc.
 
 .. ipython:: python
 
-   rng = pd.date_range(start, end, freq="BM")
+   rng = pd.date_range(start, end, freq="BME")
    ts = pd.Series(np.random.randn(len(rng)), index=rng)
    ts.index
    ts[:5].index
@@ -884,9 +884,9 @@ into ``freq`` keyword arguments. 
The available date offsets and associated frequ
     :class:`~pandas.tseries.offsets.LastWeekOfMonth`, ``'LWOM'``, "the x-th day of the last week of each month"
     :class:`~pandas.tseries.offsets.MonthEnd`, ``'ME'``, "calendar month end"
     :class:`~pandas.tseries.offsets.MonthBegin`, ``'MS'``, "calendar month begin"
-    :class:`~pandas.tseries.offsets.BMonthEnd` or :class:`~pandas.tseries.offsets.BusinessMonthEnd`, ``'BM'``, "business month end"
+    :class:`~pandas.tseries.offsets.BMonthEnd` or :class:`~pandas.tseries.offsets.BusinessMonthEnd`, ``'BME'``, "business month end"
     :class:`~pandas.tseries.offsets.BMonthBegin` or :class:`~pandas.tseries.offsets.BusinessMonthBegin`, ``'BMS'``, "business month begin"
-    :class:`~pandas.tseries.offsets.CBMonthEnd` or :class:`~pandas.tseries.offsets.CustomBusinessMonthEnd`, ``'CBM'``, "custom business month end"
+    :class:`~pandas.tseries.offsets.CBMonthEnd` or :class:`~pandas.tseries.offsets.CustomBusinessMonthEnd`, ``'CBME'``, "custom business month end"
     :class:`~pandas.tseries.offsets.CBMonthBegin` or :class:`~pandas.tseries.offsets.CustomBusinessMonthBegin`, ``'CBMS'``, "custom business month begin"
     :class:`~pandas.tseries.offsets.SemiMonthEnd`, ``'SM'``, "15th (or other day_of_month) and calendar month end"
     :class:`~pandas.tseries.offsets.SemiMonthBegin`, ``'SMS'``, "15th (or other day_of_month) and calendar month begin"
@@ -1248,8 +1248,8 @@ frequencies. We will refer to these aliases as *offset aliases*.
     "W", "weekly frequency"
     "ME", "month end frequency"
     "SM", "semi-month end frequency (15th and end of month)"
-    "BM", "business month end frequency"
-    "CBM", "custom business month end frequency"
+    "BME", "business month end frequency"
+    "CBME", "custom business month end frequency"
     "MS", "month start frequency"
     "SMS", "semi-month start frequency (1st and 15th)"
     "BMS", "business month start frequency"
@@ -1586,7 +1586,7 @@ rather than changing the alignment of the data and the index:
 
     ts.shift(5, freq="D")
     ts.shift(5, freq=pd.offsets.BDay())
-    ts.shift(5, freq="BM")
+    ts.shift(5, freq="BME")
 
 Note that when ``freq`` is specified, the leading entry is no longer NaN
 because the data is not being realigned.
@@ -1692,7 +1692,7 @@ the end of the interval.
 .. warning::
 
     The default value for ``label`` and ``closed`` is '**left**' for all
-    frequency offsets except for 'ME', 'Y', 'Q', 'BM', 'BY', 'BQ', and 'W'
+    frequency offsets except for 'ME', 'Y', 'Q', 'BME', 'BY', 'BQ', and 'W'
     which all have a default of 'right'.
 
     This might unintentionally lead to looking ahead, where the value for a later
diff --git a/doc/source/whatsnew/v2.2.0.rst
index 93ca2541d7ecd..ef59c86a21598 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -253,7 +253,11 @@ Other Deprecations
 - Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. 
 - Deprecated including the groups in computations when using :meth:`DataFrameGroupBy.apply` and :meth:`DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`)
 - Deprecated not passing a tuple to :class:`DataFrameGroupBy.get_group` or :class:`SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`)
-- Deprecated string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`52536`)
+- Deprecated string ``AS`` denoting frequency in :class:`YearBegin` and strings ``AS-DEC``, ``AS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`54275`)
+- Deprecated string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`54275`)
+- Deprecated string ``BAS`` denoting frequency in :class:`BYearBegin` and strings ``BAS-DEC``, ``BAS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`54275`)
+- Deprecated string ``BA`` denoting frequency in :class:`BYearEnd` and strings ``BA-DEC``, ``BA-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`54275`)
+- Deprecated strings ``BM`` and ``CBM`` denoting frequencies in :class:`BusinessMonthEnd`, :class:`CustomBusinessMonthEnd` (:issue:`52064`)
 - Deprecated strings ``H``, ``BH``, and ``CBH`` denoting frequencies in :class:`Hour`, :class:`BusinessHour`, :class:`CustomBusinessHour` (:issue:`52536`)
 - Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`)
 - Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`)
diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx
index 26181d8f15518..370917df4dca6 100644
--- a/pandas/_libs/tslibs/dtypes.pyx
+++ b/pandas/_libs/tslibs/dtypes.pyx
@@ -188,7 +188,7 @@ cdef dict _abbrev_to_attrnames = {v: k for k, v in attrname_to_abbrevs.items()}
 OFFSET_TO_PERIOD_FREQSTR: dict = {
     "WEEKDAY": "D",
     "EOM": "M",
-    "BM": "M",
+    "BME": "M",
     "BQS": "Q",
     "QS": "Q",
     "BQ": "Q",
@@ -280,6 +280,8 @@ DEPR_ABBREVS: dict[str, str]= {
     "BAS-SEP": "BYS-SEP",
     "BAS-OCT": "BYS-OCT",
     "BAS-NOV": "BYS-NOV",
+    "BM": "BME",
+    "CBM": "CBME",
     "H": "h",
     "BH": "bh",
     "CBH": "cbh",
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
index 6c5cdde20da5f..f20073766bc3a 100644
--- a/pandas/_libs/tslibs/offsets.pyx
+++ b/pandas/_libs/tslibs/offsets.pyx
@@ -2935,7 +2935,7 @@ cdef class BusinessMonthEnd(MonthOffset):
     >>> pd.offsets.BMonthEnd().rollforward(ts)
     Timestamp('2022-11-30 00:00:00')
     """
-    _prefix = "BM"
+    _prefix = "BME"
     _day_opt = "business_end"
 
 
@@ -4465,10 +4465,10 @@ cdef class CustomBusinessMonthEnd(_CustomBusinessMonth):
     >>> freq = pd.offsets.CustomBusinessMonthEnd(calendar=bdc)
     >>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 11, 10), freq=freq)
     DatetimeIndex(['2022-07-29', '2022-08-31', '2022-09-29', '2022-10-28'],
-                  dtype='datetime64[ns]', freq='CBM')
+                  dtype='datetime64[ns]', freq='CBME')
     """
 
-    _prefix = "CBM"
+    _prefix = "CBME"
 
 
 cdef class CustomBusinessMonthBegin(_CustomBusinessMonth):
@@ -4551,12 +4551,12 @@ prefix_mapping = {
     BYearEnd,  # 'BY'
     BusinessDay,  # 'B'
     BusinessMonthBegin,  # 'BMS'
-    BusinessMonthEnd,  # 'BM'
+    BusinessMonthEnd,  # 
'BME' BQuarterEnd, # 'BQ' BQuarterBegin, # 'BQS' BusinessHour, # 'bh' CustomBusinessDay, # 'C' - CustomBusinessMonthEnd, # 'CBM' + CustomBusinessMonthEnd, # 'CBME' CustomBusinessMonthBegin, # 'CBMS' CustomBusinessHour, # 'cbh' MonthEnd, # 'ME' diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a7183a9d9498a..f1ecc57335a51 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9187,11 +9187,11 @@ def resample( Use frame.T.resample(...) instead. closed : {{'right', 'left'}}, default None Which side of bin interval is closed. The default is 'left' - for all frequency offsets except for 'ME', 'Y', 'Q', 'BM', + for all frequency offsets except for 'ME', 'Y', 'Q', 'BME', 'BA', 'BQ', and 'W' which all have a default of 'right'. label : {{'right', 'left'}}, default None Which bin edge label to label bucket with. The default is 'left' - for all frequency offsets except for 'ME', 'Y', 'Q', 'BM', + for all frequency offsets except for 'ME', 'Y', 'Q', 'BME', 'BA', 'BQ', and 'W' which all have a default of 'right'. convention : {{'start', 'end', 's', 'e'}}, default 'start' For `PeriodIndex` only, controls whether to use the start or diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 8b3071a6f8582..969748209772a 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2101,7 +2101,7 @@ def __init__( else: freq = to_offset(freq) - end_types = {"ME", "Y", "Q", "BM", "BY", "BQ", "W"} + end_types = {"ME", "Y", "Q", "BME", "BY", "BQ", "W"} rule = freq.rule_code if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types): if closed is None: @@ -2297,7 +2297,7 @@ def _adjust_bin_edges( ) -> tuple[DatetimeIndex, npt.NDArray[np.int64]]: # Some hacks for > daily data, see #1471, #1458, #1483 - if self.freq.name in ("BM", "ME", "W") or self.freq.name.split("-")[0] in ( + if self.freq.name in ("BME", "ME", "W") or self.freq.name.split("-")[0] in ( "BQ", "BY", "Q", diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index b3ab11d07bd7e..bc6e74d5b1f00 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -32,16 +32,16 @@ def test_asfreq2(self, frame_or_series): datetime(2009, 11, 30), datetime(2009, 12, 31), ], - freq="BM", + freq="BME", ), ) daily_ts = ts.asfreq("B") - monthly_ts = daily_ts.asfreq("BM") + monthly_ts = daily_ts.asfreq("BME") tm.assert_equal(monthly_ts, ts) daily_ts = ts.asfreq("B", method="pad") - monthly_ts = daily_ts.asfreq("BM") + monthly_ts = daily_ts.asfreq("BME") tm.assert_equal(monthly_ts, ts) daily_ts = ts.asfreq(offsets.BDay()) @@ -140,12 +140,12 @@ def test_asfreq_resample_set_correct_freq(self, frame_or_series): def test_asfreq_empty(self, datetime_frame): # test does not blow up on length-0 DataFrame zero_length = datetime_frame.reindex([]) - result = zero_length.asfreq("BM") + result = zero_length.asfreq("BME") assert result is not zero_length def test_asfreq(self, datetime_frame): offset_monthly = datetime_frame.asfreq(offsets.BMonthEnd()) - rule_monthly = datetime_frame.asfreq("BM") + rule_monthly = datetime_frame.asfreq("BME") tm.assert_frame_equal(offset_monthly, rule_monthly) diff --git a/pandas/tests/indexes/datetimes/methods/test_shift.py b/pandas/tests/indexes/datetimes/methods/test_shift.py index 064f664a4de10..c50f75894d810 100644 --- a/pandas/tests/indexes/datetimes/methods/test_shift.py +++ b/pandas/tests/indexes/datetimes/methods/test_shift.py @@ -157,6 +157,6 @@ def test_shift_bmonth(self): def 
test_shift_empty(self): # GH#14811 - dti = date_range(start="2016-10-21", end="2016-10-21", freq="BM") + dti = date_range(start="2016-10-21", end="2016-10-21", freq="BME") result = dti.shift(1) tm.assert_index_equal(result, dti) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 6839fafcdc114..d95cd6f3a2cc5 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -63,7 +63,7 @@ def test_to_period_annualish(self, off): assert prng.freq == "Y-DEC" def test_to_period_monthish(self): - offsets = ["MS", "BM"] + offsets = ["MS", "BME"] for off in offsets: rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index a18501a193b60..3b0b856d07673 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -264,3 +264,17 @@ def test_AS_BA_BAS_deprecated(self, freq_depr, expected_values, expected_freq): ) tm.assert_index_equal(result, expected) + + def test_BM_deprecated(self): + # GH#52064 + msg = "'BM' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range(start="2016-02-21", end="2016-08-21", freq="2BM") + result = DatetimeIndex( + ["2016-02-29", "2016-04-29", "2016-06-30"], + dtype="datetime64[ns]", + freq="2BME", + ) + + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 0a5287d154adc..d86d78ba47c5b 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -141,7 +141,7 @@ def test_datetimeindex_accessors4(self): def test_datetimeindex_accessors5(self): freq_m = to_offset("ME") - bm = to_offset("BM") + bm = to_offset("BME") qfeb = to_offset("Q-FEB") qsfeb = to_offset("QS-FEB") bq = to_offset("BQ") diff --git a/pandas/tests/indexes/multi/test_get_level_values.py b/pandas/tests/indexes/multi/test_get_level_values.py index 84907f5279876..28c77e78924cb 100644 --- a/pandas/tests/indexes/multi/test_get_level_values.py +++ b/pandas/tests/indexes/multi/test_get_level_values.py @@ -115,7 +115,7 @@ def test_get_level_values_when_periods(): def test_values_loses_freq_of_underlying_index(): # GH#49054 - idx = pd.DatetimeIndex(date_range("20200101", periods=3, freq="BM")) + idx = pd.DatetimeIndex(date_range("20200101", periods=3, freq="BME")) expected = idx.copy(deep=True) idx2 = Index([1, 2, 3]) midx = MultiIndex(levels=[idx, idx2], codes=[[0, 1, 2], [0, 1, 2]]) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index db7c0cec09e6c..a384fd9cdc8f2 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -360,7 +360,7 @@ def test_business_freq(self): assert PeriodIndex(data=idx).freqstr == "B" def test_business_freq_convert(self): - bts = tm.makeTimeSeries(300).asfreq("BM") + bts = tm.makeTimeSeries(300).asfreq("BME") ts = bts.to_period("M") _, ax = mpl.pyplot.subplots() bts.plot(ax=ax) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index f66f5bf50974e..e0ba7902a8a6c 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py 
@@ -498,12 +498,12 @@ def test_resample_how_method(unit): def test_resample_extra_index_point(unit): # GH#9756 - index = date_range(start="20150101", end="20150331", freq="BM").as_unit(unit) + index = date_range(start="20150101", end="20150331", freq="BME").as_unit(unit) expected = DataFrame({"A": Series([21, 41, 63], index=index)}) index = date_range(start="20150101", end="20150331", freq="B").as_unit(unit) df = DataFrame({"A": Series(range(len(index)), index=index)}, dtype="int64") - result = df.resample("BM").last() + result = df.resample("BME").last() tm.assert_frame_equal(result, expected) @@ -2020,6 +2020,17 @@ def test_resample_M_deprecated(): tm.assert_series_equal(result, expected) +def test_resample_BM_deprecated(): + # GH#52064 + depr_msg = "'BM' is deprecated and will be removed in a future version." + + s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) + expected = s.resample("2BME").mean() + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = s.resample("2BM").mean() + tm.assert_series_equal(result, expected) + + def test_resample_ms_closed_right(): # https://github.com/pandas-dev/pandas/issues/55271 dti = date_range(start="2020-01-31", freq="1min", periods=6000) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 22ff7f8405a40..aec9915d69e3c 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -53,7 +53,7 @@ def base_delta_code_pair(request): freqs = ( [f"Q-{month}" for month in MONTHS] + [f"{annual}-{month}" for annual in ["Y", "BY"] for month in MONTHS] - + ["ME", "BM", "BMS"] + + ["ME", "BME", "BMS"] + [f"WOM-{count}{day}" for count in range(1, 5) for day in DAYS] + [f"W-{day}" for day in DAYS] ) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 9389f78c9e672..5678dd1fb511e 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -757,7 +757,7 @@ class TestOffsetNames: def test_get_offset_name(self): assert BDay().freqstr == "B" assert BDay(2).freqstr == "2B" - assert BMonthEnd().freqstr == "BM" + assert BMonthEnd().freqstr == "BME" assert Week(weekday=0).freqstr == "W-MON" assert Week(weekday=1).freqstr == "W-TUE" assert Week(weekday=2).freqstr == "W-WED" @@ -776,8 +776,8 @@ def test_get_offset(): pairs = [ ("B", BDay()), ("b", BDay()), - ("bm", BMonthEnd()), - ("Bm", BMonthEnd()), + ("bme", BMonthEnd()), + ("Bme", BMonthEnd()), ("W-MON", Week(weekday=0)), ("W-TUE", Week(weekday=1)), ("W-WED", Week(weekday=2)), @@ -811,7 +811,7 @@ def test_alias_equality(self): assert k == v.copy() def test_rule_code(self): - lst = ["ME", "MS", "BM", "BMS", "D", "B", "h", "min", "s", "ms", "us"] + lst = ["ME", "MS", "BME", "BMS", "D", "B", "h", "min", "s", "ms", "us"] for k in lst: assert k == _get_offset(k).rule_code # should be cached - this is kind of an internals test... 
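The net effect of this patch is that "BM" and "CBM" become deprecated aliases that
resolve to "BME" and "CBME" through DEPR_ABBREVS. A minimal sketch of the resulting
user-facing behavior, for illustration only, assuming a pandas build that includes
this patch; the warning text is the one asserted in the tests above:

    import warnings

    import pandas as pd

    # Deprecated alias: still resolves to business month end, but emits a
    # FutureWarning recommending 'BME' (behavior assumes this patch is applied).
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        old = pd.date_range("2023-01-01", periods=3, freq="BM")
    assert any("'BM' is deprecated" in str(m.message) for m in w)

    # Renamed alias: same business-month-end dates, no warning.
    new = pd.date_range("2023-01-01", periods=3, freq="BME")
    assert old.equals(new)
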
diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index db4fdf0d24465..4bd558a30c92f 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -369,7 +369,7 @@ def _get_monthly_rule(self) -> str | None: if pos_check is None: return None else: - return {"cs": "MS", "bs": "BMS", "ce": "ME", "be": "BM"}.get(pos_check) + return {"cs": "MS", "bs": "BMS", "ce": "ME", "be": "BME"}.get(pos_check) def _is_business_daily(self) -> bool: # quick check: cannot be business daily From 10cf330662b34a2686722abe5fab35009fb3ee9a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 15 Oct 2023 03:27:59 +0200 Subject: [PATCH 16/19] CLN/TST: clean logic of old datetime test_indexing test (#55523) --- pandas/tests/series/indexing/test_datetime.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 84cf80fa1ffce..fc1c80eb4dec6 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -427,10 +427,10 @@ def test_indexing(): # getting # GH 3070, make sure semantics work on Series/Frame - expected = ts["2001"] - expected.name = "A" + result = ts["2001"] + tm.assert_series_equal(result, ts.iloc[:12]) - df = DataFrame({"A": ts}) + df = DataFrame({"A": ts.copy()}) # GH#36179 pre-2.0 df["2001"] operated as slicing on rows. in 2.0 it behaves # like any other key, so raises @@ -438,14 +438,16 @@ def test_indexing(): df["2001"] # setting + ts = Series(np.random.default_rng(2).random(len(idx)), index=idx) + expected = ts.copy() + expected.iloc[:12] = 1 ts["2001"] = 1 - expected = ts["2001"] - expected.name = "A" + tm.assert_series_equal(ts, expected) + expected = df.copy() + expected.iloc[:12, 0] = 1 df.loc["2001", "A"] = 1 - - with pytest.raises(KeyError, match="2001"): - df["2001"] + tm.assert_frame_equal(df, expected) def test_getitem_str_month_with_datetimeindex(): From 68e3c4b2f855e6e9a8469aeca6eb73ae60327160 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 15 Oct 2023 15:39:09 +0200 Subject: [PATCH 17/19] BUG: idxmax raising for arrow strings (#55384) --- pandas/core/arrays/arrow/array.py | 11 ++++++++++- pandas/core/arrays/string_arrow.py | 11 +++++++++++ pandas/tests/frame/test_reductions.py | 9 +++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c91f892936640..a00640a88d7fb 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1627,6 +1627,15 @@ def _reduce( ------ TypeError : subclass does not define reductions """ + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + if isinstance(result, pa.Array): + return type(self)(result) + else: + return result + + def _reduce_calc( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) if keepdims: @@ -1637,7 +1646,7 @@ def _reduce( [pa_result], type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]), ) - return type(self)(result) + return result if pc.is_null(pa_result).as_py(): return self.dtype.na_value diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 24b99b5d4852e..2a10e87981bc3 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -502,6 +502,17 
@@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
     def _convert_int_dtype(self, result):
         return Int64Dtype().__from_arrow__(result)
 
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
+        result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
+        if name in ("argmin", "argmax") and isinstance(result, pa.Array):
+            return self._convert_int_dtype(result)
+        elif isinstance(result, pa.Array):
+            return type(self)(result)
+        else:
+            return result
+
     def _rank(
         self,
         *,
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index 0d5c2e3cd6c13..a17dc4a789fe3 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -1073,6 +1073,15 @@ def test_idxmax_arrow_types(self):
         expected = Series([2, 1], index=["a", "b"])
         tm.assert_series_equal(result, expected)
 
+        df = DataFrame({"a": ["b", "c", "a"]}, dtype="string[pyarrow]")
+        result = df.idxmax(numeric_only=False)
+        expected = Series([1], index=["a"])
+        tm.assert_series_equal(result, expected)
+
+        result = df.idxmin(numeric_only=False)
+        expected = Series([2], index=["a"])
+        tm.assert_series_equal(result, expected)
+
     def test_idxmax_axis_2(self, float_frame):
         frame = float_frame
         msg = "No axis named 2 for object type DataFrame"

From 7b8c6f6b410331f7b83348cb5f8812a58dab2b39 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sun, 15 Oct 2023 15:43:58 +0200
Subject: [PATCH 18/19] DOC: Adjust user guide for CoW docs (#55337)

---
 doc/source/user_guide/copy_on_write.rst | 130 ++++++++++++++----------
 1 file changed, 75 insertions(+), 55 deletions(-)

diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst
index 59bdb1926895f..d0c57b56585db 100644
--- a/doc/source/user_guide/copy_on_write.rst
+++ b/doc/source/user_guide/copy_on_write.rst
@@ -7,8 +7,8 @@ Copy-on-Write (CoW)
 *******************
 
 Copy-on-Write was first introduced in version 1.5.0. Starting from version 2.0 most of the
-optimizations that become possible through CoW are implemented and supported. A complete list
-can be found at :ref:`Copy-on-Write optimizations <copy_on_write.optimizations>`.
+optimizations that become possible through CoW are implemented and supported. All possible
+optimizations are supported starting from pandas 2.1.
 
 We expect that CoW will be enabled by default in version 3.0.
 
@@ -154,6 +154,77 @@ With copy on write this can be done by using ``loc``.
 
     df.loc[df["bar"] > 5, "foo"] = 100
 
+Read-only NumPy arrays
+----------------------
+
+Accessing the underlying NumPy array of a DataFrame will return a read-only array if the array
+shares data with the initial DataFrame.
+
+The array is a copy if the initial DataFrame consists of more than one array:
+
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]})
+    df.to_numpy()
+
+The array shares data with the DataFrame if the DataFrame consists of only one NumPy array:
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    df.to_numpy()
+
+This array is read-only, which means that it can't be modified inplace:
+
+.. ipython:: python
+    :okexcept:
+
+    arr = df.to_numpy()
+    arr[0, 0] = 100
+
+The same holds true for a Series, since a Series always consists of a single array.
+
+There are two potential solutions to this:
+
+- Trigger a copy manually if you want to avoid updating DataFrames that share memory with your array.
+- Make the array writeable. 
This is a more performant solution but circumvents Copy-on-Write rules, so + it should be used with caution. + +.. ipython:: python + + arr = df.to_numpy() + arr.flags.writeable = True + arr[0, 0] = 100 + arr + +Patterns to avoid +----------------- + +No defensive copy will be performed if two objects share the same data while +you are modifying one object inplace. + +.. ipython:: python + + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df2 = df.reset_index() + df2.iloc[0, 0] = 100 + +This creates two objects that share data and thus the setitem operation will trigger a +copy. This is not necessary if the initial object ``df`` isn't needed anymore. +Simply reassigning to the same variable will invalidate the reference that is +held by the object. + +.. ipython:: python + + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = df.reset_index() + df.iloc[0, 0] = 100 + +No copy is necessary in this example. +Creating multiple references keeps unnecessary references alive +and thus will hurt performance with Copy-on-Write. + .. _copy_on_write.optimizations: Copy-on-Write optimizations @@ -161,59 +232,8 @@ Copy-on-Write optimizations A new lazy copy mechanism that defers the copy until the object in question is modified and only if this object shares data with another object. This mechanism was added to -following methods: - - - :meth:`DataFrame.reset_index` / :meth:`Series.reset_index` - - :meth:`DataFrame.set_index` - - :meth:`DataFrame.set_axis` / :meth:`Series.set_axis` - - :meth:`DataFrame.set_flags` / :meth:`Series.set_flags` - - :meth:`DataFrame.rename_axis` / :meth:`Series.rename_axis` - - :meth:`DataFrame.reindex` / :meth:`Series.reindex` - - :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` - - :meth:`DataFrame.assign` - - :meth:`DataFrame.drop` - - :meth:`DataFrame.dropna` / :meth:`Series.dropna` - - :meth:`DataFrame.select_dtypes` - - :meth:`DataFrame.align` / :meth:`Series.align` - - :meth:`Series.to_frame` - - :meth:`DataFrame.rename` / :meth:`Series.rename` - - :meth:`DataFrame.add_prefix` / :meth:`Series.add_prefix` - - :meth:`DataFrame.add_suffix` / :meth:`Series.add_suffix` - - :meth:`DataFrame.drop_duplicates` / :meth:`Series.drop_duplicates` - - :meth:`DataFrame.droplevel` / :meth:`Series.droplevel` - - :meth:`DataFrame.reorder_levels` / :meth:`Series.reorder_levels` - - :meth:`DataFrame.between_time` / :meth:`Series.between_time` - - :meth:`DataFrame.filter` / :meth:`Series.filter` - - :meth:`DataFrame.head` / :meth:`Series.head` - - :meth:`DataFrame.tail` / :meth:`Series.tail` - - :meth:`DataFrame.isetitem` - - :meth:`DataFrame.pipe` / :meth:`Series.pipe` - - :meth:`DataFrame.pop` / :meth:`Series.pop` - - :meth:`DataFrame.replace` / :meth:`Series.replace` - - :meth:`DataFrame.shift` / :meth:`Series.shift` - - :meth:`DataFrame.sort_index` / :meth:`Series.sort_index` - - :meth:`DataFrame.sort_values` / :meth:`Series.sort_values` - - :meth:`DataFrame.squeeze` / :meth:`Series.squeeze` - - :meth:`DataFrame.swapaxes` - - :meth:`DataFrame.swaplevel` / :meth:`Series.swaplevel` - - :meth:`DataFrame.take` / :meth:`Series.take` - - :meth:`DataFrame.to_timestamp` / :meth:`Series.to_timestamp` - - :meth:`DataFrame.to_period` / :meth:`Series.to_period` - - :meth:`DataFrame.truncate` - - :meth:`DataFrame.iterrows` - - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize` - - :meth:`DataFrame.fillna` / :meth:`Series.fillna` - - :meth:`DataFrame.interpolate` / :meth:`Series.interpolate` - - :meth:`DataFrame.ffill` / :meth:`Series.ffill` - - 
:meth:`DataFrame.bfill` / :meth:`Series.bfill` - - :meth:`DataFrame.where` / :meth:`Series.where` - - :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects` - - :meth:`DataFrame.astype` / :meth:`Series.astype` - - :meth:`DataFrame.convert_dtypes` / :meth:`Series.convert_dtypes` - - :meth:`DataFrame.join` - - :meth:`DataFrame.eval` - - :func:`concat` - - :func:`merge` +methods that don't require a copy of the underlying data. Popular examples are :meth:`DataFrame.drop` for ``axis=1`` +and :meth:`DataFrame.rename`. These methods return views when Copy-on-Write is enabled, which provides a significant performance improvement compared to the regular execution. From e0d6051f985994e594b07a2b93b9ca2eff43eae4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 15 Oct 2023 09:00:34 -1000 Subject: [PATCH 19/19] TST: Replace node.add_marker with applymarker (#55513) --- .../development/contributing_codebase.rst | 2 +- pandas/tests/apply/test_frame_apply.py | 2 +- pandas/tests/apply/test_frame_transform.py | 4 +- pandas/tests/apply/test_series_apply.py | 2 +- pandas/tests/apply/test_str.py | 6 +- pandas/tests/arithmetic/test_datetime64.py | 2 +- pandas/tests/arrays/string_/test_string.py | 6 +- pandas/tests/base/test_conversion.py | 2 +- pandas/tests/base/test_misc.py | 4 +- pandas/tests/computation/test_eval.py | 6 +- .../copy_view/test_core_functionalities.py | 2 +- .../tests/extension/decimal/test_decimal.py | 4 +- pandas/tests/extension/json/test_json.py | 6 +- pandas/tests/extension/test_arrow.py | 62 +++++++++---------- pandas/tests/extension/test_categorical.py | 4 +- pandas/tests/extension/test_masked.py | 2 +- pandas/tests/extension/test_numpy.py | 8 +-- pandas/tests/extension/test_sparse.py | 4 +- .../tests/frame/methods/test_interpolate.py | 2 +- pandas/tests/frame/methods/test_nlargest.py | 2 +- pandas/tests/frame/methods/test_quantile.py | 48 ++++---------- .../tests/frame/methods/test_sort_values.py | 4 +- .../frame/methods/test_to_dict_of_blocks.py | 2 +- pandas/tests/frame/test_arithmetic.py | 2 +- pandas/tests/frame/test_constructors.py | 4 +- pandas/tests/frame/test_reductions.py | 6 +- pandas/tests/frame/test_ufunc.py | 6 +- pandas/tests/generic/test_duplicate_labels.py | 2 +- pandas/tests/generic/test_finalize.py | 12 ++-- pandas/tests/groupby/methods/test_quantile.py | 2 +- .../groupby/methods/test_value_counts.py | 14 ++--- pandas/tests/groupby/test_categorical.py | 8 +-- pandas/tests/groupby/test_function.py | 4 +- pandas/tests/groupby/test_groupby_dropna.py | 2 +- .../tests/groupby/transform/test_transform.py | 4 +- pandas/tests/indexes/datetimes/test_ops.py | 2 +- pandas/tests/indexes/test_common.py | 4 +- pandas/tests/indexes/test_index_new.py | 2 +- pandas/tests/indexes/test_setops.py | 4 +- pandas/tests/indexing/test_loc.py | 2 +- pandas/tests/io/excel/test_readers.py | 54 ++++++++-------- pandas/tests/io/excel/test_writers.py | 2 +- pandas/tests/io/json/test_pandas.py | 4 +- pandas/tests/io/json/test_readlines.py | 16 ++--- pandas/tests/io/parser/common/test_float.py | 2 +- .../io/parser/common/test_read_errors.py | 4 +- pandas/tests/io/parser/conftest.py | 2 +- .../io/parser/dtypes/test_dtypes_basic.py | 2 +- pandas/tests/io/parser/test_comment.py | 4 +- pandas/tests/io/parser/test_compression.py | 2 +- pandas/tests/io/parser/test_encoding.py | 4 +- pandas/tests/io/parser/test_index_col.py | 2 +- pandas/tests/io/parser/test_parse_dates.py | 6 +- pandas/tests/io/parser/test_skiprows.py | 2 +- 
pandas/tests/io/parser/test_unsupported.py | 2 +- pandas/tests/io/pytables/test_round_trip.py | 2 +- pandas/tests/io/test_parquet.py | 6 +- pandas/tests/io/test_sql.py | 34 +++++----- .../reshape/concat/test_append_common.py | 4 +- .../scalar/timestamp/test_constructors.py | 2 +- pandas/tests/series/methods/test_astype.py | 4 +- .../tests/series/methods/test_interpolate.py | 2 +- pandas/tests/series/methods/test_map.py | 2 +- pandas/tests/series/test_arithmetic.py | 2 +- pandas/tests/series/test_constructors.py | 4 +- pandas/tests/series/test_ufunc.py | 2 +- pandas/tests/strings/test_api.py | 2 +- pandas/tests/tools/test_to_datetime.py | 2 +- pandas/tests/tools/test_to_numeric.py | 2 +- pandas/tests/tseries/offsets/test_common.py | 4 +- pandas/tests/tseries/offsets/test_offsets.py | 4 +- .../test_moments_consistency_expanding.py | 2 +- .../test_moments_consistency_rolling.py | 2 +- pandas/util/_test_decorators.py | 2 +- 74 files changed, 216 insertions(+), 242 deletions(-) diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index e0aa8be066914..e22b57dbbff17 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -540,7 +540,7 @@ xfail during the testing phase. To do so, use the ``request`` fixture: def test_xfail(request): mark = pytest.mark.xfail(raises=TypeError, reason="Indicate why here") - request.node.add_marker(mark) + request.applymarker(mark) xfail is not to be used for tests involving failure due to invalid user arguments. For these tests, we need to verify the correct exception type and error message diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index be988594ebf58..232cfceb3b6d6 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -325,7 +325,7 @@ def test_apply_empty_infer_type(ax, func, raw, axis, engine, request): mark = pytest.mark.xfail( reason="numba engine only supports raw=True at the moment" ) - request.node.add_marker(mark) + request.applymarker(mark) result = df.apply(func, axis=axis, engine=engine, raw=raw) if is_reduction: diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py index 2d57515882aed..558d76ae8fdc4 100644 --- a/pandas/tests/apply/test_frame_transform.py +++ b/pandas/tests/apply/test_frame_transform.py @@ -156,7 +156,7 @@ def func(x): def test_transform_bad_dtype(op, frame_or_series, request): # GH 35964 if op == "ngroup": - request.node.add_marker( + request.applymarker( pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame") ) @@ -185,7 +185,7 @@ def test_transform_failure_typeerror(request, op): # GH 35964 if op == "ngroup": - request.node.add_marker( + request.applymarker( pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame") ) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 643b9220999f7..b8026d771baed 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -321,7 +321,7 @@ def test_transform(string_series, by_row): def test_transform_partial_failure(op, request): # GH 35964 if op in ("ffill", "bfill", "pad", "backfill", "shift"): - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason=f"{op} is successful on any dtype") ) diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 
363d0285cabbc..c046c60e174b3 100644
--- a/pandas/tests/apply/test_str.py
+++ b/pandas/tests/apply/test_str.py
@@ -31,7 +31,7 @@
 @pytest.mark.parametrize("how", ["agg", "apply"])
 def test_apply_with_string_funcs(request, float_frame, func, args, kwds, how):
     if len(args) > 1 and how == "agg":
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 raises=TypeError,
                 reason="agg/apply signature mismatch - agg passes 2nd "
@@ -256,7 +256,7 @@ def test_agg_cython_table_transform_frame(df, func, expected, axis):
 def test_transform_groupby_kernel_series(request, string_series, op):
     # GH 35964
     if op == "ngroup":
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
         )
     args = [0.0] if op == "fillna" else []
@@ -269,7 +269,7 @@ def test_transform_groupby_kernel_series(request, string_series, op):
 @pytest.mark.parametrize("op", frame_transform_kernels)
 def test_transform_groupby_kernel_frame(request, axis, float_frame, op):
     if op == "ngroup":
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
         )
 
diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py
index 693b8d9483407..67bcafd583086 100644
--- a/pandas/tests/arithmetic/test_datetime64.py
+++ b/pandas/tests/arithmetic/test_datetime64.py
@@ -1084,7 +1084,7 @@ def test_dt64arr_addsub_intlike(
     # GH#19959, GH#19123, GH#19012
     tz = tz_naive_fixture
     if box_with_array is pd.DataFrame:
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(raises=ValueError, reason="Axis alignment fails")
         )
 
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 89cc31ec5ecc8..451136225a612 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -151,7 +151,7 @@ def test_add_2d(dtype, request, arrow_string_storage):
     if dtype.storage in arrow_string_storage:
         reason = "Failed: DID NOT RAISE <class 'TypeError'>"
         mark = pytest.mark.xfail(raises=None, reason=reason)
-        request.node.add_marker(mark)
+        request.applymarker(mark)
 
     a = pd.array(["a", "b", "c"], dtype=dtype)
     b = np.array([["a", "b", "c"]], dtype=object)
@@ -180,7 +180,7 @@ def test_mul(dtype, request, arrow_string_storage):
     if dtype.storage in arrow_string_storage:
         reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'"
         mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason)
-        request.node.add_marker(mark)
+        request.applymarker(mark)
 
     a = pd.array(["a", "b", None], dtype=dtype)
     result = a * 2
@@ -446,7 +446,7 @@ def test_min_max_numpy(method, box, dtype, request, arrow_string_storage):
         else:
             reason = "'ArrowStringArray' object has no attribute 'max'"
         mark = pytest.mark.xfail(raises=TypeError, reason=reason)
-        request.node.add_marker(mark)
+        request.applymarker(mark)
 
     arr = box(["a", "b", "c", None], dtype=dtype)
     result = getattr(np, method)(arr)
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
index 3e0b0dbeb5624..c0f65c23c6d35 100644
--- a/pandas/tests/base/test_conversion.py
+++ b/pandas/tests/base/test_conversion.py
@@ -337,7 +337,7 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request):
 
     if arr.dtype.name == "int64" and box is pd.array:
         mark = pytest.mark.xfail(reason="thing is Int64 and to_numpy() returns object")
-        request.node.add_marker(mark)
+        request.applymarker(mark)
 
     result = thing.to_numpy()
     tm.assert_numpy_array_equal(result, 
expected) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 3ca53c4010449..c6fd4955d2d63 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -141,7 +141,7 @@ def test_searchsorted(request, index_or_series_obj): if isinstance(obj, pd.MultiIndex): # See gh-14833 - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="np.searchsorted doesn't work on pd.MultiIndex: GH 14833" ) @@ -150,7 +150,7 @@ def test_searchsorted(request, index_or_series_obj): # TODO: Should Series cases also raise? Looks like they use numpy # comparison semantics https://github.com/numpy/numpy/issues/15981 mark = pytest.mark.xfail(reason="complex objects are not comparable") - request.node.add_marker(mark) + request.applymarker(mark) max_obj = max(obj, default=0) index = np.searchsorted(obj, max_obj) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 9c630e29ea8e6..f336ef34cae0a 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -194,7 +194,7 @@ def test_compound_invert_op(self, op, lhs, rhs, request, engine, parser): reason="Looks like expected is negative, unclear whether " "expected is incorrect or result is incorrect" ) - request.node.add_marker(mark) + request.applymarker(mark) skip_these = ["in", "not in"] ex = f"~(lhs {op} rhs)" @@ -860,7 +860,7 @@ def test_basic_series_frame_alignment( f"parser={parser}, index_name={index_name}, " f"r_idx_type={r_idx_type}, c_idx_type={c_idx_type}" ) - request.node.add_marker(pytest.mark.xfail(reason=reason, strict=False)) + request.applymarker(pytest.mark.xfail(reason=reason, strict=False)) df = tm.makeCustomDataframe( 10, 7, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type ) @@ -1883,7 +1883,7 @@ def test_negate_lt_eq_le(engine, parser): def test_eval_no_support_column_name(request, column): # GH 44603 if column in ["True", "False", "inf", "Inf"]: - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=KeyError, reason=f"GH 47859 DataFrame eval not supported with {column}", diff --git a/pandas/tests/copy_view/test_core_functionalities.py b/pandas/tests/copy_view/test_core_functionalities.py index 5c177465d2fa4..bfdf2b92fd326 100644 --- a/pandas/tests/copy_view/test_core_functionalities.py +++ b/pandas/tests/copy_view/test_core_functionalities.py @@ -57,7 +57,7 @@ def test_setitem_with_view_invalidated_does_not_copy(using_copy_on_write, reques mark = pytest.mark.xfail( reason="blk.delete does not track references correctly" ) - request.node.add_marker(mark) + request.applymarker(mark) assert np.shares_memory(arr, get_array(df, "a")) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 8dbd1c4c511d0..5ccffd1d25b3d 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -85,14 +85,14 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request): if all_numeric_reductions in ["kurt", "skew", "sem", "median"]: mark = pytest.mark.xfail(raises=NotImplementedError) - request.node.add_marker(mark) + request.applymarker(mark) super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) def test_reduce_frame(self, data, all_numeric_reductions, skipna, request): op_name = all_numeric_reductions if op_name in ["skew", "median"]: mark = 
pytest.mark.xfail(raises=NotImplementedError) - request.node.add_marker(mark) + request.applymarker(mark) return super().test_reduce_frame(data, all_numeric_reductions, skipna) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 68a27a28b160f..71133030a5c18 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -237,7 +237,7 @@ def test_equals_same_data_different_object( ): if using_copy_on_write: mark = pytest.mark.xfail(reason="Fails with CoW") - request.node.add_marker(mark) + request.applymarker(mark) super().test_equals_same_data_different_object(data) @@ -300,7 +300,7 @@ class TestArithmeticOps(BaseJSON, base.BaseArithmeticOpsTests): def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): if len(data[0]) != 1: mark = pytest.mark.xfail(reason="raises in coercing to Series") - request.node.add_marker(mark) + request.applymarker(mark) super().test_arith_frame_with_scalar(data, all_arithmetic_operators) @@ -308,7 +308,7 @@ class TestComparisonOps(BaseJSON, base.BaseComparisonOpsTests): def test_compare_array(self, data, comparison_op, request): if comparison_op.__name__ in ["eq", "ne"]: mark = pytest.mark.xfail(reason="Comparison methods not implemented") - request.node.add_marker(mark) + request.applymarker(mark) super().test_compare_array(data, comparison_op) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 41312f45838a9..86aef2642750e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -78,7 +78,7 @@ def _require_timezone_database(request): "on CI to path to the tzdata for pyarrow." ), ) - request.node.add_marker(mark) + request.applymarker(mark) @pytest.fixture(params=tm.ALL_PYARROW_DTYPES, ids=str) @@ -271,7 +271,7 @@ class TestArrowArray(base.ExtensionTests): def test_astype_str(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_binary(pa_dtype): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=f"For {pa_dtype} .astype(str) decodes.", ) @@ -286,7 +286,7 @@ def test_from_dtype(self, data, request): else: reason = f"pyarrow.type_for_alias cannot infer {pa_dtype}" - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=reason, ) @@ -313,7 +313,7 @@ def test_from_sequence_pa_array_notimplemented(self, request): def test_from_sequence_of_strings_pa_array(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_time64(pa_dtype) and pa_dtype.equals("time64[ns]") and not PY311: - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="Nanosecond time parsing not supported.", ) @@ -321,7 +321,7 @@ def test_from_sequence_of_strings_pa_array(self, data, request): elif pa_version_under11p0 and ( pa.types.is_duration(pa_dtype) or pa.types.is_decimal(pa_dtype) ): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, reason=f"pyarrow doesn't support parsing {pa_dtype}", @@ -402,12 +402,12 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques mark = pytest.mark.xfail( reason=f"{all_numeric_accumulations} not implemented for pyarrow < 9" ) - request.node.add_marker(mark) + request.applymarker(mark) elif all_numeric_accumulations == "cumsum" and ( pa.types.is_boolean(pa_type) or pa.types.is_decimal(pa_type) ): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( 
reason=f"{all_numeric_accumulations} not implemented for {pa_type}", raises=NotImplementedError, @@ -496,19 +496,19 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, reque if all_numeric_reductions in {"skew", "kurt"} and ( dtype._is_numeric or dtype.kind == "b" ): - request.node.add_marker(xfail_mark) + request.applymarker(xfail_mark) elif ( all_numeric_reductions in {"var", "std", "median"} and pa_version_under7p0 and pa.types.is_decimal(pa_dtype) ): - request.node.add_marker(xfail_mark) + request.applymarker(xfail_mark) elif ( all_numeric_reductions == "sem" and pa_version_under8p0 and (dtype._is_numeric or pa.types.is_temporal(pa_dtype)) ): - request.node.add_marker(xfail_mark) + request.applymarker(xfail_mark) elif pa.types.is_boolean(pa_dtype) and all_numeric_reductions in { "sem", @@ -516,7 +516,7 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, reque "var", "median", }: - request.node.add_marker(xfail_mark) + request.applymarker(xfail_mark) super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) @pytest.mark.parametrize("skipna", [True, False]) @@ -532,7 +532,7 @@ def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna, reque if pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype): # We *might* want to make this behave like the non-pyarrow cases, # but have not yet decided. - request.node.add_marker(xfail_mark) + request.applymarker(xfail_mark) return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) @@ -560,7 +560,7 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna, request): if op_name == "skew": if data.dtype._is_numeric: mark = pytest.mark.xfail(reason="skew not implemented") - request.node.add_marker(mark) + request.applymarker(mark) return super().test_reduce_frame(data, all_numeric_reductions, skipna) @pytest.mark.parametrize("typ", ["int64", "uint64", "float64"]) @@ -592,7 +592,7 @@ def test_in_numeric_groupby(self, data_for_grouping): def test_construct_from_string_own_name(self, dtype, request): pa_dtype = dtype.pyarrow_dtype if pa.types.is_decimal(pa_dtype): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=NotImplementedError, reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}", @@ -616,7 +616,7 @@ def test_is_dtype_from_name(self, dtype, request): assert not type(dtype).is_dtype(dtype.name) else: if pa.types.is_decimal(pa_dtype): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=NotImplementedError, reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}", @@ -638,7 +638,7 @@ def test_get_common_dtype(self, dtype, request): or pa.types.is_binary(pa_dtype) or pa.types.is_decimal(pa_dtype) ): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=( f"{pa_dtype} does not have associated numpy " @@ -690,21 +690,21 @@ def test_setitem_preserves_views(self, data): def test_EA_types(self, engine, data, dtype_backend, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_decimal(pa_dtype): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=NotImplementedError, reason=f"Parameterized types {pa_dtype} not supported.", ) ) elif pa.types.is_timestamp(pa_dtype) and pa_dtype.unit in ("us", "ns"): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=ValueError, reason="https://github.com/pandas-dev/pandas/issues/49767", ) ) elif pa.types.is_binary(pa_dtype): - request.node.add_marker( + request.applymarker( 
pytest.mark.xfail(reason="CSV parsers don't correctly handle binary") ) df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) @@ -725,7 +725,7 @@ def test_EA_types(self, engine, data, dtype_backend, request): def test_invert(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if not (pa.types.is_boolean(pa_dtype) or pa.types.is_integer(pa_dtype)): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, reason=f"pyarrow.compute.invert does support {pa_dtype}", @@ -737,7 +737,7 @@ def test_invert(self, data, request): def test_diff(self, data, periods, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_unsigned_integer(pa_dtype) and periods == 1: - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=pa.ArrowInvalid, reason=( @@ -756,7 +756,7 @@ def test_value_counts_returns_pyarrow_int64(self, data): def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, request): pa_dtype = data_for_sorting.dtype.pyarrow_dtype if pa.types.is_decimal(pa_dtype) and pa_version_under7p0: - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=f"No pyarrow kernel for {pa_dtype}", raises=pa.ArrowNotImplementedError, @@ -782,7 +782,7 @@ def test_argreduce_series( ): pa_dtype = data_missing_for_sorting.dtype.pyarrow_dtype if pa.types.is_decimal(pa_dtype) and pa_version_under7p0 and skipna: - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=f"No pyarrow kernel for {pa_dtype}", raises=pa.ArrowNotImplementedError, @@ -1036,7 +1036,7 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: - request.node.add_marker(mark) + request.applymarker(mark) super().test_arith_series_with_scalar(data, all_arithmetic_operators) @@ -1050,7 +1050,7 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: - request.node.add_marker(mark) + request.applymarker(mark) super().test_arith_frame_with_scalar(data, all_arithmetic_operators) @@ -1066,7 +1066,7 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators, request): and pa.types.is_unsigned_integer(pa_dtype) and not pa_version_under7p0 ): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=pa.ArrowInvalid, reason=( @@ -1078,7 +1078,7 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators, request): mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: - request.node.add_marker(mark) + request.applymarker(mark) op_name = all_arithmetic_operators ser = pd.Series(data) @@ -1092,7 +1092,7 @@ def test_add_series_with_extension_array(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa_dtype.equals("int8"): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=pa.ArrowInvalid, reason=f"raises on overflow for {pa_dtype}", @@ -1368,7 +1368,7 @@ def test_quantile(data, interpolation, quantile, request): elif pa.types.is_temporal(data._pa_array.type): pass else: - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, reason=f"quantile not supported by pyarrow for {pa_dtype}", @@ -2830,7 +2830,7 @@ def test_infer_dtype_pyarrow_dtype(data, request): reason="in infer_dtype pd.NA is not ignored in these cases " "even 
with skipna=True in the list(data) check below" ) - request.node.add_marker(mark) + request.applymarker(mark) assert res == lib.infer_dtype(list(data), skipna=True) @@ -2881,7 +2881,7 @@ def test_arithmetic_temporal(pa_type, request): raises=pa.ArrowNotImplementedError, reason="Function 'subtract_checked' has no kernel matching input types", ) - request.node.add_marker(mark) + request.applymarker(mark) arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type)) unit = pa_type.unit diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 82b6c54bc3106..5cde5df4bc007 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -148,7 +148,7 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): # frame & scalar op_name = all_arithmetic_operators if op_name == "__rmod__": - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="rmod never called when string is first argument" ) @@ -158,7 +158,7 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): op_name = all_arithmetic_operators if op_name == "__rmod__": - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="rmod never called when string is first argument" ) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index d27e9b8b9e983..bd12bcced1448 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -234,7 +234,7 @@ def test_divmod_series_array(self, data, data_for_twos, request): "floordiv but not for divmod. This matches what we do for " "non-masked bool dtype." 
) - request.node.add_marker(mark) + request.applymarker(mark) super().test_divmod_series_array(data, data_for_twos) def test_combine_le(self, data_repeated): diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 542e938d1a40a..04b25fd0566da 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -169,7 +169,7 @@ def skip_numpy_object(dtype, request): """ if dtype == "object": mark = pytest.mark.xfail(reason="Fails for object dtype") - request.node.add_marker(mark) + request.applymarker(mark) skip_nested = pytest.mark.usefixtures("skip_numpy_object") @@ -198,7 +198,7 @@ def test_series_constructor_scalar_with_index(self, data, dtype): class TestDtype(BaseNumPyTests, base.BaseDtypeTests): def test_check_dtype(self, data, request): if data.dtype.numpy_dtype == "object": - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=f"NumpyExtensionArray expectedly clashes with a " f"NumPy name: {data.dtype.numpy_dtype}" @@ -261,7 +261,7 @@ def test_diff(self, data, periods): def test_insert(self, data, request): if data.dtype.numpy_dtype == object: mark = pytest.mark.xfail(reason="Dimension mismatch in np.concatenate") - request.node.add_marker(mark) + request.applymarker(mark) super().test_insert(data) @@ -289,7 +289,7 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators, request): opname = all_arithmetic_operators if data.dtype.numpy_dtype == object and opname not in ["__add__", "__radd__"]: mark = pytest.mark.xfail(reason="Fails for object dtype") - request.node.add_marker(mark) + request.applymarker(mark) super().test_arith_series_with_array(data, all_arithmetic_operators) @skip_nested diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index f56dea3f43de7..003f50e8e23a2 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -214,7 +214,7 @@ def test_fillna_limit_backfill(self, data_missing): def test_fillna_no_op_returns_copy(self, data, request): if np.isnan(data.fill_value): - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason="returns array with different fill value") ) super().test_fillna_no_op_returns_copy(data) @@ -392,7 +392,7 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): "rmod", ]: mark = pytest.mark.xfail(reason="result dtype.fill_value mismatch") - request.node.add_marker(mark) + request.applymarker(mark) super().test_arith_frame_with_scalar(data, all_arithmetic_operators) diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 67aa07dd83764..bb12d7e202e09 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -54,7 +54,7 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request # GH#44749 if using_array_manager and frame_or_series is DataFrame: mark = pytest.mark.xfail(reason=".values-based in-place check is invalid") - request.node.add_marker(mark) + request.applymarker(mark) obj = frame_or_series([1, np.nan, 2]) orig = obj.values diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 0bdf9a0e5c007..1196f8cd3886a 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -169,7 +169,7 @@ def test_nlargest_n_duplicate_index(self, df_duplicates, n, order, request): if 
Version(np.__version__) >= Version("1.25") and (
             (order == ["a"] and n in (1, 2, 3, 4)) or (order == ["a", "b"]) and n == 5
         ):
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason=(
                         "pandas default unstable sorting of duplicates"
diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
index 61b253b24a7ec..4bfe364e5eafc 100644
--- a/pandas/tests/frame/methods/test_quantile.py
+++ b/pandas/tests/frame/methods/test_quantile.py
@@ -63,7 +63,7 @@ def test_quantile(
             tm.assert_series_equal(result, expected)
         else:
             tm.assert_index_equal(result.index, expected.index)
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     using_array_manager, reason="Name set incorrectly for arraymanager"
                 )
@@ -83,7 +83,7 @@ def test_quantile(
             tm.assert_series_equal(result, expected)
         else:
             tm.assert_index_equal(result.index, expected.index)
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     using_array_manager, reason="Name set incorrectly for arraymanager"
                 )
@@ -107,9 +107,7 @@ def test_non_numeric_exclusion(self, interp_method, request, using_array_manager
         if interpolation == "nearest":
             xp = (xp + 0.5).astype(np.int64)
         if method == "table" and using_array_manager:
-            request.node.add_marker(
-                pytest.mark.xfail(reason="Axis name incorrectly set.")
-            )
+            request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
         tm.assert_series_equal(rs, xp)

     def test_axis(self, interp_method, request, using_array_manager):
@@ -121,9 +119,7 @@ def test_axis(self, interp_method, request, using_array_manager):
         if interpolation == "nearest":
             expected = expected.astype(np.int64)
         if method == "table" and using_array_manager:
-            request.node.add_marker(
-                pytest.mark.xfail(reason="Axis name incorrectly set.")
-            )
+            request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
         tm.assert_series_equal(result, expected)

         result = df.quantile(
@@ -151,9 +147,7 @@ def test_axis_numeric_only_true(self, interp_method, request, using_array_manage
         if interpolation == "nearest":
             expected = expected.astype(np.int64)
         if method == "table" and using_array_manager:
-            request.node.add_marker(
-                pytest.mark.xfail(reason="Axis name incorrectly set.")
-            )
+            request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
         tm.assert_series_equal(result, expected)

     def test_quantile_date_range(self, interp_method, request, using_array_manager):
@@ -170,9 +164,7 @@ def test_quantile_date_range(self, interp_method, request, using_array_manager):
             ["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]"
         )
         if method == "table" and using_array_manager:
-            request.node.add_marker(
-                pytest.mark.xfail(reason="Axis name incorrectly set.")
-            )
+            request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
         tm.assert_series_equal(result, expected)
@@ -194,9 +186,7 @@ def test_quantile_axis_mixed(self, interp_method, request, using_array_manager):
         if interpolation == "nearest":
             expected -= 0.5
         if method == "table" and using_array_manager:
-            request.node.add_marker(
-                pytest.mark.xfail(reason="Axis name incorrectly set.")
-            )
+            request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
         tm.assert_series_equal(result, expected)

         # must raise
@@ -208,9 +198,7 @@ def test_quantile_axis_parameter(self, interp_method, request, using_array_manag
         # GH 9543/9544
         interpolation, method = interp_method
         if method == "table" and using_array_manager:
-            request.node.add_marker(
-                pytest.mark.xfail(reason="Axis name incorrectly set.")
-            )
+            request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))

         df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
         result = df.quantile(0.5, axis=0, interpolation=interpolation, method=method)
@@ -336,9 +324,7 @@ def test_quantile_multi(self, interp_method, request, using_array_manager):
         if interpolation == "nearest":
             expected = expected.astype(np.int64)
         if method == "table" and using_array_manager:
-            request.node.add_marker(
-                pytest.mark.xfail(reason="Axis name incorrectly set.")
-            )
+            request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
         tm.assert_frame_equal(result, expected)

     def test_quantile_multi_axis_1(self, interp_method, request, using_array_manager):
@@ -353,9 +339,7 @@ def test_quantile_multi_axis_1(self, interp_method, request, using_array_manager
         if interpolation == "nearest":
             expected = expected.astype(np.int64)
         if method == "table" and using_array_manager:
-            request.node.add_marker(
-                pytest.mark.xfail(reason="Axis name incorrectly set.")
-            )
+            request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
         tm.assert_frame_equal(result, expected)

     def test_quantile_multi_empty(self, interp_method):
@@ -458,9 +442,7 @@ def test_quantile_invalid(self, invalid, datetime_frame, interp_method):
     def test_quantile_box(self, interp_method, request, using_array_manager):
         interpolation, method = interp_method
         if method == "table" and using_array_manager:
-            request.node.add_marker(
-                pytest.mark.xfail(reason="Axis name incorrectly set.")
-            )
+            request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
         df = DataFrame(
             {
                 "A": [
@@ -591,9 +573,7 @@ def test_quantile_box_nat(self):
     def test_quantile_nan(self, interp_method, request, using_array_manager):
         interpolation, method = interp_method
         if method == "table" and using_array_manager:
-            request.node.add_marker(
-                pytest.mark.xfail(reason="Axis name incorrectly set.")
-            )
+            request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
         # GH 14357 - float block where some cols have missing values
         df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)})
         df.iloc[-1, 1] = np.nan
@@ -640,9 +620,7 @@ def test_quantile_nat(self, interp_method, request, using_array_manager):
         interpolation, method = interp_method
         if method == "table" and using_array_manager:
-            request.node.add_marker(
-                pytest.mark.xfail(reason="Axis name incorrectly set.")
-            )
+            request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
         # full NaT column
         df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]})
diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py
index bd7d882f6d94a..f2f02058a534e 100644
--- a/pandas/tests/frame/methods/test_sort_values.py
+++ b/pandas/tests/frame/methods/test_sort_values.py
@@ -863,7 +863,7 @@ def test_sort_index_level_and_column_label(
             Version(np.__version__) >= Version("1.25")
             and request.node.callspec.id == "df_idx0-inner-True"
         ):
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason=(
                         "pandas default unstable sorting of duplicates"
@@ -907,7 +907,7 @@ def test_sort_column_level_and_index_label(
         result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1)

         if Version(np.__version__) >= Version("1.25"):
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason=(
                         "pandas default unstable sorting of duplicates"
diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py
index 906e74230a762..9d90111be6075 100644
--- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py
+++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py
@@ -52,7 +52,7 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write):

 def test_to_dict_of_blocks_item_cache(request, using_copy_on_write):
     if using_copy_on_write:
-        request.node.add_marker(pytest.mark.xfail(reason="CoW - not yet implemented"))
+        request.applymarker(pytest.mark.xfail(reason="CoW - not yet implemented"))
     # Calling to_dict_of_blocks should not poison item_cache
     df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
     df["c"] = NumpyExtensionArray(np.array([1, 2, None, 3], dtype=object))
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
index 09a5cda4b3458..1d1a4dbe83a9c 100644
--- a/pandas/tests/frame/test_arithmetic.py
+++ b/pandas/tests/frame/test_arithmetic.py
@@ -508,7 +508,7 @@ def test_floordiv_axis0_numexpr_path(self, opname, request):
             and opname == "pow"
             and "python" in request.node.callspec.id
         ):
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(reason="https://github.com/pydata/numexpr/issues/454")
             )
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 3d8053703e906..4307cfc1e1d4e 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -3171,7 +3171,7 @@ def test_from_out_of_bounds_ns_datetime(
                 "non-nano, but DatetimeArray._from_sequence has not",
                 strict=True,
             )
-            request.node.add_marker(mark)
+            request.applymarker(mark)
         scalar = datetime(9999, 1, 1)
         exp_dtype = "M8[us]"  # pydatetime objects default to this reso
@@ -3207,7 +3207,7 @@ def test_from_out_of_bounds_ns_timedelta(
                 "to non-nano, but TimedeltaArray._from_sequence has not",
                 strict=True,
             )
-            request.node.add_marker(mark)
+            request.applymarker(mark)
         scalar = datetime(9999, 1, 1) - datetime(1970, 1, 1)
         exp_dtype = "m8[us]"  # smallest reso that fits
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index a17dc4a789fe3..b42f2148f90d5 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -729,7 +729,7 @@ def test_std_datetime64_with_nat(
             mark = pytest.mark.xfail(
                 reason="GH#51446: Incorrect type inference on NaT in reduction result"
             )
-            request.node.add_marker(mark)
+            request.applymarker(mark)
         df = DataFrame({"a": to_datetime(values)})
         result = df.std(skipna=skipna)
         if not skipna or all(value is pd.NaT for value in values):
@@ -1594,7 +1594,7 @@ def test_reductions_skipna_none_raises(
         self, request, frame_or_series, all_reductions
     ):
         if all_reductions == "count":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(reason="Count does not accept skipna")
             )
         obj = frame_or_series([1, 2, 3])
@@ -1822,7 +1822,7 @@ def test_sum_timedelta64_skipna_false(using_array_manager, request):
         mark = pytest.mark.xfail(
             reason="Incorrect type inference on NaT in reduction result"
         )
-        request.node.add_marker(mark)
+        request.applymarker(mark)

    arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2)
    arr[-1, -1] = "Nat"
diff --git a/pandas/tests/frame/test_ufunc.py b/pandas/tests/frame/test_ufunc.py
index 305c0f8bba8ce..88c62da2b0a73 100644
--- a/pandas/tests/frame/test_ufunc.py
+++ b/pandas/tests/frame/test_ufunc.py
@@ -31,7 +31,7 @@ def test_unary_unary(dtype):

 def test_unary_binary(request, dtype):
     # unary input, binary output
     if is_extension_array_dtype(dtype) or isinstance(dtype, dict):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason="Extension / mixed with multiple outputs not implemented."
             )
@@ -106,7 +106,7 @@ def test_binary_input_aligns_columns(request, dtype_a, dtype_b):
         or is_extension_array_dtype(dtype_b)
         or isinstance(dtype_b, dict)
     ):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason="Extension / mixed with multiple inputs not implemented."
             )
@@ -135,7 +135,7 @@ def test_binary_input_aligns_columns(request, dtype_a, dtype_b):
 @pytest.mark.parametrize("dtype", dtypes)
 def test_binary_input_aligns_index(request, dtype):
     if is_extension_array_dtype(dtype) or isinstance(dtype, dict):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason="Extension / mixed with multiple inputs not implemented."
             )
diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py
index a81e013290b64..cb21ac6b83ee9 100644
--- a/pandas/tests/generic/test_duplicate_labels.py
+++ b/pandas/tests/generic/test_duplicate_labels.py
@@ -92,7 +92,7 @@ def test_preserve_getitem(self):

     def test_ndframe_getitem_caching_issue(self, request, using_copy_on_write):
         if not using_copy_on_write:
-            request.node.add_marker(pytest.mark.xfail(reason="Unclear behavior."))
+            request.applymarker(pytest.mark.xfail(reason="Unclear behavior."))
         # NDFrame.__getitem__ will cache the first df['A']. May need to
         # invalidate that cache? Update the cached entries?
         df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False)
diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py
index 0f7ae998a4b2b..68746b9e9a803 100644
--- a/pandas/tests/generic/test_finalize.py
+++ b/pandas/tests/generic/test_finalize.py
@@ -490,7 +490,7 @@ def test_binops(request, args, annotate, all_binary_operators):
     if not (isinstance(left, int) or isinstance(right, int)) and annotate != "both":
         if not all_binary_operators.__name__.startswith("r"):
             if annotate == "right" and isinstance(left, type(right)):
-                request.node.add_marker(
+                request.applymarker(
                     pytest.mark.xfail(
                         reason=f"{all_binary_operators} doesn't work when right has "
                         f"attrs and both are {type(left)}"
                     )
                 )
             if not isinstance(left, type(right)):
                 if annotate == "left" and isinstance(left, pd.Series):
-                    request.node.add_marker(
+                    request.applymarker(
                         pytest.mark.xfail(
                             reason=f"{all_binary_operators} doesn't work when the "
                             "objects are different Series has attrs"
                         )
                     )
                 elif annotate == "right" and isinstance(right, pd.Series):
-                    request.node.add_marker(
+                    request.applymarker(
                         pytest.mark.xfail(
                             reason=f"{all_binary_operators} doesn't work when the "
                             "objects are different Series has attrs"
                         )
                     )
         else:
             if annotate == "left" and isinstance(left, type(right)):
-                request.node.add_marker(
+                request.applymarker(
                     pytest.mark.xfail(
                         reason=f"{all_binary_operators} doesn't work when left has "
                         f"attrs and both are {type(left)}"
                     )
                 )
             if not isinstance(left, type(right)):
                 if annotate == "right" and isinstance(right, pd.Series):
-                    request.node.add_marker(
+                    request.applymarker(
                         pytest.mark.xfail(
                             reason=f"{all_binary_operators} doesn't work when the "
                             "objects are different Series has attrs"
                         )
                     )
                 elif annotate == "left" and isinstance(left, pd.Series):
-                    request.node.add_marker(
+                    request.applymarker(
                         pytest.mark.xfail(
                             reason=f"{all_binary_operators} doesn't work when the "
                             "objects are different Series has attrs"
diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py
index 4e7c09b70feb0..fcb9701e9881b 100644
--- a/pandas/tests/groupby/methods/test_quantile.py
+++ b/pandas/tests/groupby/methods/test_quantile.py
@@ -45,7 +45,7 @@ def test_quantile(interpolation, a_vals, b_vals, q, request):
         and isinstance(b_vals, list)
         and b_vals == [4, 3, 2, 1]
     ):
-        request.node.add_marker(
+        request.applymarker(
            pytest.mark.xfail(
                reason="Unclear numpy expectation for nearest "
                "result with equidistant data"
diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py
index 45a33d3b70f71..c1ee107715b71 100644
--- a/pandas/tests/groupby/methods/test_value_counts.py
+++ b/pandas/tests/groupby/methods/test_value_counts.py
@@ -249,7 +249,7 @@ def test_bad_subset(education_df):
 def test_basic(education_df, request):
     # gh43564
     if Version(np.__version__) >= Version("1.25"):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason=(
                     "pandas default unstable sorting of duplicates"
@@ -306,7 +306,7 @@ def test_against_frame_and_seriesgroupby(
     # - apply with :meth:`~DataFrame.value_counts`
     # - `~SeriesGroupBy.value_counts`
     if Version(np.__version__) >= Version("1.25") and frame and sort and normalize:
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason=(
                     "pandas default unstable sorting of duplicates"
@@ -482,7 +482,7 @@ def test_dropna_combinations(
     nulls_df, group_dropna, count_dropna, expected_rows, expected_values, request
 ):
     if Version(np.__version__) >= Version("1.25") and not group_dropna:
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason=(
                     "pandas default unstable sorting of duplicates"
@@ -586,7 +586,7 @@ def test_categorical_single_grouper_with_only_observed_categories(
     # Test single categorical grouper with only observed grouping categories
     # when non-groupers are also categorical
     if Version(np.__version__) >= Version("1.25"):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason=(
                     "pandas default unstable sorting of duplicates"
@@ -695,7 +695,7 @@ def test_categorical_single_grouper_observed_true(
     # GH#46357

     if Version(np.__version__) >= Version("1.25"):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason=(
                     "pandas default unstable sorting of duplicates"
@@ -776,7 +776,7 @@ def test_categorical_single_grouper_observed_false(
     # GH#46357

     if Version(np.__version__) >= Version("1.25"):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason=(
                     "pandas default unstable sorting of duplicates"
@@ -929,7 +929,7 @@ def test_categorical_non_groupers(
     # regardless of `observed`

     if Version(np.__version__) >= Version("1.25"):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason=(
                     "pandas default unstable sorting of duplicates"
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 11291bb89b604..939dd176ae90e 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -1443,7 +1443,7 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
         mark = pytest.mark.xfail(
             reason="TODO: implemented SeriesGroupBy.corrwith. See GH 32293"
         )
-        request.node.add_marker(mark)
+        request.applymarker(mark)

     df = DataFrame(
         {
@@ -1912,7 +1912,7 @@ def test_category_order_reducer(
     # GH#48749
     if reduction_func == "corrwith" and not as_index:
         msg = "GH#49950 - corrwith with as_index=False may not have grouping column"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
+        request.applymarker(pytest.mark.xfail(reason=msg))
     elif index_kind != "range" and not as_index:
         pytest.skip(reason="Result doesn't have categories, nothing to test")
     df = DataFrame(
@@ -2123,7 +2123,7 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys
         pytest.skip("corrwith not implemented for SeriesGroupBy")
     elif reduction_func == "corrwith":
         msg = "GH#32293: attempts to call SeriesGroupBy.corrwith"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
+        request.applymarker(pytest.mark.xfail(reason=msg))
     elif (
         reduction_func == "nunique"
         and not test_series
@@ -2132,7 +2132,7 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys
         and not as_index
     ):
         msg = "GH#52848 - raises a ValueError"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
+        request.applymarker(pytest.mark.xfail(reason=msg))

     df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]})
     df = df.astype({"a1": "category", "a2": "category"})
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 4876267c72f12..b840443aab347 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -418,7 +418,7 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only):
         pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1")
     if groupby_func in ("corrwith", "skew"):
         msg = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
+        request.applymarker(pytest.mark.xfail(reason=msg))

     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"]
@@ -822,7 +822,7 @@ def test_duplicate_columns(request, groupby_func, as_index):
     # GH#50806
     if groupby_func == "corrwith":
         msg = "GH#50845 - corrwith fails when there are duplicate columns"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
+        request.applymarker(pytest.mark.xfail(reason=msg))
     df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb"))
     args = get_groupby_method_args(groupby_func, df)
     gb = df.groupby("a", as_index=as_index)
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 8065aa63dff81..ab3920d18374b 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -591,7 +591,7 @@ def test_categorical_transformers(
     # GH#36327
     if transformation_func == "fillna":
         msg = "GH#49651 fillna may incorrectly reorders results when dropna=False"
-        request.node.add_marker(pytest.mark.xfail(reason=msg, strict=False))
+        request.applymarker(pytest.mark.xfail(reason=msg, strict=False))

     values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
     df = pd.DataFrame(
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 4a493ef3fd52c..add3c94dcd36a 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -203,7 +203,7 @@ def test_transform_axis_1_reducer(request, reduction_func):
         "nth",
     ):
         marker = pytest.mark.xfail(reason="transform incorrectly fails - GH#45986")
-        request.node.add_marker(marker)
+        request.applymarker(marker)
     df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])

     msg = "DataFrame.groupby with axis=1 is deprecated"
@@ -1455,7 +1455,7 @@ def test_null_group_str_reducer(request, dropna, reduction_func):
     # GH 17093
     if reduction_func == "corrwith":
         msg = "incorrectly raises"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
+        request.applymarker(pytest.mark.xfail(reason=msg))

     index = [1, 2, 3, 4]  # test transform preserves non-standard index
     df = DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=index)
diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py
index 7eea05c753b8a..c58d55ad6371b 100644
--- a/pandas/tests/indexes/datetimes/test_ops.py
+++ b/pandas/tests/indexes/datetimes/test_ops.py
@@ -34,7 +34,7 @@ class TestDatetimeIndexOps:
     def test_resolution(self, request, tz_naive_fixture, freq, expected):
         tz = tz_naive_fixture
         if freq == "Y" and not IS64 and isinstance(tz, tzlocal):
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038")
             )
diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py
index 6245a129afedc..412a59d15307d 100644
--- a/pandas/tests/indexes/test_common.py
+++ b/pandas/tests/indexes/test_common.py
@@ -258,7 +258,7 @@ def test_searchsorted_monotonic(self, index_flat, request):
                 reason="IntervalIndex.searchsorted does not support Interval arg",
                 raises=NotImplementedError,
             )
-            request.node.add_marker(mark)
+            request.applymarker(mark)

         # nothing to test if the index is empty
         if index.empty:
@@ -459,7 +459,7 @@ def test_sort_values_with_missing(index_with_missing, na_position, request):

     # sort non-missing and place missing according to na_position
     if isinstance(index_with_missing, CategoricalIndex):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason="missing value sorting order not well-defined", strict=False
             )
diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py
index d35c35661051a..b6de73266dc34 100644
--- a/pandas/tests/indexes/test_index_new.py
+++ b/pandas/tests/indexes/test_index_new.py
@@ -146,7 +146,7 @@ def test_constructor_infer_nat_dt_like(
         if nulls_fixture is NA:
             expected = Index([NA, NaT])
             mark = pytest.mark.xfail(reason="Broken with np.NaT ctor; see GH 31884")
-            request.node.add_marker(mark)
+            request.applymarker(mark)
             # GH#35942 numpy will emit a DeprecationWarning within the
             # assert_index_equal calls. Since we can't do anything
             # about it until GH#31884 is fixed, we suppress that warning.
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index d6304774b87c4..1dfeb4f2c6b5b 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -64,7 +64,7 @@ def test_union_different_types(index_flat, index_flat2, request):
         mark = pytest.mark.xfail(
             reason="GH#44000 True==1", raises=ValueError, strict=False
         )
-        request.node.add_marker(mark)
+        request.applymarker(mark)

     common_dtype = find_common_type([idx1.dtype, idx2.dtype])

@@ -89,7 +89,7 @@ def test_union_different_types(index_flat, index_flat2, request):
             raises=AssertionError,
             strict=False,
         )
-        request.node.add_marker(mark)
+        request.applymarker(mark)

     any_uint64 = np.uint64 in (idx1.dtype, idx2.dtype)
     idx1_signed = is_signed_integer_dtype(idx1.dtype)
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index 70eada188f3c8..6836e9a7c390e 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -1124,7 +1124,7 @@ def test_loc_copy_vs_view(self, request, using_copy_on_write):

         if not using_copy_on_write:
             mark = pytest.mark.xfail(reason="accidental fix reverted - GH37497")
-            request.node.add_marker(mark)
+            request.applymarker(mark)
         x = DataFrame(zip(range(3), range(3)), columns=["a", "b"])
         y = x.copy()
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 8dd9f96a05a90..c5bf935b0d54d 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -197,7 +197,7 @@ def test_usecols_int(self, read_ext):

     def test_usecols_list(self, request, engine, read_ext, df_ref):
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -221,7 +221,7 @@ def test_usecols_list(self, request, engine, read_ext, df_ref):

     def test_usecols_str(self, request, engine, read_ext, df_ref):
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -279,7 +279,7 @@ def test_usecols_diff_positional_int_columns_order(
         self, request, engine, read_ext, usecols, df_ref
     ):
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -301,7 +301,7 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r

     def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref):
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -313,7 +313,7 @@ def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref):

     def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref):
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -401,7 +401,7 @@ def test_excel_stop_iterator(self, read_ext):

     def test_excel_cell_error_na(self, request, engine, read_ext):
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -409,7 +409,7 @@ def test_excel_cell_error_na(self, request, engine, read_ext):

         # https://github.com/tafia/calamine/issues/355
         if engine == "calamine" and read_ext == ".ods":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(reason="Calamine can't extract error from ods files")
             )
@@ -419,7 +419,7 @@ def test_excel_cell_error_na(self, request, engine, read_ext):

     def test_excel_table(self, request, engine, read_ext, df_ref):
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -440,7 +440,7 @@ def test_excel_table(self, request, engine, read_ext, df_ref):

     def test_reader_special_dtypes(self, request, engine, read_ext):
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -778,7 +778,7 @@ def test_exception_message_includes_sheet_name(self, read_ext):

     def test_date_conversion_overflow(self, request, engine, read_ext):
         # GH 10001 : pandas.ExcelFile ignore parse_dates=False
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -794,13 +794,13 @@ def test_date_conversion_overflow(self, request, engine, read_ext):
         )

         if engine == "openpyxl":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(reason="Maybe not supported by openpyxl")
             )

         if engine is None and read_ext in (".xlsx", ".xlsm"):
             # GH 35029
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(reason="Defaults to openpyxl, maybe not supported")
             )
@@ -809,7 +809,7 @@ def test_date_conversion_overflow(self, request, engine, read_ext):

     def test_sheet_name(self, request, read_ext, engine, df_ref):
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -975,7 +975,7 @@ def test_close_from_py_localpath(self, read_ext):

     def test_reader_seconds(self, request, engine, read_ext):
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -983,7 +983,7 @@ def test_reader_seconds(self, request, engine, read_ext):

         # GH 55045
         if engine == "calamine" and read_ext == ".ods":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="ODS file contains bad datetime (seconds as text)"
                 )
@@ -1017,7 +1017,7 @@ def test_reader_seconds(self, request, engine, read_ext):

     def test_read_excel_multiindex(self, request, engine, read_ext):
         # see gh-4679
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -1025,9 +1025,7 @@ def test_read_excel_multiindex(self, request, engine, read_ext):

         # https://github.com/tafia/calamine/issues/354
         if engine == "calamine" and read_ext == ".ods":
-            request.node.add_marker(
-                pytest.mark.xfail(reason="Last test fails in calamine")
-            )
+            request.applymarker(pytest.mark.xfail(reason="Last test fails in calamine"))

         mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
         mi_file = "testmultiindex" + read_ext
@@ -1118,7 +1116,7 @@ def test_read_excel_multiindex_blank_after_name(
     ):
         # GH34673
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb (GH4679"
                 )
@@ -1241,7 +1239,7 @@ def test_read_excel_bool_header_arg(self, read_ext):

     def test_read_excel_skiprows(self, request, engine, read_ext):
         # GH 4903
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -1296,7 +1294,7 @@ def test_read_excel_skiprows(self, request, engine, read_ext):

     def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext):
         # GH 4903
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -1426,7 +1424,7 @@ def test_ignore_chartsheets_by_str(self, request, engine, read_ext):
         if read_ext == ".ods":
             pytest.skip("chartsheets do not exist in the ODF format")
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="pyxlsb can't distinguish chartsheets from worksheets"
                 )
@@ -1439,7 +1437,7 @@ def test_ignore_chartsheets_by_int(self, request, engine, read_ext):
         if read_ext == ".ods":
             pytest.skip("chartsheets do not exist in the ODF format")
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="pyxlsb can't distinguish chartsheets from worksheets"
                 )
@@ -1568,7 +1566,7 @@ def test_excel_passes_na_filter(self, read_ext, na_filter):

     def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref):
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -1597,7 +1595,7 @@ def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref):

     def test_sheet_name(self, request, engine, read_ext, df_ref):
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -1689,7 +1687,7 @@ def test_header_with_index_col(self, filename):

     def test_read_datetime_multiindex(self, request, engine, read_ext):
         # GH 34748
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="Sheets containing datetimes not supported by pyxlsb"
                 )
@@ -1720,7 +1718,7 @@ def test_ignore_chartsheets(self, request, engine, read_ext):
         if read_ext == ".ods":
             pytest.skip("chartsheets do not exist in the ODF format")
         if engine == "pyxlsb":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="pyxlsb can't distinguish chartsheets from worksheets"
                 )
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
index 18af18ade85f4..946c621ae2b6e 100644
--- a/pandas/tests/io/excel/test_writers.py
+++ b/pandas/tests/io/excel/test_writers.py
@@ -217,7 +217,7 @@ def test_excel_multindex_roundtrip(
                 reason="Column index name cannot be serialized unless "
                 "it's a MultiIndex"
             )
-            request.node.add_marker(mark)
+            request.applymarker(mark)

         # Empty name case current read in as
         # unnamed levels, not Nones.
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 2767078674632..7312facc44c26 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -242,7 +242,7 @@ def test_roundtrip_categorical(
     ):
         # TODO: create a better frame to test with and improve coverage
         if orient in ("index", "columns"):
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason=f"Can't have duplicate index values for orient '{orient}')"
                 )
@@ -1893,7 +1893,7 @@ def test_json_pandas_nulls(self, nulls_fixture, request):
         # GH 31615
         if isinstance(nulls_fixture, Decimal):
             mark = pytest.mark.xfail(reason="not implemented")
-            request.node.add_marker(mark)
+            request.applymarker(mark)

         result = DataFrame([[nulls_fixture]]).to_json()
         assert result == '{"0":{"0":null}}'
diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index d7baba87bba31..124d6890886a8 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -43,7 +43,7 @@ def test_read_datetime(request, engine):
     if engine == "pyarrow":
         # GH 48893
         reason = "Pyarrow only supports a file path as an input and line delimited json"
-        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
+        request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))

     df = DataFrame(
         [([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")],
@@ -121,7 +121,7 @@ def test_readjson_chunks(request, lines_json_df, chunksize, engine):
             "Pyarrow only supports a file path as an input and line delimited json"
             "and doesn't support chunksize parameter."
         )
-        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
+        request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))

     unchunked = read_json(StringIO(lines_json_df), lines=True)
     with read_json(
@@ -148,7 +148,7 @@ def test_readjson_chunks_series(request, engine):
             "Pyarrow only supports a file path as an input and line delimited json"
             "and doesn't support chunksize parameter."
         )
-        request.node.add_marker(pytest.mark.xfail(reason=reason))
+        request.applymarker(pytest.mark.xfail(reason=reason))

     # Test reading line-format JSON to Series with chunksize param
     s = pd.Series({"A": 1, "B": 2})
@@ -172,7 +172,7 @@ def test_readjson_each_chunk(request, lines_json_df, engine):
             "Pyarrow only supports a file path as an input and line delimited json"
             "and doesn't support chunksize parameter."
         )
-        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
+        request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))

     # Other tests check that the final result of read_json(chunksize=True)
     # is correct. This checks the intermediate chunks.
@@ -191,7 +191,7 @@ def test_readjson_chunks_from_file(request, engine):
             "Pyarrow only supports a file path as an input and line delimited json"
             "and doesn't support chunksize parameter."
         )
-        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
+        request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))

     with tm.ensure_clean("test.json") as path:
         df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
@@ -274,7 +274,7 @@ def test_readjson_unicode(request, monkeypatch, engine):
             "Pyarrow only supports a file path as an input and line delimited json"
             "and doesn't support chunksize parameter."
         )
-        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
+        request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))

     with tm.ensure_clean("test.json") as path:
         monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949")
@@ -309,7 +309,7 @@ def test_readjson_nrows_chunks(request, nrows, chunksize, engine):
             "Pyarrow only supports a file path as an input and line delimited json"
             "and doesn't support chunksize parameter."
         )
-        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
+        request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))

     jsonl = """{"a": 1, "b": 2}
 {"a": 3, "b": 4}
@@ -351,7 +351,7 @@ def test_readjson_lines_chunks_fileurl(request, datapath, engine):
             "Pyarrow only supports a file path as an input and line delimited json"
             "and doesn't support chunksize parameter."
         )
-        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
+        request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))

     df_list_expected = [
         DataFrame([[1, 2]], columns=["a", "b"], index=[0]),
diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py
index 2ca98de914f9e..8ec372420a0f0 100644
--- a/pandas/tests/io/parser/common/test_float.py
+++ b/pandas/tests/io/parser/common/test_float.py
@@ -55,7 +55,7 @@ def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
     if precision == "round_trip":
         if exp == 999999999999999999 and is_platform_linux():
             mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
-            request.node.add_marker(mark)
+            request.applymarker(mark)

         value = np.inf if exp > 0 else 0.0
         expected = DataFrame({"data": [value]})
diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py
index 4e82dca83e2d0..ff1af6e2dc81b 100644
--- a/pandas/tests/io/parser/common/test_read_errors.py
+++ b/pandas/tests/io/parser/common/test_read_errors.py
@@ -210,7 +210,7 @@ def test_null_byte_char(request, all_parsers):

     if parser.engine == "c" or (parser.engine == "python" and PY311):
         if parser.engine == "python" and PY311:
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="In Python 3.11, this is read as an empty character not null"
                 )
@@ -230,7 +230,7 @@ def test_open_file(request, all_parsers):
     # GH 39024
     parser = all_parsers
     if parser.engine == "c":
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason=f"{parser.engine} engine does not support sep=None "
                 f"with delim_whitespace=False"
diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
index 3ab40ff846cb6..591defdde7df9 100644
--- a/pandas/tests/io/parser/conftest.py
+++ b/pandas/tests/io/parser/conftest.py
@@ -278,7 +278,7 @@ def pyarrow_xfail(request):
         return
     if parser.engine == "pyarrow":
         mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
-        request.node.add_marker(mark)
+        request.applymarker(mark)


 @pytest.fixture
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index 97a32ad79a67c..8fb25fe3ee47e 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -234,7 +234,7 @@ def decimal_number_check(request, parser, numeric_decimal, thousands, float_prec
     # GH#31920
     value = numeric_decimal[0]
     if thousands is None and value in ("1_,", "1_234,56", "1_234,56e0"):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason=f"thousands={thousands} and sep is in {value}")
         )
     df = parser.read_csv(
diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py
index 9a14e67c154b6..5b738446ea441 100644
--- a/pandas/tests/io/parser/test_comment.py
+++ b/pandas/tests/io/parser/test_comment.py
@@ -45,7 +45,7 @@ def test_line_comment(all_parsers, read_kwargs, request):
             mark = pytest.mark.xfail(
                 reason="Custom terminator not supported with Python engine"
             )
-            request.node.add_marker(mark)
+            request.applymarker(mark)

         data = data.replace("\n", read_kwargs.get("lineterminator"))

@@ -146,7 +146,7 @@ def test_comment_char_in_default_value(all_parsers, request):
     if all_parsers.engine == "c":
         reason = "see gh-34002: works on the python engine but not the c engine"
         # NA value containing comment char is interpreted as comment
-        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=AssertionError))
+        request.applymarker(pytest.mark.xfail(reason=reason, raises=AssertionError))
     parser = all_parsers

     data = (
diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py
index d150b52258d47..1c67ec7c3066c 100644
--- a/pandas/tests/io/parser/test_compression.py
+++ b/pandas/tests/io/parser/test_compression.py
@@ -105,7 +105,7 @@ def test_compression(
     filename = filename if filename is None else filename.format(ext=ext)

     if filename and buffer:
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason="Cannot deduce compression from buffer of compressed data."
             )
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
index 3873bf31c1ed4..c09ab7898c67c 100644
--- a/pandas/tests/io/parser/test_encoding.py
+++ b/pandas/tests/io/parser/test_encoding.py
@@ -127,9 +127,7 @@ def _encode_data_with_bom(_data):
         and kwargs.get("skip_blank_lines", True)
     ):
         # Manually xfail, since we don't have mechanism to xfail specific version
-        request.node.add_marker(
-            pytest.mark.xfail(reason="Pyarrow can't read blank lines")
-        )
+        request.applymarker(pytest.mark.xfail(reason="Pyarrow can't read blank lines"))

     result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
index a7ded00e758b7..8213ea006614f 100644
--- a/pandas/tests/io/parser/test_index_col.py
+++ b/pandas/tests/io/parser/test_index_col.py
@@ -341,7 +341,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
     data = "a,b\n01,2"
     parser = all_parsers
     if dtype == object and parser.engine == "pyarrow":
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
         )
     result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 9f7840588f89e..bd08dd3a0c5d2 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -157,7 +157,7 @@ def test_multiple_date_col_custom(all_parsers, keep_date_col, request):
         mark = pytest.mark.xfail(
             reason="pyarrow doesn't support disabling auto-inference on column numbers."
         )
-        request.node.add_marker(mark)
+        request.applymarker(mark)

     def date_parser(*date_cols):
         """
@@ -326,7 +326,7 @@ def test_multiple_date_col(all_parsers, keep_date_col, request):
         mark = pytest.mark.xfail(
             reason="pyarrow doesn't support disabling auto-inference on column numbers."
         )
-        request.node.add_marker(mark)
+        request.applymarker(mark)

     kwds = {
         "header": None,
@@ -1836,7 +1836,7 @@ def test_hypothesis_delimited_date(
     request, date_format, dayfirst, delimiter, test_datetime
 ):
     if date_format == "%m %Y" and delimiter == ".":
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason="parse_datetime_string cannot reliably tell whether "
                 "e.g. %m.%Y is a float or a date"
diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py
index c58e27aacfa00..4b509edc36925 100644
--- a/pandas/tests/io/parser/test_skiprows.py
+++ b/pandas/tests/io/parser/test_skiprows.py
@@ -204,7 +204,7 @@ def test_skiprows_lineterminator(all_parsers, lineterminator, request):

     if parser.engine == "python" and lineterminator == "\r":
         mark = pytest.mark.xfail(reason="'CR' not respect with the Python parser yet")
-        request.node.add_marker(mark)
+        request.applymarker(mark)

     data = data.replace("\n", lineterminator)
     result = parser.read_csv(
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
index b489c09e917af..f201f6c394566 100644
--- a/pandas/tests/io/parser/test_unsupported.py
+++ b/pandas/tests/io/parser/test_unsupported.py
@@ -190,7 +190,7 @@ def test_invalid_file_inputs(request, all_parsers):
     # GH#45957
     parser = all_parsers
     if parser.engine == "python":
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason=f"{parser.engine} engine supports lists.")
         )
diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py
index 48983cbb5ec28..af24a5cf7e0ab 100644
--- a/pandas/tests/io/pytables/test_round_trip.py
+++ b/pandas/tests/io/pytables/test_round_trip.py
@@ -331,7 +331,7 @@ def test_timeseries_preepoch(setup_path, request):
         _check_roundtrip(ts, tm.assert_series_equal, path=setup_path)
     except OverflowError:
         if is_platform_windows():
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail("known failure on some windows platforms")
             )
         raise
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index b043f9fab23ae..1538275e6af73 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -453,7 +453,7 @@ def test_read_filters(self, engine, tmp_path):
     def test_write_index(self, engine, using_copy_on_write, request):
         check_names = engine != "fastparquet"
         if using_copy_on_write and engine == "fastparquet":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(reason="fastparquet write into index")
             )
@@ -626,7 +626,7 @@ def test_dtype_backend(self, engine, request):
             mark = pytest.mark.xfail(
                 reason="Fastparquet nullable dtype support is disabled"
             )
-            request.node.add_marker(mark)
+            request.applymarker(mark)

         table = pyarrow.table(
             {
@@ -988,7 +988,7 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list):
             not pa_version_under7p0
             and timezone_aware_date_list.tzinfo != datetime.timezone.utc
         ):
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason="temporary skip this test until it is properly resolved: "
                     "https://github.com/pandas-dev/pandas/issues/37286"
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 11f95ff104767..63546b44e92be 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -817,7 +817,7 @@ def sample(pd_table, conn, keys, data_iter):
 def test_default_type_conversion(conn, request):
     conn_name = conn
     if conn_name == "sqlite_buildin_iris":
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason="sqlite_buildin connection does not implement read_sql_table"
             )
@@ -1155,7 +1155,7 @@ def test_read_sql_iris_named_parameter(conn, request, sql_strings, flavor):
 @pytest.mark.parametrize("conn", all_connectable_iris)
 def test_read_sql_iris_no_parameter_with_percent(conn, request, sql_strings, flavor):
     if "mysql" in conn or "postgresql" in conn:
-        request.node.add_marker(pytest.mark.xfail(reason="broken test"))
+        request.applymarker(pytest.mark.xfail(reason="broken test"))

     conn_name = conn
     conn = request.getfixturevalue(conn)
@@ -1399,7 +1399,7 @@ def test_api_custom_dateparsing_error(
     conn_name = conn
     conn = request.getfixturevalue(conn)
     if text == "types" and conn_name == "sqlite_buildin_iris":
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason="failing combination of arguments")
         )
@@ -1497,7 +1497,7 @@ def test_api_to_sql_index_label(conn, request, index_name, index_label, expected
 def test_api_to_sql_index_label_multiindex(conn, request):
     conn_name = conn
     if "mysql" in conn_name:
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason="MySQL can fail using TEXT without length as key", strict=False
             )
@@ -1802,7 +1802,7 @@ def test_read_table_columns(conn, request, test_frame1):
     # test columns argument in read_table
     conn_name = conn
     if conn_name == "sqlite_buildin":
-        request.node.add_marker(pytest.mark.xfail(reason="Not Implemented"))
+        request.applymarker(pytest.mark.xfail(reason="Not Implemented"))

     conn = request.getfixturevalue(conn)
     sql.to_sql(test_frame1, "test_frame", conn)
@@ -1818,7 +1818,7 @@ def test_read_table_index_col(conn, request, test_frame1):
     # test columns argument in read_table
     conn_name = conn
     if conn_name == "sqlite_buildin":
-        request.node.add_marker(pytest.mark.xfail(reason="Not Implemented"))
+        request.applymarker(pytest.mark.xfail(reason="Not Implemented"))

     conn = request.getfixturevalue(conn)
     sql.to_sql(test_frame1, "test_frame", conn)
@@ -1839,7 +1839,7 @@ def test_read_table_index_col(conn, request, test_frame1):
 @pytest.mark.parametrize("conn", all_connectable_iris)
 def test_read_sql_delegate(conn, request):
     if conn == "sqlite_buildin_iris":
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason="sqlite_buildin connection does not implement read_sql_table"
             )
@@ -1884,7 +1884,7 @@ def test_not_reflect_all_tables(sqlite_conn):
 def test_warning_case_insensitive_table_name(conn, request, test_frame1):
     conn_name = conn
     if conn_name == "sqlite_buildin":
-        request.node.add_marker(pytest.mark.xfail(reason="Does not raise warning"))
+        request.applymarker(pytest.mark.xfail(reason="Does not raise warning"))

     conn = request.getfixturevalue(conn)
     # see gh-7815
@@ -2034,7 +2034,7 @@ def test_column_with_percentage(conn, request):
     # GH 37157
     conn_name = conn
     if conn_name == "sqlite_buildin":
-        request.node.add_marker(pytest.mark.xfail(reason="Not Implemented"))
+        request.applymarker(pytest.mark.xfail(reason="Not Implemented"))

     conn = request.getfixturevalue(conn)
     df = DataFrame({"A": [0, 1, 2], "%_variation": [3, 4, 5]})
@@ -2226,7 +2226,7 @@ def test_sqlalchemy_default_type_conversion(conn, request):
     if conn_name == "sqlite_str":
         pytest.skip("types tables not created in sqlite_str fixture")
     elif "mysql" in conn_name or "sqlite" in conn_name:
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason="boolean dtype not inferred properly")
         )
@@ -2260,7 +2260,7 @@ def test_default_date_load(conn, request):
     if conn_name == "sqlite_str":
         pytest.skip("types tables not created in sqlite_str fixture")
     elif "sqlite" in conn_name:
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason="sqlite does not read date properly")
         )
@@ -2310,7 +2310,7 @@ def check(col):
     conn = request.getfixturevalue(conn)
     df = read_sql_query("select * from types", conn)
     if not hasattr(df, "DateColWithTz"):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason="no column with datetime with time zone")
         )

@@ -2322,7 +2322,7 @@ def check(col):
     df = read_sql_query("select * from types", conn, parse_dates=["DateColWithTz"])
     if not hasattr(df, "DateColWithTz"):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason="no column with datetime with time zone")
         )
     col = df.DateColWithTz
@@ -2689,7 +2689,7 @@ def test_get_schema_create_table(conn, request, test_frame3):
     # TINYINT (which read_sql_table returns as an int and causes a dtype
     # mismatch)
     if conn == "sqlite_str":
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason="test does not support sqlite_str fixture")
         )
@@ -2901,7 +2901,7 @@ def test_to_sql_with_negative_npinf(conn, request, input):

         if Version(pymysql.__version__) < Version("1.0.3") and "infe0" in df.columns:
             mark = pytest.mark.xfail(reason="GH 36465")
-            request.node.add_marker(mark)
+            request.applymarker(mark)

         msg = "inf cannot be used with MySQL"
         with pytest.raises(ValueError, match=msg):
@@ -2953,7 +2953,7 @@ class Temporary(Base):
 @pytest.mark.parametrize("conn", all_connectable)
 def test_invalid_engine(conn, request, test_frame1):
     if conn == "sqlite_buildin":
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason="SQLiteDatabase does not raise for bad engine")
         )
@@ -3078,7 +3078,7 @@ def test_read_sql_dtype_backend_table(
     dtype_backend_expected,
 ):
     if "sqlite" in conn:
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason=(
                     "SQLite actually returns proper boolean values via "
diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py
index df5ca2f27c15d..a87042180df8f 100644
--- a/pandas/tests/reshape/concat/test_append_common.py
+++ b/pandas/tests/reshape/concat/test_append_common.py
@@ -197,11 +197,11 @@ def test_concatlike_dtypes_coercion(self, item, item2, request):
             # index doesn't because bool is object dtype
             exp_series_dtype = typ2
             mark = pytest.mark.xfail(reason="GH#39187 casting to object")
-            request.node.add_marker(mark)
+            request.applymarker(mark)
         elif typ2 == "bool" and typ1 in ("int64", "float64"):
             exp_series_dtype = typ1
             mark = pytest.mark.xfail(reason="GH#39187 casting to object")
-            request.node.add_marker(mark)
+            request.applymarker(mark)
         elif typ1 in {"datetime64[ns, US/Eastern]", "timedelta64[ns]"} or typ2 in {
             "datetime64[ns, US/Eastern]",
             "timedelta64[ns]",
diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py
index b65b34f748260..5eddb4b83f44c 100644
--- a/pandas/tests/scalar/timestamp/test_constructors.py
+++ b/pandas/tests/scalar/timestamp/test_constructors.py
@@ -353,7 +353,7 @@ def test_constructor_positional_keyword_mixed_with_tzinfo(self, kwd, request):
         if kwd != "nanosecond":
             # nanosecond is keyword-only as of 2.0, others are not
             mark = pytest.mark.xfail(reason="GH#45307")
-            request.node.add_marker(mark)
+            request.applymarker(mark)

         kwargs = {kwd: 4}
         ts = Timestamp(2020, 12, 31, tzinfo=timezone.utc, **kwargs)
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index 03fc6cba2902a..faa3978038dd5 100644
--- a/pandas/tests/series/methods/test_astype.py
+++ b/pandas/tests/series/methods/test_astype.py
@@ -170,7 +170,7 @@ def test_astype_generic_timestamp_no_frequency(self, dtype, request):

         if np.dtype(dtype).name not in ["timedelta64", "datetime64"]:
             mark = pytest.mark.xfail(reason="GH#33890 Is assigned ns unit")
-            request.node.add_marker(mark)
+            request.applymarker(mark)

         msg = (
             rf"The '{dtype.__name__}' dtype has no unit\. "
@@ -485,7 +485,7 @@ def test_astype_string_to_extension_dtype_roundtrip(
             mark = pytest.mark.xfail(
                 reason="TODO StringArray.astype() with missing values #GH40566"
             )
-            request.node.add_marker(mark)
+            request.applymarker(mark)
         # GH-40351
         ser = Series(data, dtype=dtype)
diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py
index f8bbd4c25a4c0..426885bf41a1f 100644
--- a/pandas/tests/series/methods/test_interpolate.py
+++ b/pandas/tests/series/methods/test_interpolate.py
@@ -831,7 +831,7 @@ def test_interpolate_timedelta_index(self, request, interp_methods_ind):
         method, kwargs = interp_methods_ind

        if method in {"cubic", "zero"}:
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason=f"{method} interpolation is not supported for TimedeltaIndex"
                 )
diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py
index ae6c62e95f696..6b357281c831b 100644
--- a/pandas/tests/series/methods/test_map.py
+++ b/pandas/tests/series/methods/test_map.py
@@ -204,7 +204,7 @@ def test_map(datetime_series):

 def test_map_empty(request, index):
     if isinstance(index, MultiIndex):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 reason="Initializing a Series from a MultiIndex is not supported"
             )
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
index e9eb906a9cf10..656f6736f03ee 100644
--- a/pandas/tests/series/test_arithmetic.py
+++ b/pandas/tests/series/test_arithmetic.py
@@ -881,7 +881,7 @@ def test_none_comparison(request, series_with_simple_index):
     series = series_with_simple_index

     if len(series) < 1:
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason="Test doesn't make sense on empty data")
         )
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 4f9050be100ca..dc5158cb5813c 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -1362,7 +1362,7 @@ def test_constructor_dict_extension(self, ea_scalar_and_dtype, request):
                 reason="Construction from dict goes through "
                 "maybe_convert_objects which casts to nano"
             )
-            request.node.add_marker(mark)
+            request.applymarker(mark)
         d = {"a": ea_scalar}
         result = Series(d, index=["a"])
         expected = Series(ea_scalar, index=["a"], dtype=ea_dtype)
@@ -1688,7 +1688,7 @@ def test_constructor_generic_timestamp_no_frequency(self, dtype, request):

         if np.dtype(dtype).name not in ["timedelta64", "datetime64"]:
             mark = pytest.mark.xfail(reason="GH#33890 Is assigned ns unit")
-            request.node.add_marker(mark)
+            request.applymarker(mark)

         with pytest.raises(ValueError, match=msg):
             Series([], dtype=dtype)
diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py
index 698c727f1beb8..a75e77a5e2714 100644
--- a/pandas/tests/series/test_ufunc.py
+++ b/pandas/tests/series/test_ufunc.py
@@ -274,7 +274,7 @@ def test_multiply(self, values_for_np_reduce, box_with_array, request):

         if isinstance(values, pd.core.arrays.SparseArray):
             mark = pytest.mark.xfail(reason="SparseArray has no 'prod'")
-            request.node.add_marker(mark)
+            request.applymarker(mark)

         if values.dtype.kind in "iuf":
             result = np.multiply.reduce(obj)
diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py
index c439a5f006922..0d2f220e70c56 100644
--- a/pandas/tests/strings/test_api.py
+++ b/pandas/tests/strings/test_api.py
@@ -92,7 +92,7 @@ def test_api_per_method(

     if reason is not None:
         mark = pytest.mark.xfail(raises=raises, reason=reason)
-        request.node.add_marker(mark)
+        request.applymarker(mark)

     t = box(values, dtype=dtype)  # explicit dtype to avoid casting
     method = getattr(t.str, method_name)
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index b0406dbfa3469..aefaba1aed058 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -1831,7 +1831,7 @@ def test_to_datetime_month_or_year_unit_int(self, cache, unit, item, request):

         # TODO: this should also work
         if isinstance(item, float):
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason=f"{type(item).__name__} in np.array should work"
                 )
diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py
index 1d969e648b752..da8e2fe9abc16 100644
--- a/pandas/tests/tools/test_to_numeric.py
+++ b/pandas/tests/tools/test_to_numeric.py
@@ -396,7 +396,7 @@ def test_period(request, transform_assert_equal):
     inp = transform(idx)

     if not isinstance(inp, Index):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason="Missing PeriodDtype support in to_numeric")
         )
     result = to_numeric(inp)
diff --git a/pandas/tests/tseries/offsets/test_common.py b/pandas/tests/tseries/offsets/test_common.py
index 1b90b94d8a9da..5b80b8b1c4ab4 100644
--- a/pandas/tests/tseries/offsets/test_common.py
+++ b/pandas/tests/tseries/offsets/test_common.py
@@ -142,7 +142,7 @@ def test_apply_out_of_range(request, tz_naive_fixture, _offset):
     if isinstance(tz, tzlocal) and not IS64 and _offset is not DateOffset:
         # If we hit OutOfBoundsDatetime on non-64 bit machines
         # we'll drop out of the try clause before the next test
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038")
         )
     elif (
@@ -150,7 +150,7 @@ def test_apply_out_of_range(request, tz_naive_fixture, _offset):
         and is_platform_windows()
         and _offset in (QuarterEnd, BQuarterBegin, BQuarterEnd)
     ):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason="After GH#49737 t.tzinfo is None on CI")
         )
     assert str(t.tzinfo) == str(result.tzinfo)
diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py
index 5678dd1fb511e..7cefd93851b0e 100644
--- a/pandas/tests/tseries/offsets/test_offsets.py
+++ b/pandas/tests/tseries/offsets/test_offsets.py
@@ -602,7 +602,7 @@ def test_mul(self):
     @pytest.mark.parametrize("kwd", sorted(liboffsets._relativedelta_kwds))
     def test_constructor(self, kwd, request):
         if kwd == "millisecond":
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     raises=NotImplementedError,
                     reason="Constructing DateOffset object with `millisecond` is not "
@@ -916,7 +916,7 @@ def test_month_offset_name(month_classes):
 @pytest.mark.parametrize("kwd", sorted(liboffsets._relativedelta_kwds))
 def test_valid_relativedelta_kwargs(kwd, request):
     if kwd == "millisecond":
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(
                 raises=NotImplementedError,
                 reason="Constructing DateOffset object with `millisecond` is not "
diff --git a/pandas/tests/window/moments/test_moments_consistency_expanding.py b/pandas/tests/window/moments/test_moments_consistency_expanding.py
index dafc60a057c0f..7d2fa1ad5d211 100644
--- a/pandas/tests/window/moments/test_moments_consistency_expanding.py
+++ b/pandas/tests/window/moments/test_moments_consistency_expanding.py
@@ -19,7 +19,7 @@ def test_expanding_apply_consistency_sum_nans(request, all_data, min_periods, f)
     if not no_nans(all_data) and not (
         all_na(all_data) and not all_data.empty and min_periods > 0
     ):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason="np.sum has different behavior with NaNs")
         )
     expanding_f_result = all_data.expanding(min_periods=min_periods).sum()
diff --git a/pandas/tests/window/moments/test_moments_consistency_rolling.py b/pandas/tests/window/moments/test_moments_consistency_rolling.py
index 62bfc66b124f3..be22338c00cb2 100644
--- a/pandas/tests/window/moments/test_moments_consistency_rolling.py
+++ b/pandas/tests/window/moments/test_moments_consistency_rolling.py
@@ -29,7 +29,7 @@ def test_rolling_apply_consistency_sum(
     if not no_nans(all_data) and not (
         all_na(all_data) and not all_data.empty and min_periods > 0
     ):
-        request.node.add_marker(
+        request.applymarker(
             pytest.mark.xfail(reason="np.sum has different behavior with NaNs")
         )
     rolling_f_result = all_data.rolling(
diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py
index 9be0c3edaa998..3292b701c18d7 100644
--- a/pandas/util/_test_decorators.py
+++ b/pandas/util/_test_decorators.py
@@ -229,7 +229,7 @@ def async_mark():

 def mark_array_manager_not_yet_implemented(request) -> None:
     mark = pytest.mark.xfail(reason="Not yet implemented for ArrayManager")
-    request.node.add_marker(mark)
+    request.applymarker(mark)


 skip_array_manager_not_yet_implemented = pytest.mark.xfail(