From 370818ae884445c34764cac848c1cc1771676ce8 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 18 Jun 2024 15:54:03 +0200
Subject: [PATCH] GH-41978: [Python] Fix pandas tests to follow downstream
 datetime64 unit changes (#41979)

### Rationale for this change

Pandas changed the default unit used in certain cases for the `datetime64[unit]` dtype. This causes some failures in our test suite when tested with pandas 3.0.0.dev

* GitHub Issue: #41978

Authored-by: Joris Van den Bossche
Signed-off-by: Joris Van den Bossche
---
 python/pyarrow/pandas_compat.py                      | 11 ++++++++---
 python/pyarrow/tests/interchange/test_conversion.py  |  6 ++++--
 python/pyarrow/tests/parquet/test_datetime.py        |  1 +
 python/pyarrow/tests/test_pandas.py                  |  1 +
 4 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index e246f1263d20d..c23c64d532b66 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -109,9 +109,10 @@ def get_logical_type_from_numpy(pandas_collection):
     except KeyError:
         if hasattr(pandas_collection.dtype, 'tz'):
             return 'datetimetz'
-        # See https://github.com/pandas-dev/pandas/issues/24739
-        if str(pandas_collection.dtype) == 'datetime64[ns]':
-            return 'datetime64[ns]'
+        # See https://github.com/pandas-dev/pandas/issues/24739 (infer_dtype will
+        # result in "datetime64" without unit, while pandas astype requires a unit)
+        if str(pandas_collection.dtype).startswith('datetime64'):
+            return str(pandas_collection.dtype)
         result = _pandas_api.infer_dtype(pandas_collection)
         if result == 'string':
             return 'unicode'
@@ -1107,6 +1108,10 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
             tz = pa.lib.string_to_tzinfo(
                 column_indexes[0]['metadata']['timezone'])
             level = pd.to_datetime(level, utc=True).tz_convert(tz)
+            if _pandas_api.is_ge_v3():
+                # with pandas 3+, to_datetime returns a unit depending on the string
+                # data, so we restore it to the original unit from the metadata
+                level = level.as_unit(np.datetime_data(dtype)[0])
         # GH-41503: if the column index was decimal, restore to decimal
         elif pandas_dtype == "decimal":
             level = _pandas_api.pd.Index([decimal.Decimal(i) for i in level])
diff --git a/python/pyarrow/tests/interchange/test_conversion.py b/python/pyarrow/tests/interchange/test_conversion.py
index b1e0fa0d1c651..6d91bad57cef4 100644
--- a/python/pyarrow/tests/interchange/test_conversion.py
+++ b/python/pyarrow/tests/interchange/test_conversion.py
@@ -335,8 +335,10 @@ def test_pandas_to_pyarrow_with_missing(np_float):
     np_array = np.array([0, np.nan, 2], dtype=np_float)
     datetime_array = [None, dt(2007, 7, 14), dt(2007, 7, 15)]
     df = pd.DataFrame({
-        "a": np_array,  # float, ColumnNullType.USE_NAN
-        "dt": datetime_array  # ColumnNullType.USE_SENTINEL
+        # float, ColumnNullType.USE_NAN
+        "a": np_array,
+        # ColumnNullType.USE_SENTINEL
+        "dt": np.array(datetime_array, dtype="datetime64[ns]")
     })
     expected = pa.table({
         "a": pa.array(np_array, from_pandas=True),
diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py
index 0896eb37e6473..08fb1098322be 100644
--- a/python/pyarrow/tests/parquet/test_datetime.py
+++ b/python/pyarrow/tests/parquet/test_datetime.py
@@ -331,6 +331,7 @@ def get_table(pq_reader_method, filename, **kwargs):
         pq_reader_method, filename, coerce_int96_timestamp_unit="s"
     )
     df_correct = tab_correct.to_pandas(timestamp_as_object=True)
+    df["a"] = df["a"].astype(object)
     tm.assert_frame_equal(df, df_correct)
 
 
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index ba9d6a3c01391..7d74a60dcb921 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -4754,6 +4754,7 @@ def make_df_with_timestamps():
             np.datetime64('2050-05-03 15:42', 'ns'),
         ],
     })
+    df['dateTimeMs'] = df['dateTimeMs'].astype('object')
     # Not part of what we're testing, just ensuring that the inputs are what we
     # expect.
     assert (df.dateTimeMs.dtype, df.dateTimeNs.dtype) == (
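
Note (illustrative only, not part of the patch): a minimal sketch of the pandas behavior difference the `pandas_compat.py` change works around, assuming pandas >= 2.0 for `DatetimeIndex.as_unit`. The exact unit that pandas 3.0.0.dev infers from string data is version-dependent and is an assumption here.

```python
import numpy as np
import pandas as pd

# Parsing strings: pandas < 3.0 always produced nanosecond precision here;
# pandas 3.0.0.dev may infer a coarser unit (e.g. "s" or "us") from the data.
idx = pd.to_datetime(["2024-06-18 15:54:03"], utc=True)
print(idx.dtype)

# Restore a known unit, mirroring what pandas_compat.py now does with the
# unit recorded in the metadata (np.datetime_data returns (unit, count)).
target_unit = np.datetime_data(np.dtype("datetime64[ns]"))[0]  # "ns"
print(idx.as_unit(target_unit).dtype)  # datetime64[ns, UTC]
```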