diff --git a/.github/workflows/asv.yaml b/.github/workflows/asv.yaml index 1d1e83f42..9691cab74 100644 --- a/.github/workflows/asv.yaml +++ b/.github/workflows/asv.yaml @@ -54,7 +54,7 @@ jobs: if: ${{ steps.build.outcome == 'success' }} - name: Publish benchmarks artifact - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: Benchmarks log path: benchmarks/asv_bench/results diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 67a390400..5f8dcc702 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -92,9 +92,10 @@ jobs: - { os: ubuntu-20.04, module: hadoop, python-version: 3.9 } - { os: ubuntu-latest, module: vineyard, python-version: 3.9 } - { os: ubuntu-latest, module: external-storage, python-version: 3.9 } - - { os: ubuntu-latest, module: compatibility, python-version: 3.9 } + # always test compatibility with the latest version + # - { os: ubuntu-latest, module: compatibility, python-version: 3.9 } - { os: ubuntu-latest, module: doc-build, python-version: 3.9 } - - { os: [self-hosted, gpu], module: gpu, python-version: 3.11} + - { os: self-hosted, module: gpu, python-version: 3.11} - { os: ubuntu-latest, module: jax, python-version: 3.9 } # a self-hosted runner which needs computing resources, activate when necessary # - { os: juicefs-ci, module: kubernetes-juicefs, python-version: 3.9 } diff --git a/python/xorbits/_mars/_utils.pyx b/python/xorbits/_mars/_utils.pyx index c885e2792..cc64b115b 100644 --- a/python/xorbits/_mars/_utils.pyx +++ b/python/xorbits/_mars/_utils.pyx @@ -207,7 +207,7 @@ cdef list tokenize_pandas_dataframe(ob): cdef list tokenize_pandas_categorical(ob): - l = ob.to_list() + l = ob.tolist() l.append(ob.shape) return iterative_tokenize(l) diff --git a/python/xorbits/_mars/dataframe/base/accessor.py b/python/xorbits/_mars/dataframe/base/accessor.py index e1490ff0e..b5369b34f 100644 --- a/python/xorbits/_mars/dataframe/base/accessor.py +++ 
b/python/xorbits/_mars/dataframe/base/accessor.py @@ -17,12 +17,7 @@ from typing import Iterable import pandas as pd -from pandas.api.types import ( - is_datetime64_dtype, - is_datetime64tz_dtype, - is_period_dtype, - is_timedelta64_dtype, -) +from pandas.api.types import is_datetime64_dtype, is_timedelta64_dtype from ...utils import adapt_mars_docstring from .datetimes import SeriesDatetimeMethod, _datetime_method_to_handlers @@ -238,9 +233,9 @@ class DatetimeAccessor: def __init__(self, series): if ( not is_datetime64_dtype(series.dtype) - and not is_datetime64tz_dtype(series.dtype) + and not isinstance(series.dtype, pd.DatetimeTZDtype) and not is_timedelta64_dtype(series.dtype) - and not is_period_dtype(series.dtype) + and not isinstance(series.dtype, pd.PeriodDtype) ): raise AttributeError("Can only use .dt accessor with datetimelike values") self._series = series diff --git a/python/xorbits/_mars/dataframe/base/apply.py b/python/xorbits/_mars/dataframe/base/apply.py index 284ea0e68..b638f45e9 100644 --- a/python/xorbits/_mars/dataframe/base/apply.py +++ b/python/xorbits/_mars/dataframe/base/apply.py @@ -107,9 +107,10 @@ def execute(cls, ctx, op): **op.kwds, ) else: - result = input_data.apply( - func, convert_dtype=op.convert_dtype, args=op.args, **op.kwds - ) + if op.convert_dtype: + result = input_data.apply(func, args=op.args, **op.kwds) + else: + result = input_data.apply(func, args=op.args, **op.kwds).astype(object) ctx[out.key] = result @classmethod diff --git a/python/xorbits/_mars/dataframe/base/tests/test_apply_execution.py b/python/xorbits/_mars/dataframe/base/tests/test_apply_execution.py index 6df022df7..b8950f36b 100644 --- a/python/xorbits/_mars/dataframe/base/tests/test_apply_execution.py +++ b/python/xorbits/_mars/dataframe/base/tests/test_apply_execution.py @@ -210,7 +210,7 @@ def subtract_custom_value(x, custom_value): ).execute() assert res.data_params["dtype"] == "object" pd.testing.assert_series_equal( - res.fetch(), s.apply(apply_func, 
args=(5,), convert_dtype=False) + res.fetch(), s.apply(apply_func, args=(5,)).astype(object) ) res = ms.apply( @@ -220,9 +220,7 @@ def subtract_custom_value(x, custom_value): assert res.shape == (4,) with pytest.raises(AttributeError): _ = res.dtypes - pd.testing.assert_series_equal( - res.fetch(), s.apply(apply_func, args=(5,), convert_dtype=True) - ) + pd.testing.assert_series_equal(res.fetch(), s.apply(apply_func, args=(5,))) def test_apply_execution_with_multi_chunks(setup): diff --git a/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py b/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py index 529812ea5..4d1e7128e 100644 --- a/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py +++ b/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py @@ -476,7 +476,7 @@ def test_series_apply_execute(setup): r = series.apply(lambda x: [x, x + 1], convert_dtype=False) result = r.execute().fetch() - expected = s_raw.apply(lambda x: [x, x + 1], convert_dtype=False) + expected = s_raw.apply(lambda x: [x, x + 1]).astype(object) pd.testing.assert_series_equal(result, expected) s_raw2 = pd.Series([np.array([1, 2, 3]), np.array([4, 5, 6])]) @@ -502,7 +502,7 @@ def closure(z): r = series.apply(closure, convert_dtype=False) result = r.execute().fetch() - expected = s_raw.apply(closure, convert_dtype=False) + expected = s_raw.apply(closure).astype(object) pd.testing.assert_series_equal(result, expected) class callable_series: @@ -518,7 +518,7 @@ def __call__(self, z): cs = callable_series() r = series.apply(cs, convert_dtype=False) result = r.execute().fetch() - expected = s_raw.apply(cs, convert_dtype=False) + expected = s_raw.apply(cs).astype(object) pd.testing.assert_series_equal(result, expected) @@ -528,9 +528,9 @@ def test_apply_with_arrow_dtype_execution(setup): df1 = table.to_pandas(types_mapper=pd.ArrowDtype) df = from_pandas_df(df1) - r = df.apply(lambda row: str(row[0]) + row[1], axis=1) + r = df.apply(lambda row: 
str(row.iloc[0]) + row.iloc[1], axis=1) result = r.execute().fetch() - expected = df1.apply(lambda row: str(row[0]) + row[1], axis=1) + expected = df1.apply(lambda row: str(row.iloc[0]) + row.iloc[1], axis=1) pd.testing.assert_series_equal(result, expected) s1 = df1["b"] diff --git a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource.py b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource.py index 7bb9358ae..adb73f4db 100644 --- a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource.py +++ b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource.py @@ -599,7 +599,7 @@ def test_date_range(): with pytest.raises(ValueError): _ = date_range(pd.NaT, periods=10) - expected = pd.date_range("2020-1-1", periods=9.0, name="date") + expected = pd.date_range("2020-1-1", periods=9, name="date") dr = date_range("2020-1-1", periods=9.0, name="date", chunk_size=3) assert isinstance(dr, DatetimeIndex) diff --git a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py index 9292dd39f..3f9e22773 100644 --- a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py +++ b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py @@ -1281,11 +1281,11 @@ def test_date_range_execution(setup): # start, end and freq dr = md.date_range( - "2020-1-1", "2020-1-10", freq="12H", chunk_size=chunk_size, **kw + "2020-1-1", "2020-1-10", freq="12h", chunk_size=chunk_size, **kw ) result = dr.execute().fetch() - expected = pd.date_range("2020-1-1", "2020-1-10", freq="12H", **kw) + expected = pd.date_range("2020-1-1", "2020-1-10", freq="12h", **kw) pd.testing.assert_index_equal(result, expected) # test timezone @@ -1317,15 +1317,15 @@ def test_date_range_execution(setup): pd.testing.assert_index_equal(result, expected) # test freq - dr = md.date_range(start="1/1/2018", periods=5, freq="M", chunk_size=3) + dr = 
md.date_range(start="1/1/2018", periods=5, freq="ME", chunk_size=3) result = dr.execute().fetch() - expected = pd.date_range(start="1/1/2018", periods=5, freq="M") + expected = pd.date_range(start="1/1/2018", periods=5, freq="ME") pd.testing.assert_index_equal(result, expected) - dr = md.date_range(start="2018/01/01", end="2018/07/01", freq="M") + dr = md.date_range(start="2018/01/01", end="2018/07/01", freq="ME") result = dr.execute().fetch() - expected = pd.date_range(start="2018/01/01", end="2018/07/01", freq="M") + expected = pd.date_range(start="2018/01/01", end="2018/07/01", freq="ME") pd.testing.assert_index_equal(result, expected) diff --git a/python/xorbits/_mars/dataframe/indexing/index_lib.py b/python/xorbits/_mars/dataframe/indexing/index_lib.py index c9aa6808c..b0e465963 100644 --- a/python/xorbits/_mars/dataframe/indexing/index_lib.py +++ b/python/xorbits/_mars/dataframe/indexing/index_lib.py @@ -815,7 +815,7 @@ def _create_reorder_chunk( reorder_indexes[-1] ] params["columns_value"] = parse_index(reorder_columns, store_data=True) - params["dtypes"] = concat_chunk.dtypes[reorder_indexes[-1]] + params["dtypes"] = concat_chunk.dtypes.iloc[reorder_indexes[-1]] return reorder_chunk_op.new_chunk([concat_chunk], kws=[params]) diff --git a/python/xorbits/_mars/dataframe/indexing/tests/test_indexing_execution.py b/python/xorbits/_mars/dataframe/indexing/tests/test_indexing_execution.py index c63ba124b..ed57e765c 100644 --- a/python/xorbits/_mars/dataframe/indexing/tests/test_indexing_execution.py +++ b/python/xorbits/_mars/dataframe/indexing/tests/test_indexing_execution.py @@ -1725,6 +1725,7 @@ def test_sample_execution(setup): def test_loc_setitem(setup): raw_df = pd.DataFrame({"a": [1, 2, 3, 4, 2, 4, 5, 7, 2, 8, 9], 1: [10] * 11}) + raw_df = raw_df.astype("object") md_data = md.DataFrame(raw_df, chunk_size=3) md_data.loc[md_data["a"] <= 4, 1] = "v1" pd_data = raw_df.copy(True) diff --git a/python/xorbits/_mars/dataframe/missing/fillna.py 
@staticmethod
def _apply_fillna_with_method(df, value, method, axis, limit, inplace=False):
    """
    Fill missing values without passing ``method=`` to ``fillna``.

    The ``method`` parameter of pandas ``fillna`` is deprecated since
    pandas 2.1.0, so directional fills are dispatched to ``ffill`` /
    ``bfill`` and only plain value fills go through ``fillna``.

    Parameters
    ----------
    df : pandas object exposing ``fillna`` / ``ffill`` / ``bfill``
        Data to fill.
    value : scalar, dict, Series, DataFrame or None
        Fill value; only used when ``method`` is None.
    method : {"backfill", "bfill", "pad", "ffill"} or None
        Direction of the fill. None means "fill with ``value``".
    axis : int, str or None
        Axis along which to fill.
    limit : int or None
        Maximum number of missing entries to fill.
    inplace : bool, default False
        Whether to modify ``df`` in place.

    Returns
    -------
    The filled object. When ``inplace`` is True this is ``df`` itself,
    so callers may always rebind the result.

    Raises
    ------
    ValueError
        If ``method`` is not None and not a recognised fill method.
    """
    if method is None:
        # Forward ``limit`` as well: the pre-refactoring call passed it to
        # ``fillna`` and value fills honour it too.
        result = df.fillna(value=value, axis=axis, limit=limit, inplace=inplace)
    elif method in ("backfill", "bfill"):
        result = df.bfill(axis=axis, limit=limit, inplace=inplace)
    elif method in ("pad", "ffill"):
        result = df.ffill(axis=axis, limit=limit, inplace=inplace)
    else:
        # Do not silently degrade an unknown method into a value fill.
        raise ValueError(
            "Invalid fill method. Expecting pad (ffill) or backfill (bfill). "
            f"Got {method}"
        )
    # pandas in-place operations may return None; hand back the mutated
    # object so ``x = helper(x, ..., inplace=True)`` does not lose the data.
    return df if inplace else result
concat_df, None, method, axis, limit ) else: - concat_df.fillna( - method=method, - axis=axis, - inplace=True, - limit=limit, + concat_df = cls._apply_fillna_with_method( + concat_df, None, method, axis, limit, inplace=True ) ctx[op.outputs[0].key] = cls._get_first_slice(op, concat_df, -1) @@ -180,11 +185,8 @@ def execute(cls, ctx, op): if isinstance(op.value, ENTITY_TYPE): value = ctx[op.value.key] if not isinstance(input_data, pd.Index): - ctx[op.outputs[0].key] = input_data.fillna( - value=value, - method=op.method, - axis=op.axis, - limit=op.limit, + ctx[op.outputs[0].key] = cls._apply_fillna_with_method( + input_data, value, op.method, op.axis, op.limit ) else: ctx[op.outputs[0].key] = input_data.fillna(value=value) diff --git a/python/xorbits/_mars/dataframe/missing/tests/test_missing_execution.py b/python/xorbits/_mars/dataframe/missing/tests/test_missing_execution.py index 72dd97440..ab8bae485 100644 --- a/python/xorbits/_mars/dataframe/missing/tests/test_missing_execution.py +++ b/python/xorbits/_mars/dataframe/missing/tests/test_missing_execution.py @@ -84,11 +84,11 @@ def test_dataframe_fill_na_execution(setup): # test forward fill in axis=0 without limit r = df.fillna(method="pad") - pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(method="pad")) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.ffill()) # test backward fill in axis=0 without limit r = df.fillna(method="backfill") - pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(method="backfill")) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.bfill()) # test forward fill in axis=1 without limit r = df.ffill(axis=1)