Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Raise error when sorting by categorical column in dask-cudf #15788

Merged
merged 9 commits into from
May 20, 2024
19 changes: 19 additions & 0 deletions python/dask_cudf/dask_cudf/expr/_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from dask import config
from dask.dataframe.core import is_dataframe_like
from dask.dataframe.dispatch import is_categorical_dtype

import cudf

Expand Down Expand Up @@ -81,6 +82,24 @@ def from_dict(cls, *args, **kwargs):
with config.set({"dataframe.backend": "cudf"}):
return DXDataFrame.from_dict(*args, **kwargs)

def sort_values(self, by, **kwargs):
    """Sort by ``by``, refusing a categorical leading sort key.

    Parameters
    ----------
    by
        A column label, or a list of labels. When a list is given,
        only its first element is checked for a categorical dtype.
    **kwargs
        Passed through unchanged to the parent ``sort_values``.

    Raises
    ------
    NotImplementedError
        If the dtype of the (first) ``by`` column is categorical.
    """
    # A categorical first column can make the upstream divisions
    # logic produce errors, so fail early with a clear message
    # (See: https://github.com/rapidsai/cudf/issues/11795)
    leading_key = by[0] if isinstance(by, list) else by
    leading_dtype = self.dtypes.get(leading_key, None)

    if not is_categorical_dtype(leading_dtype):
        return super().sort_values(by, **kwargs)

    raise NotImplementedError(
        "Dask-cudf does not support sorting on categorical "
        "columns when query-planning is enabled. Please use "
        "the legacy API for now."
        f"\n{_LEGACY_WORKAROUND}",
    )

def groupby(
self,
by,
Expand Down
13 changes: 12 additions & 1 deletion python/dask_cudf/dask_cudf/tests/test_sort.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
pytest.param(
"d",
marks=xfail_dask_expr(
"Dask-expr fails to sort by categorical column."
"Possible segfault when sorting by categorical column.",
),
),
["a", "b"],
Expand All @@ -47,6 +47,17 @@ def test_sort_values(nelem, nparts, by, ascending):
dd.assert_eq(got, expect, check_index=False)


@pytest.mark.parametrize("by", ["b", ["b", "a"]])
def test_sort_values_categorical_raises(by):
    """Sorting with categorical column "b" as the leading key must raise."""
    # Column "a" holds 9..0; column "b" is the same data as a categorical.
    descending = np.ascontiguousarray(np.arange(10)[::-1])
    df = cudf.DataFrame()
    df["a"] = descending
    df["b"] = df["a"].astype("category")

    ddf = dd.from_pandas(df, npartitions=10)

    expected_error = pytest.raises(
        NotImplementedError, match="sorting on categorical"
    )
    with expected_error:
        ddf.sort_values(by=by)


@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("by", ["a", "b", ["a", "b"]])
def test_sort_values_single_partition(by, ascending):
Expand Down
Loading