From 0f1ae264fc88d49e3be00d776597e102ed4730c0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 8 Nov 2024 08:55:15 -0600 Subject: [PATCH] Wrap custom iterator result (#17251) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: #17165 Fixes: https://github.com/rapidsai/cudf/issues/14481 This PR properly wraps the result of the custom iterator (`custom_iter`) in proxy objects. ```python In [2]: import pandas as pd In [3]: s = pd.Series([10, 1, 2, 3, 4, 5]*1000000) # Without custom_iter: In [4]: %timeit for i in s: True 6.34 s ± 25.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # This PR: In [4]: %timeit for i in s: True 6.16 s ± 17.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # On `branch-24.12`: 1.53 s ± 6.27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) ``` I think `custom_iter` has to exist. Here is why: invoking any sort of iteration on GPU objects raises an error, so in the end we always fall back to CPU. By iterating over the slow (CPU) object directly, we avoid trying to move the object from host to device memory (when the object is in host memory only), i.e. we avoid a needless CPU-to-GPU transfer. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17251 --- .../cudf/source/developer_guide/cudf_pandas.md | 3 ++- python/cudf/cudf/pandas/_wrappers/common.py | 11 ++++++++++- python/cudf/cudf/pandas/fast_slow_proxy.py | 4 +++- .../cudf/cudf_pandas_tests/test_cudf_pandas.py | 18 ++++++++++++++++++ 4 files changed, 33 insertions(+), 3 deletions(-) diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md index 911a64fa152..b653b786129 100644 --- a/docs/cudf/source/developer_guide/cudf_pandas.md +++ b/docs/cudf/source/developer_guide/cudf_pandas.md @@ -11,7 +11,8 @@ In the rest of this document, to maintain a concrete pair of libraries in mind, For example, future support could include pairs such as CuPy (as the "fast" library) and NumPy (as the "slow" library). ```{note} -We currently do not wrap the entire NumPy library because it exposes a C API. But we do wrap NumPy's `numpy.ndarray` and CuPy's `cupy.ndarray` in a proxy type. +1. We currently do not wrap the entire NumPy library because it exposes a C API. But we do wrap NumPy's `numpy.ndarray` and CuPy's `cupy.ndarray` in a proxy type. +2. There is a `custom_iter` method defined to always utilize slow objects `iter` method, that way we don't move the objects to GPU and trigger an error and again move the object to CPU to execute the iteration successfully. ``` ### Types: diff --git a/python/cudf/cudf/pandas/_wrappers/common.py b/python/cudf/cudf/pandas/_wrappers/common.py index 66a51a83896..b801654068e 100644 --- a/python/cudf/cudf/pandas/_wrappers/common.py +++ b/python/cudf/cudf/pandas/_wrappers/common.py @@ -52,4 +52,13 @@ def array_interface(self: _FastSlowProxy): def custom_iter(self: _FastSlowProxy): - return iter(self._fsproxy_slow) + """ + Custom iter method to handle the case where only the slow + object's iter method is used. 
+ """ + # NOTE: Do not remove this method. This is required to avoid + # falling back to GPU for iter method. + return _maybe_wrap_result( + iter(self._fsproxy_slow), + None, # type: ignore + ) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 73afde407db..99c0cb82f41 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -1099,7 +1099,9 @@ def _maybe_wrap_result(result: Any, func: Callable, /, *args, **kwargs) -> Any: """ Wraps "result" in a fast-slow proxy if is a "proxiable" object. """ - if _is_final_type(result): + if isinstance(result, (int, str, float, bool, type(None))): + return result + elif _is_final_type(result): typ = get_final_type_map()[type(result)] return typ._fsproxy_wrap(result, func) elif _is_intermediate_type(result): diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 3e7d1cf3c4c..e260b448219 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1777,3 +1777,21 @@ def test_cudf_pandas_util_version(attrs): assert not hasattr(pd.util, attrs) else: assert hasattr(pd.util, attrs) + + +def test_iteration_over_dataframe_dtypes_produces_proxy_objects(dataframe): + _, xdf = dataframe + xdf["b"] = xpd.IntervalIndex.from_arrays(xdf["a"], xdf["b"]) + xdf["a"] = xpd.Series([1, 1, 1, 2, 3], dtype="category") + dtype_series = xdf.dtypes + assert all(is_proxy_object(x) for x in dtype_series) + assert isinstance(dtype_series.iloc[0], xpd.CategoricalDtype) + assert isinstance(dtype_series.iloc[1], xpd.IntervalDtype) + + +def test_iter_doesnot_raise(monkeypatch): + s = xpd.Series([1, 2, 3]) + with monkeypatch.context() as monkeycontext: + monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") + for _ in s: + pass