From b5a8a503c3cee798bae44b7f20e1a20f854c320f Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Wed, 22 May 2024 13:53:14 +0200 Subject: [PATCH] docs(python): Expand docstrings for `to_numpy` methods (#16394) --- py-polars/polars/dataframe/frame.py | 76 ++++++++++++++++++++++------- py-polars/polars/series/series.py | 54 +++++++++++++++----- 2 files changed, 101 insertions(+), 29 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 3bca63c971ea..b5a268464355 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -1498,13 +1498,23 @@ def to_numpy( structured: bool = False, # noqa: FBT001 *, order: IndexOrder = "fortran", - allow_copy: bool = True, writable: bool = False, + allow_copy: bool = True, use_pyarrow: bool | None = None, ) -> np.ndarray[Any, Any]: """ Convert this DataFrame to a NumPy ndarray. + This operation copies data only when necessary. The conversion is zero copy when + all of the following hold: + + - The DataFrame is fully contiguous in memory, with all Series back-to-back and + all Series consisting of a single chunk. + - The data type is an integer or float. + - The DataFrame contains no null values. + - The `order` parameter is set to `fortran` (default). + - The `writable` parameter is set to `False` (default). + Parameters ---------- structured @@ -1519,13 +1529,13 @@ def to_numpy( However, the C-like order might be more appropriate to use for downstream applications to prevent cloning data, e.g. when reshaping into a one-dimensional array. - allow_copy - Allow memory to be copied to perform the conversion. If set to `False`, - causes conversions that are not zero-copy to fail. writable Ensure the resulting array is writable. This will force a copy of the data if the array was created without copy, as the underlying Arrow data is immutable. + allow_copy + Allow memory to be copied to perform the conversion. 
If set to `False`, + causes conversions that are not zero-copy to fail. use_pyarrow Use `pyarrow.Array.to_numpy @@ -1538,6 +1548,48 @@ def to_numpy( Examples -------- + Numeric data without nulls can be converted without copying data in some cases. + The resulting array will not be writable. + + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> arr = df.to_numpy() + >>> arr + array([[1], + [2], + [3]]) + >>> arr.flags.writeable + False + + Set `writable=True` to force data copy to make the array writable. + + >>> df.to_numpy(writable=True).flags.writeable + True + + If the DataFrame contains different numeric data types, the resulting data type + will be the supertype. This requires data to be copied. Integer types with + nulls are cast to a float type with `nan` representing a null value. + + >>> df = pl.DataFrame({"a": [1, 2, None], "b": [4.0, 5.0, 6.0]}) + >>> df.to_numpy() + array([[ 1., 4.], + [ 2., 5.], + [nan, 6.]]) + + Set `allow_copy=False` to raise an error if data would be copied. + + >>> df.to_numpy(allow_copy=False) # doctest: +SKIP + Traceback (most recent call last): + ... + RuntimeError: copy not allowed: cannot convert to a NumPy array without copying data + + Polars defaults to F-contiguous order. Use `order="c"` to force the resulting + array to be C-contiguous. + + >>> df.to_numpy(order="c").flags.c_contiguous + True + + DataFrames with mixed types will result in an array with an object dtype. + >>> df = pl.DataFrame( ... { ... "foo": [1, 2, 3], @@ -1546,28 +1598,18 @@ def to_numpy( ... }, ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, ... ) - - Export to a standard 2D numpy array. - >>> df.to_numpy() array([[1, 6.5, 'a'], [2, 7.0, 'b'], [3, 8.5, 'c']], dtype=object) - Export to a structured array, which can better-preserve individual - column data, such as name and dtype... + Set `structured=True` to convert to a structured array, which can better + preserve individual column data such as name and data type. 
>>> df.to_numpy(structured=True) array([(1, 6.5, 'a'), (2, 7. , 'b'), (3, 8.5, 'c')], dtype=[('foo', 'u1'), ('bar', '>> import numpy as np - >>> df.to_numpy(structured=True).view(np.recarray) - rec.array([(1, 6.5, 'a'), (2, 7. , 'b'), (3, 8.5, 'c')], - dtype=[('foo', 'u1'), ('bar', '>> s = pl.Series("a", [1, 2, 3]) + Numeric data without nulls can be converted without copying data. + The resulting array will not be writable. + + >>> s = pl.Series([1, 2, 3], dtype=pl.Int8) >>> arr = s.to_numpy() - >>> arr # doctest: +IGNORE_RESULT - array([1, 2, 3], dtype=int64) - >>> type(arr) - - """ + >>> arr + array([1, 2, 3], dtype=int8) + >>> arr.flags.writeable + False + + Set `writable=True` to force data copy to make the array writable. + + >>> s.to_numpy(writable=True).flags.writeable + True + + Integer Series containing nulls will be cast to a float type with `nan` + representing a null value. This requires data to be copied. + + >>> s = pl.Series([1, 2, None], dtype=pl.UInt16) + >>> s.to_numpy() + array([ 1., 2., nan], dtype=float32) + + Set `allow_copy=False` to raise an error if data would be copied. + + >>> s.to_numpy(allow_copy=False) # doctest: +SKIP + Traceback (most recent call last): + ... + RuntimeError: copy not allowed: cannot convert to a NumPy array without copying data + + Series of data type `Array` and `Struct` will result in an array with more than + one dimension. + + >>> s = pl.Series([[1, 2, 3], [4, 5, 6]], dtype=pl.Array(pl.Int64, 3)) + >>> s.to_numpy() + array([[1, 2, 3], + [4, 5, 6]]) + """ # noqa: W505 if zero_copy_only is not None: issue_deprecation_warning( "The `zero_copy_only` parameter for `Series.to_numpy` is deprecated."