From 2296900c9ed14817a31d3eae5aac69e2a3a01e92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Wed, 13 Nov 2024 07:11:02 +0000 Subject: [PATCH 01/18] docs: Fix join API reference links (#19745) --- docs/source/user-guide/transformations/joins.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/user-guide/transformations/joins.md b/docs/source/user-guide/transformations/joins.md index b135a45f53d3..5b55386b70f0 100644 --- a/docs/source/user-guide/transformations/joins.md +++ b/docs/source/user-guide/transformations/joins.md @@ -15,14 +15,14 @@ If you want to learn about joins in general and how to work with them in Polars, === ":fontawesome-brands-python: Python" [:material-api: `join`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join.html) - [:material-api: `join_where`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_asof.html) - [:material-api: `join_asof`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_where.html) + [:material-api: `join_where`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_where.html) + [:material-api: `join_asof`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_asof.html) === ":fontawesome-brands-rust: Rust" [:material-api: `join`](https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join) ([:material-flag-plus: semi_anti_join](/user-guide/installation/#feature-flags "Enable the feature flag semi_anti_join for semi and for anti joins"){.feature-flag} needed for some options.) 
- [:material-api: `join_asof_by`](https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoin.html#method.join_asof) + [:material-api: `join_asof_by`](https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoinBy.html#method.join_asof_by) [:material-flag-plus: Available on feature asof_join](/user-guide/installation/#feature-flags "To use this functionality enable the feature flag asof_join"){.feature-flag} [:material-api: `join_where`](https://docs.rs/polars/latest/polars/prelude/struct.JoinBuilder.html#method.join_where) [:material-flag-plus: Available on feature iejoin](/user-guide/installation/#feature-flags "To use this functionality enable the feature flag iejoin"){.feature-flag} From 861481c6b46d320e3ddc889a2d3b59e05a9964d6 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 13 Nov 2024 08:11:15 +0100 Subject: [PATCH 02/18] docs: Add `meta.is_column` to API docs (#19744) --- py-polars/docs/source/reference/expressions/meta.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/py-polars/docs/source/reference/expressions/meta.rst b/py-polars/docs/source/reference/expressions/meta.rst index e70283c4c9b4..514067e0166f 100644 --- a/py-polars/docs/source/reference/expressions/meta.rst +++ b/py-polars/docs/source/reference/expressions/meta.rst @@ -11,6 +11,7 @@ The following methods are available under the `expr.meta` attribute. 
Expr.meta.eq Expr.meta.has_multiple_outputs + Expr.meta.is_column Expr.meta.is_column_selection Expr.meta.is_regex_projection Expr.meta.ne From 6808bd8e83ff985da17f0b21382c7b9707578d62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Wed, 13 Nov 2024 07:11:42 +0000 Subject: [PATCH 03/18] docs: Fix formatting of nested list (#19746) --- docs/source/user-guide/expressions/index.md | 24 +++++++++++---------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/source/user-guide/expressions/index.md b/docs/source/user-guide/expressions/index.md index 7e4b6f0a8b1a..b4442d6f4289 100644 --- a/docs/source/user-guide/expressions/index.md +++ b/docs/source/user-guide/expressions/index.md @@ -4,19 +4,21 @@ We [introduced the concept of “expressions” in a previous section](../concep In this section we will focus on exploring the types of expressions that Polars offers. Each section gives an overview of what they do and provides additional examples. 
+ - Essentials: - - [Basic operations](basic-operations.md) – how to do basic operations on dataframe columns, like arithmetic calculations, comparisons, and other common, general-purpose operations - - [Expression expansion](expression-expansion.md) – what is expression expansion and how to use it - - [Casting](casting.md) – how to convert / cast values to different data types + - [Basic operations](basic-operations.md) – how to do basic operations on dataframe columns, like arithmetic calculations, comparisons, and other common, general-purpose operations + - [Expression expansion](expression-expansion.md) – what is expression expansion and how to use it + - [Casting](casting.md) – how to convert / cast values to different data types - How to work with specific types of data or data type namespaces: - - [Strings](strings.md) – how to work with strings and the namespace `str` - - [Lists and arrays](lists-and-arrays.md) – the differences between the data types `List` and `Array`, when to use them, and how to use them - - [Categorical data and enums](categorical-data-and-enums.md) – the differences between the data types `Categorical` and `Enum`, when to use them, and how to use them - - [Structs](structs.md) – when to use the data type `Struct` and how to use it - - [Missing data](missing-data.md) – how to work with missing data and how to fill missing data + - [Strings](strings.md) – how to work with strings and the namespace `str` + - [Lists and arrays](lists-and-arrays.md) – the differences between the data types `List` and `Array`, when to use them, and how to use them + - [Categorical data and enums](categorical-data-and-enums.md) – the differences between the data types `Categorical` and `Enum`, when to use them, and how to use them + - [Structs](structs.md) – when to use the data type `Struct` and how to use it + - [Missing data](missing-data.md) – how to work with missing data and how to fill missing data - Types of operations: - - 
[Aggregation](aggregation.md) – how to work with aggregating contexts like `group_by` - - [Window functions](window-functions.md) – how to apply window functions over columns in a dataframe - - [Folds](folds.md) – how to perform arbitrary computations horizontally across columns + - [Aggregation](aggregation.md) – how to work with aggregating contexts like `group_by` + - [Window functions](window-functions.md) – how to apply window functions over columns in a dataframe + - [Folds](folds.md) – how to perform arbitrary computations horizontally across columns - [User-defined Python functions](user-defined-python-functions.md) – how to apply user-defined Python functions to dataframe columns or to column values - [Numpy functions](numpy-functions.md) – how to use NumPy native functions on Polars dataframes and series + From 37ae8e7c67514b4dfd895724aa3a387ceb321851 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Wed, 13 Nov 2024 11:12:35 +0400 Subject: [PATCH 04/18] fix(python): Address incorrect `selector & col` expansion (#19742) --- py-polars/polars/selectors.py | 30 ++++++++++---------------- py-polars/tests/unit/test_selectors.py | 10 +++++++-- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index 4cb11506b3f6..9d3cedb47e85 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -385,10 +385,6 @@ def __and__(self, other: Any) -> Expr: ... 
def __and__(self, other: Any) -> SelectorType | Expr: if is_column(other): colname = other.meta.output_name() - if self._attrs["name"] == "by_name" and ( - params := self._attrs["params"] - ).get("require_all", True): - return by_name(*params["*names"], colname) other = by_name(colname) if is_selector(other): return _selector_proxy_( @@ -399,6 +395,12 @@ def __and__(self, other: Any) -> SelectorType | Expr: else: return self.as_expr().__and__(other) + def __rand__(self, other: Any) -> Expr: + if is_column(other): + colname = other.meta.output_name() + return by_name(colname) & self + return self.as_expr().__rand__(other) + @overload def __or__(self, other: SelectorType) -> SelectorType: ... @@ -417,6 +419,11 @@ def __or__(self, other: Any) -> SelectorType | Expr: else: return self.as_expr().__or__(other) + def __ror__(self, other: Any) -> Expr: + if is_column(other): + other = by_name(other.meta.output_name()) + return self.as_expr().__ror__(other) + @overload def __xor__(self, other: SelectorType) -> SelectorType: ... 
@@ -435,21 +442,6 @@ def __xor__(self, other: Any) -> SelectorType | Expr: else: return self.as_expr().__or__(other) - def __rand__(self, other: Any) -> Expr: - if is_column(other): - colname = other.meta.output_name() - if self._attrs["name"] == "by_name" and ( - params := self._attrs["params"] - ).get("require_all", True): - return by_name(colname, *params["*names"]) - other = by_name(colname) - return self.as_expr().__rand__(other) - - def __ror__(self, other: Any) -> Expr: - if is_column(other): - other = by_name(other.meta.output_name()) - return self.as_expr().__ror__(other) - def __rxor__(self, other: Any) -> Expr: if is_column(other): other = by_name(other.meta.output_name()) diff --git a/py-polars/tests/unit/test_selectors.py b/py-polars/tests/unit/test_selectors.py index dd2c415c9a13..f4e29e9194c6 100644 --- a/py-polars/tests/unit/test_selectors.py +++ b/py-polars/tests/unit/test_selectors.py @@ -182,11 +182,17 @@ def test_selector_by_name(df: pl.DataFrame) -> None: # check "by_name & col" for selector_expr, expected in ( - (cs.by_name("abc", "cde") & pl.col("ghi"), ["abc", "cde", "ghi"]), - (pl.col("ghi") & cs.by_name("cde", "abc"), ["ghi", "cde", "abc"]), + (cs.by_name("abc", "cde") & pl.col("ghi"), []), + (cs.by_name("abc", "cde") & pl.col("cde"), ["cde"]), + (pl.col("cde") & cs.by_name("cde", "abc"), ["cde"]), ): assert df.select(selector_expr).columns == expected + # check "by_name & by_name" + assert df.select( + cs.by_name("abc", "cde", "def", "eee") & cs.by_name("cde", "eee", "fgg") + ).columns == ["cde", "eee"] + # expected errors with pytest.raises(ColumnNotFoundError, match="xxx"): df.select(cs.by_name("xxx", "fgg", "!!!")) From 87367e978ec62163d345aaca0b8c0b6bbbdae380 Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Wed, 13 Nov 2024 18:48:24 +1100 Subject: [PATCH 05/18] fix: Fix incorrect lazy schema for aggregations (#19753) --- .github/workflows/test-coverage.yml | 8 +- crates/polars-plan/src/plans/aexpr/schema.rs | 223 
+++++++++++-------- py-polars/tests/unit/test_schema.py | 32 +++ 3 files changed, 169 insertions(+), 94 deletions(-) diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index 232f79fb8947..531add1428e6 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -96,9 +96,13 @@ jobs: with: python-version: '3.12' - - name: Create virtual environment + - name: Install uv run: | curl -LsSf https://astral.sh/uv/install.sh | sh + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + + - name: Create virtual environment + run: | uv venv echo "$GITHUB_WORKSPACE/.venv/bin" >> $GITHUB_PATH echo "VIRTUAL_ENV=$GITHUB_WORKSPACE/.venv" >> $GITHUB_ENV @@ -165,7 +169,7 @@ jobs: runs-on: ubuntu-latest steps: - # Needed to fetch the Codecov config file + # Needed to fetch the Codecov config file - uses: actions/checkout@v4 - name: Download coverage reports diff --git a/crates/polars-plan/src/plans/aexpr/schema.rs b/crates/polars-plan/src/plans/aexpr/schema.rs index 7105855636c5..6c1b675b2bd8 100644 --- a/crates/polars-plan/src/plans/aexpr/schema.rs +++ b/crates/polars-plan/src/plans/aexpr/schema.rs @@ -32,50 +32,57 @@ impl AExpr { ctx: Context, arena: &Arena, ) -> PolarsResult { - // During aggregation a column that isn't aggregated gets an extra nesting level - // col(foo: i64) -> list[i64] - // But not if we do an aggregation: - // col(foo: i64).sum() -> i64 - // The `nested` keeps track of the nesting we need to add. - let mut nested = matches!(ctx, Context::Aggregation) as u8; - let mut field = self.to_field_impl(schema, ctx, arena, &mut nested)?; + // Indicates whether we should auto-implode the result. This is initialized to true if we are + // in an aggregation context, so functions that return scalars should explicitly set this + // to false in `to_field_impl`. 
+ let mut agg_list = matches!(ctx, Context::Aggregation); + let mut field = self.to_field_impl(schema, ctx, arena, &mut agg_list)?; - if nested >= 1 { + if agg_list { field.coerce(field.dtype().clone().implode()); } + Ok(field) } /// Get Field result of the expression. The schema is the input data. + /// + /// This is taken as `&mut bool` as for some expressions this is determined by the upper node + /// (e.g. `alias`, `cast`). #[recursive] pub fn to_field_impl( &self, schema: &Schema, ctx: Context, arena: &Arena, - nested: &mut u8, + agg_list: &mut bool, ) -> PolarsResult { use AExpr::*; use DataType::*; match self { Len => { - *nested = 0; + *agg_list = false; Ok(Field::new(PlSmallStr::from_static(LEN), IDX_DTYPE)) }, Window { function, options, .. } => { - if let WindowType::Over(mapping) = options { - *nested += matches!(mapping, WindowMapping::Join) as u8; + if let WindowType::Over(WindowMapping::Join) = options { + // expr.over(..), defaults to agg-list unless explicitly unset + // by the `to_field_impl` of the `expr` + *agg_list = true; } + let e = arena.get(*function); - e.to_field_impl(schema, ctx, arena, nested) + e.to_field_impl(schema, ctx, arena, agg_list) }, Explode(expr) => { // `Explode` is a "flatten" operation, which is not the same as returning a scalar. // Namely, it should be auto-imploded in the aggregation context, so we don't update - // the `nested` state here. - let field = arena.get(*expr).to_field_impl(schema, ctx, arena, &mut 0)?; + // the `agg_list` state here. + let field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; if let List(inner) = field.dtype() { Ok(Field::new(field.name().clone(), *inner.clone())) @@ -87,14 +94,14 @@ impl AExpr { name.clone(), arena .get(*expr) - .to_field_impl(schema, ctx, arena, nested)? + .to_field_impl(schema, ctx, arena, agg_list)? 
.dtype, )), Column(name) => schema .get_field(name) .ok_or_else(|| PolarsError::ColumnNotFound(name.to_string().into())), Literal(sv) => { - *nested = 0; + *agg_list = false; Ok(match sv { LiteralValue::Series(s) => s.field().into_owned(), _ => Field::new(sv.output_name().clone(), sv.get_datatype()), @@ -116,35 +123,42 @@ impl AExpr { | Operator::LogicalOr => { let out_field; let out_name = { - out_field = - arena.get(*left).to_field_impl(schema, ctx, arena, nested)?; + out_field = arena + .get(*left) + .to_field_impl(schema, ctx, arena, agg_list)?; out_field.name() }; Field::new(out_name.clone(), Boolean) }, Operator::TrueDivide => { - return get_truediv_field(*left, *right, arena, ctx, schema, nested) + return get_truediv_field(*left, *right, arena, ctx, schema, agg_list) }, _ => { - return get_arithmetic_field(*left, *right, arena, *op, ctx, schema, nested) + return get_arithmetic_field( + *left, *right, arena, *op, ctx, schema, agg_list, + ) }, }; Ok(field) }, - Sort { expr, .. } => arena.get(*expr).to_field_impl(schema, ctx, arena, nested), + Sort { expr, .. } => arena.get(*expr).to_field_impl(schema, ctx, arena, agg_list), Gather { expr, returns_scalar, .. } => { if *returns_scalar { - *nested = nested.saturating_sub(1); + *agg_list = false; } - arena.get(*expr).to_field_impl(schema, ctx, arena, nested) + arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false) }, - SortBy { expr, .. } => arena.get(*expr).to_field_impl(schema, ctx, arena, nested), - Filter { input, .. } => arena.get(*input).to_field_impl(schema, ctx, arena, nested), + SortBy { expr, .. } => arena.get(*expr).to_field_impl(schema, ctx, arena, agg_list), + Filter { input, .. } => arena + .get(*input) + .to_field_impl(schema, ctx, arena, agg_list), Agg(agg) => { use IRAggExpr::*; match agg { @@ -152,13 +166,16 @@ impl AExpr { | Min { input: expr, .. 
} | First(expr) | Last(expr) => { - *nested = nested.saturating_sub(1); - arena.get(*expr).to_field_impl(schema, ctx, arena, nested) + *agg_list = false; + arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false) }, Sum(expr) => { - *nested = nested.saturating_sub(1); - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; let dt = match field.dtype() { Boolean => Some(IDX_DTYPE), UInt8 | Int8 | Int16 | UInt16 => Some(Int64), @@ -170,9 +187,10 @@ impl AExpr { Ok(field) }, Median(expr) => { - *nested = nested.saturating_sub(1); - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; match field.dtype { Date => field.coerce(Datetime(TimeUnit::Milliseconds, None)), _ => float_type(&mut field), @@ -180,9 +198,10 @@ impl AExpr { Ok(field) }, Mean(expr) => { - *nested = nested.saturating_sub(1); - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; match field.dtype { Date => field.coerce(Datetime(TimeUnit::Milliseconds, None)), _ => float_type(&mut field), @@ -190,69 +209,80 @@ impl AExpr { Ok(field) }, Implode(expr) => { - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; field.coerce(DataType::List(field.dtype().clone().into())); Ok(field) }, Std(expr, _) => { - *nested = nested.saturating_sub(1); - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; float_type(&mut field); Ok(field) }, Var(expr, _) => { - 
*nested = nested.saturating_sub(1); - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; float_type(&mut field); Ok(field) }, NUnique(expr) => { - *nested = 0; - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; field.coerce(IDX_DTYPE); Ok(field) }, Count(expr, _) => { - *nested = 0; - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; field.coerce(IDX_DTYPE); Ok(field) }, AggGroups(expr) => { - *nested = 1; - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = true; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; field.coerce(List(IDX_DTYPE.into())); Ok(field) }, Quantile { expr, .. } => { - *nested = nested.saturating_sub(1); - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; float_type(&mut field); Ok(field) }, #[cfg(feature = "bitwise")] Bitwise(expr, _) => { - *nested = nested.saturating_sub(1); - let field = arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; // @Q? Do we need to coerce here? Ok(field) }, } }, Cast { expr, dtype, .. } => { - let field = arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + let field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, agg_list)?; Ok(Field::new(field.name().clone(), dtype.clone())) }, Ternary { truthy, falsy, .. 
} => { - let mut nested_truthy = *nested; - let mut nested_falsy = *nested; + let mut agg_list_truthy = *agg_list; + let mut agg_list_falsy = *agg_list; // During aggregation: // left: col(foo): list nesting: 1 @@ -261,11 +291,11 @@ impl AExpr { let mut truthy = arena .get(*truthy) - .to_field_impl(schema, ctx, arena, &mut nested_truthy)?; + .to_field_impl(schema, ctx, arena, &mut agg_list_truthy)?; let falsy = arena .get(*falsy) - .to_field_impl(schema, ctx, arena, &mut nested_falsy)?; + .to_field_impl(schema, ctx, arena, &mut agg_list_falsy)?; let st = if let DataType::Null = *truthy.dtype() { falsy.dtype().clone() @@ -273,7 +303,7 @@ impl AExpr { try_get_supertype(truthy.dtype(), falsy.dtype())? }; - *nested = std::cmp::max(nested_truthy, nested_falsy); + *agg_list = agg_list_truthy | agg_list_falsy; truthy.coerce(st); Ok(truthy) @@ -284,14 +314,14 @@ impl AExpr { options, .. } => { - let fields = func_args_to_fields(input, ctx, schema, arena, nested)?; + let fields = func_args_to_fields(input, ctx, schema, arena, agg_list)?; polars_ensure!(!fields.is_empty(), ComputeError: "expression: '{}' didn't get any inputs", options.fmt_str); let out = output_type.get_field(schema, ctx, &fields)?; if options.flags.contains(FunctionFlags::RETURNS_SCALAR) { - *nested = 0; + *agg_list = false; } else if matches!(ctx, Context::Aggregation) { - *nested += 1; + *agg_list = true; } Ok(out) @@ -301,19 +331,21 @@ impl AExpr { input, options, } => { - let fields = func_args_to_fields(input, ctx, schema, arena, nested)?; + let fields = func_args_to_fields(input, ctx, schema, arena, agg_list)?; polars_ensure!(!fields.is_empty(), ComputeError: "expression: '{}' didn't get any inputs", function); let out = function.get_field(schema, ctx, &fields)?; if options.flags.contains(FunctionFlags::RETURNS_SCALAR) { - *nested = 0; + *agg_list = false; } else if matches!(ctx, Context::Aggregation) { - *nested += 1; + *agg_list = true; } Ok(out) }, - Slice { input, .. 
} => arena.get(*input).to_field_impl(schema, ctx, arena, nested), + Slice { input, .. } => arena + .get(*input) + .to_field_impl(schema, ctx, arena, agg_list), } } } @@ -323,25 +355,28 @@ fn func_args_to_fields( ctx: Context, schema: &Schema, arena: &Arena, - nested: &mut u8, + agg_list: &mut bool, ) -> PolarsResult> { - let mut first = true; input .iter() + .enumerate() // Default context because `col()` would return a list in aggregation context - .map(|e| { - // Only mutate first nested as that is the dtype of the function. - let mut nested_tmp = *nested; - let nested = if first { - first = false; - &mut *nested - } else { - &mut nested_tmp - }; + .map(|(i, e)| { + let tmp = &mut false; arena .get(e.node()) - .to_field_impl(schema, ctx, arena, nested) + .to_field_impl( + schema, + ctx, + arena, + if i == 0 { + // Only mutate first agg_list as that is the dtype of the function. + agg_list + } else { + tmp + }, + ) .map(|mut field| { field.name = e.output_name().clone(); field @@ -357,7 +392,7 @@ fn get_arithmetic_field( op: Operator, ctx: Context, schema: &Schema, - nested: &mut u8, + agg_list: &mut bool, ) -> PolarsResult { use DataType::*; let left_ae = arena.get(left); @@ -371,11 +406,11 @@ fn get_arithmetic_field( // leading to quadratic behavior. # 4736 // // further right_type is only determined when needed. 
- let mut left_field = left_ae.to_field_impl(schema, ctx, arena, nested)?; + let mut left_field = left_ae.to_field_impl(schema, ctx, arena, agg_list)?; let super_type = match op { Operator::Minus => { - let right_type = right_ae.to_field_impl(schema, ctx, arena, nested)?.dtype; + let right_type = right_ae.to_field_impl(schema, ctx, arena, agg_list)?.dtype; match (&left_field.dtype, &right_type) { #[cfg(feature = "dtype-struct")] (Struct(_), Struct(_)) => { @@ -430,7 +465,7 @@ fn get_arithmetic_field( } }, Operator::Plus => { - let right_type = right_ae.to_field_impl(schema, ctx, arena, nested)?.dtype; + let right_type = right_ae.to_field_impl(schema, ctx, arena, agg_list)?.dtype; match (&left_field.dtype, &right_type) { (Duration(_), Datetime(_, _)) | (Datetime(_, _), Duration(_)) @@ -472,7 +507,7 @@ fn get_arithmetic_field( } }, _ => { - let right_type = right_ae.to_field_impl(schema, ctx, arena, nested)?.dtype; + let right_type = right_ae.to_field_impl(schema, ctx, arena, agg_list)?.dtype; match (&left_field.dtype, &right_type) { #[cfg(feature = "dtype-struct")] @@ -558,10 +593,14 @@ fn get_truediv_field( arena: &Arena, ctx: Context, schema: &Schema, - nested: &mut u8, + agg_list: &mut bool, ) -> PolarsResult { - let mut left_field = arena.get(left).to_field_impl(schema, ctx, arena, nested)?; - let right_field = arena.get(right).to_field_impl(schema, ctx, arena, nested)?; + let mut left_field = arena + .get(left) + .to_field_impl(schema, ctx, arena, agg_list)?; + let right_field = arena + .get(right) + .to_field_impl(schema, ctx, arena, agg_list)?; use DataType::*; // TODO: Re-investigate this. 
A lot of "_" is being used on the RHS match because this code diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py index 78a277a3662f..a8f9e43d84c0 100644 --- a/py-polars/tests/unit/test_schema.py +++ b/py-polars/tests/unit/test_schema.py @@ -246,3 +246,35 @@ def test_lf_agg_lit_explode() -> None: schema = {"k": pl.Int64, "o": pl.List(pl.Int64)} assert q.collect_schema() == schema assert_frame_equal(q.collect(), pl.DataFrame({"k": 1, "o": [[1]]}, schema=schema)) # type: ignore[arg-type] + + +@pytest.mark.parametrize("expr_op", [ + "approx_n_unique", "arg_max", "arg_min", "bitwise_and", "bitwise_or", + "bitwise_xor", "count", "entropy", "first", "has_nulls", "implode", "kurtosis", + "last", "len", "lower_bound", "max", "mean", "median", "min", "n_unique", "nan_max", + "nan_min", "null_count", "product", "sample", "skew", "std", "sum", "upper_bound", + "var" +]) # fmt: skip +def test_lf_agg_auto_agg_list_19752(expr_op: str) -> None: + op = getattr(pl.Expr, expr_op) + + lf = pl.LazyFrame({"a": 1, "b": 1}) + + q = lf.group_by("a").agg(pl.col("b").reverse().pipe(op)) + assert q.collect_schema() == q.collect().collect_schema() + + q = lf.group_by("a").agg(pl.col("b").shuffle().reverse().pipe(op)) + + assert q.collect_schema() == q.collect().collect_schema() + + +@pytest.mark.parametrize( + "expr", [pl.col("b"), pl.col("b").sum(), pl.col("b").reverse()] +) +@pytest.mark.parametrize("mapping_strategy", ["explode", "join", "group_to_rows"]) +def test_lf_window_schema(expr: pl.Expr, mapping_strategy: str) -> None: + q = pl.LazyFrame({"a": 1, "b": 1}).select( + expr.over("a", mapping_strategy=mapping_strategy) # type: ignore[arg-type] + ) + + assert q.collect_schema() == q.collect().collect_schema() From 18786acd8d1eb68fc87982b07ce29ecbae0923f0 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 13 Nov 2024 03:01:31 -0500 Subject: [PATCH 06/18] fix(python): Release GIL in Python APIs, part 1 (#19705) Co-authored-by: Itamar 
Turner-Trauring --- .../polars-python/src/series/aggregation.rs | 118 +++++----- crates/polars-python/src/series/arithmetic.rs | 33 +-- crates/polars-python/src/series/buffers.rs | 9 +- crates/polars-python/src/series/comparison.rs | 79 ++++--- .../polars-python/src/series/construction.rs | 18 +- crates/polars-python/src/series/export.rs | 14 +- crates/polars-python/src/series/general.rs | 205 +++++++++++------- crates/polars-python/src/series/scatter.rs | 5 +- 8 files changed, 267 insertions(+), 214 deletions(-) diff --git a/crates/polars-python/src/series/aggregation.rs b/crates/polars-python/src/series/aggregation.rs index 5aa8ee16639e..c4fe8d3447ec 100644 --- a/crates/polars-python/src/series/aggregation.rs +++ b/crates/polars-python/src/series/aggregation.rs @@ -8,37 +8,39 @@ use crate::error::PyPolarsErr; #[pymethods] impl PySeries { - fn any(&self, ignore_nulls: bool) -> PyResult> { - let s = self.series.bool().map_err(PyPolarsErr::from)?; - Ok(if ignore_nulls { - Some(s.any()) - } else { - s.any_kleene() + fn any(&self, py: Python, ignore_nulls: bool) -> PyResult> { + py.allow_threads(|| { + let s = self.series.bool().map_err(PyPolarsErr::from)?; + Ok(if ignore_nulls { + Some(s.any()) + } else { + s.any_kleene() + }) }) } - fn all(&self, ignore_nulls: bool) -> PyResult> { - let s = self.series.bool().map_err(PyPolarsErr::from)?; - Ok(if ignore_nulls { - Some(s.all()) - } else { - s.all_kleene() + fn all(&self, py: Python, ignore_nulls: bool) -> PyResult> { + py.allow_threads(|| { + let s = self.series.bool().map_err(PyPolarsErr::from)?; + Ok(if ignore_nulls { + Some(s.all()) + } else { + s.all_kleene() + }) }) } - fn arg_max(&self) -> Option { - self.series.arg_max() + fn arg_max(&self, py: Python) -> Option { + py.allow_threads(|| self.series.arg_max()) } - fn arg_min(&self) -> Option { - self.series.arg_min() + fn arg_min(&self, py: Python) -> Option { + py.allow_threads(|| self.series.arg_min()) } fn max(&self, py: Python) -> PyResult { Ok(Wrap( - 
self.series - .max_reduce() - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.max_reduce().map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) @@ -47,49 +49,42 @@ impl PySeries { fn mean(&self, py: Python) -> PyResult { match self.series.dtype() { Boolean => Ok(Wrap( - self.series - .cast(&DataType::UInt8) - .unwrap() - .mean_reduce() + py.allow_threads(|| self.series.cast(&DataType::UInt8).unwrap().mean_reduce()) .as_any_value(), ) .into_py(py)), // For non-numeric output types we require mean_reduce. - dt if dt.is_temporal() => { - Ok(Wrap(self.series.mean_reduce().as_any_value()).into_py(py)) - }, - _ => Ok(self.series.mean().into_py(py)), + dt if dt.is_temporal() => Ok(Wrap( + py.allow_threads(|| self.series.mean_reduce()) + .as_any_value(), + ) + .into_py(py)), + _ => Ok(py.allow_threads(|| self.series.mean()).into_py(py)), } } fn median(&self, py: Python) -> PyResult { match self.series.dtype() { Boolean => Ok(Wrap( - self.series - .cast(&DataType::UInt8) - .unwrap() - .median_reduce() + py.allow_threads(|| self.series.cast(&DataType::UInt8).unwrap().median_reduce()) .map_err(PyPolarsErr::from)? .as_any_value(), ) .into_py(py)), // For non-numeric output types we require median_reduce. dt if dt.is_temporal() => Ok(Wrap( - self.series - .median_reduce() + py.allow_threads(|| self.series.median_reduce()) .map_err(PyPolarsErr::from)? .as_any_value(), ) .into_py(py)), - _ => Ok(self.series.median().into_py(py)), + _ => Ok(py.allow_threads(|| self.series.median()).into_py(py)), } } fn min(&self, py: Python) -> PyResult { Ok(Wrap( - self.series - .min_reduce() - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.min_reduce().map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) @@ -97,26 +92,27 @@ impl PySeries { fn product(&self, py: Python) -> PyResult { Ok(Wrap( - self.series - .product() - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.product().map_err(PyPolarsErr::from))? 
.as_any_value(), ) .into_py(py)) } - fn quantile(&self, quantile: f64, interpolation: Wrap) -> PyResult { - let bind = self.series.quantile_reduce(quantile, interpolation.0); + fn quantile( + &self, + py: Python, + quantile: f64, + interpolation: Wrap, + ) -> PyResult { + let bind = py.allow_threads(|| self.series.quantile_reduce(quantile, interpolation.0)); let sc = bind.map_err(PyPolarsErr::from)?; - Ok(Python::with_gil(|py| Wrap(sc.as_any_value()).into_py(py))) + Ok(Wrap(sc.as_any_value()).into_py(py)) } fn std(&self, py: Python, ddof: u8) -> PyResult { Ok(Wrap( - self.series - .std_reduce(ddof) - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.std_reduce(ddof).map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) @@ -124,9 +120,7 @@ impl PySeries { fn var(&self, py: Python, ddof: u8) -> PyResult { Ok(Wrap( - self.series - .var_reduce(ddof) - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.var_reduce(ddof).map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) @@ -134,37 +128,31 @@ impl PySeries { fn sum(&self, py: Python) -> PyResult { Ok(Wrap( - self.series - .sum_reduce() - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.sum_reduce().map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) } fn first(&self, py: Python) -> PyObject { - Wrap(self.series.first().as_any_value()).into_py(py) + Wrap(py.allow_threads(|| self.series.first()).as_any_value()).into_py(py) } fn last(&self, py: Python) -> PyObject { - Wrap(self.series.last().as_any_value()).into_py(py) + Wrap(py.allow_threads(|| self.series.last()).as_any_value()).into_py(py) } #[cfg(feature = "approx_unique")] fn approx_n_unique(&self, py: Python) -> PyResult { - Ok(self - .series - .approx_n_unique() - .map_err(PyPolarsErr::from)? + Ok(py + .allow_threads(|| self.series.approx_n_unique().map_err(PyPolarsErr::from))? 
.into_py(py)) } #[cfg(feature = "bitwise")] fn bitwise_and(&self, py: Python) -> PyResult { Ok(Wrap( - self.series - .and_reduce() - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.and_reduce().map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) @@ -173,9 +161,7 @@ impl PySeries { #[cfg(feature = "bitwise")] fn bitwise_or(&self, py: Python) -> PyResult { Ok(Wrap( - self.series - .or_reduce() - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.or_reduce().map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) @@ -184,9 +170,7 @@ impl PySeries { #[cfg(feature = "bitwise")] fn bitwise_xor(&self, py: Python) -> PyResult { Ok(Wrap( - self.series - .xor_reduce() - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.xor_reduce().map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) diff --git a/crates/polars-python/src/series/arithmetic.rs b/crates/polars-python/src/series/arithmetic.rs index c5483aced1e7..62edd00a7656 100644 --- a/crates/polars-python/src/series/arithmetic.rs +++ b/crates/polars-python/src/series/arithmetic.rs @@ -6,28 +6,33 @@ use crate::error::PyPolarsErr; #[pymethods] impl PySeries { - fn add(&self, other: &PySeries) -> PyResult { - Ok((&self.series + &other.series) + fn add(&self, py: Python, other: &PySeries) -> PyResult { + Ok(py + .allow_threads(|| &self.series + &other.series) .map(Into::into) .map_err(PyPolarsErr::from)?) } - fn sub(&self, other: &PySeries) -> PyResult { - Ok((&self.series - &other.series) + fn sub(&self, py: Python, other: &PySeries) -> PyResult { + Ok(py + .allow_threads(|| &self.series - &other.series) .map(Into::into) .map_err(PyPolarsErr::from)?) } - fn div(&self, other: &PySeries) -> PyResult { - Ok((&self.series / &other.series) + fn div(&self, py: Python, other: &PySeries) -> PyResult { + Ok(py + .allow_threads(|| &self.series / &other.series) .map(Into::into) .map_err(PyPolarsErr::from)?) 
} - fn mul(&self, other: &PySeries) -> PyResult { - Ok((&self.series * &other.series) + fn mul(&self, py: Python, other: &PySeries) -> PyResult { + Ok(py + .allow_threads(|| &self.series * &other.series) .map(Into::into) .map_err(PyPolarsErr::from)?) } - fn rem(&self, other: &PySeries) -> PyResult { - Ok((&self.series % &other.series) + fn rem(&self, py: Python, other: &PySeries) -> PyResult { + Ok(py + .allow_threads(|| &self.series % &other.series) .map(Into::into) .map_err(PyPolarsErr::from)?) } @@ -37,8 +42,8 @@ macro_rules! impl_arithmetic { ($name:ident, $type:ty, $operand:tt) => { #[pymethods] impl PySeries { - fn $name(&self, other: $type) -> PyResult { - Ok((&self.series $operand other).into()) + fn $name(&self, py: Python, other: $type) -> PyResult { + Ok(py.allow_threads(|| {&self.series $operand other}).into()) } } }; @@ -103,8 +108,8 @@ macro_rules! impl_rhs_arithmetic { ($name:ident, $type:ty, $operand:ident) => { #[pymethods] impl PySeries { - fn $name(&self, other: $type) -> PyResult { - Ok(other.$operand(&self.series).into()) + fn $name(&self, py: Python, other: $type) -> PyResult { + Ok(py.allow_threads(|| other.$operand(&self.series)).into()) } } }; diff --git a/crates/polars-python/src/series/buffers.rs b/crates/polars-python/src/series/buffers.rs index 939159220277..e3b9402d4d47 100644 --- a/crates/polars-python/src/series/buffers.rs +++ b/crates/polars-python/src/series/buffers.rs @@ -82,9 +82,9 @@ impl PySeries { } /// Return the underlying values, validity, and offsets buffers as Series. 
- fn _get_buffers(&self) -> PyResult<(Self, Option, Option)> { + fn _get_buffers(&self, py: Python) -> PyResult<(Self, Option, Option)> { let s = &self.series; - match s.dtype().to_physical() { + py.allow_threads(|| match s.dtype().to_physical() { dt if dt.is_numeric() => get_buffers_from_primitive(s), DataType::Boolean => get_buffers_from_primitive(s), DataType::String => get_buffers_from_string(s), @@ -92,7 +92,7 @@ impl PySeries { let msg = format!("`_get_buffers` not implemented for `dtype` {dt}"); Err(PyTypeError::new_err(msg)) }, - } + }) } } @@ -253,6 +253,7 @@ impl PySeries { #[staticmethod] #[pyo3(signature = (dtype, data, validity=None))] unsafe fn _from_buffers( + py: Python, dtype: Wrap, data: Vec, validity: Option, @@ -320,7 +321,7 @@ impl PySeries { )), }; let values = series_to_buffer::(values); - from_buffers_string_impl(values, validity, offsets)? + py.allow_threads(|| from_buffers_string_impl(values, validity, offsets))? }, dt => { let msg = format!("`_from_buffers` not implemented for `dtype` {dt}"); diff --git a/crates/polars-python/src/series/comparison.rs b/crates/polars-python/src/series/comparison.rs index 7064edb7698a..2b7de37931f9 100644 --- a/crates/polars-python/src/series/comparison.rs +++ b/crates/polars-python/src/series/comparison.rs @@ -6,36 +6,45 @@ use crate::PySeries; #[pymethods] impl PySeries { - fn eq(&self, rhs: &PySeries) -> PyResult { - let s = self.series.equal(&rhs.series).map_err(PyPolarsErr::from)?; + fn eq(&self, py: Python, rhs: &PySeries) -> PyResult { + let s = py + .allow_threads(|| self.series.equal(&rhs.series)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } - fn neq(&self, rhs: &PySeries) -> PyResult { - let s = self - .series - .not_equal(&rhs.series) + fn neq(&self, py: Python, rhs: &PySeries) -> PyResult { + let s = py + .allow_threads(|| self.series.not_equal(&rhs.series)) .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } - fn gt(&self, rhs: &PySeries) -> PyResult { - let s = 
self.series.gt(&rhs.series).map_err(PyPolarsErr::from)?; + fn gt(&self, py: Python, rhs: &PySeries) -> PyResult { + let s = py + .allow_threads(|| self.series.gt(&rhs.series)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } - fn gt_eq(&self, rhs: &PySeries) -> PyResult { - let s = self.series.gt_eq(&rhs.series).map_err(PyPolarsErr::from)?; + fn gt_eq(&self, py: Python, rhs: &PySeries) -> PyResult { + let s = py + .allow_threads(|| self.series.gt_eq(&rhs.series)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } - fn lt(&self, rhs: &PySeries) -> PyResult { - let s = self.series.lt(&rhs.series).map_err(PyPolarsErr::from)?; + fn lt(&self, py: Python, rhs: &PySeries) -> PyResult { + let s = py + .allow_threads(|| self.series.lt(&rhs.series)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } - fn lt_eq(&self, rhs: &PySeries) -> PyResult { - let s = self.series.lt_eq(&rhs.series).map_err(PyPolarsErr::from)?; + fn lt_eq(&self, py: Python, rhs: &PySeries) -> PyResult { + let s = py + .allow_threads(|| self.series.lt_eq(&rhs.series)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } @@ -44,8 +53,10 @@ macro_rules! impl_eq_num { ($name:ident, $type:ty) => { #[pymethods] impl PySeries { - fn $name(&self, rhs: $type) -> PyResult { - let s = self.series.equal(rhs).map_err(PyPolarsErr::from)?; + fn $name(&self, py: Python, rhs: $type) -> PyResult { + let s = py + .allow_threads(|| self.series.equal(rhs)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } @@ -69,8 +80,10 @@ macro_rules! impl_neq_num { #[allow(clippy::nonstandard_macro_braces)] #[pymethods] impl PySeries { - fn $name(&self, rhs: $type) -> PyResult { - let s = self.series.not_equal(rhs).map_err(PyPolarsErr::from)?; + fn $name(&self, py: Python, rhs: $type) -> PyResult { + let s = py + .allow_threads(|| self.series.not_equal(rhs)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } @@ -93,8 +106,10 @@ macro_rules! 
impl_gt_num { ($name:ident, $type:ty) => { #[pymethods] impl PySeries { - fn $name(&self, rhs: $type) -> PyResult { - let s = self.series.gt(rhs).map_err(PyPolarsErr::from)?; + fn $name(&self, py: Python, rhs: $type) -> PyResult { + let s = py + .allow_threads(|| self.series.gt(rhs)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } @@ -117,8 +132,10 @@ macro_rules! impl_gt_eq_num { ($name:ident, $type:ty) => { #[pymethods] impl PySeries { - fn $name(&self, rhs: $type) -> PyResult { - let s = self.series.gt_eq(rhs).map_err(PyPolarsErr::from)?; + fn $name(&self, py: Python, rhs: $type) -> PyResult { + let s = py + .allow_threads(|| self.series.gt_eq(rhs)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } @@ -142,8 +159,10 @@ macro_rules! impl_lt_num { #[allow(clippy::nonstandard_macro_braces)] #[pymethods] impl PySeries { - fn $name(&self, rhs: $type) -> PyResult { - let s = self.series.lt(rhs).map_err(PyPolarsErr::from)?; + fn $name(&self, py: Python, rhs: $type) -> PyResult { + let s = py + .allow_threads(|| self.series.lt(rhs)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } @@ -166,8 +185,10 @@ macro_rules! impl_lt_eq_num { ($name:ident, $type:ty) => { #[pymethods] impl PySeries { - fn $name(&self, rhs: $type) -> PyResult { - let s = self.series.lt_eq(rhs).map_err(PyPolarsErr::from)?; + fn $name(&self, py: Python, rhs: $type) -> PyResult { + let s = py + .allow_threads(|| self.series.lt_eq(rhs)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } @@ -226,12 +247,14 @@ macro_rules! 
impl_decimal { ($name:ident, $method:ident) => { #[pymethods] impl PySeries { - fn $name(&self, rhs: PyDecimal) -> PyResult { + fn $name(&self, py: Python, rhs: PyDecimal) -> PyResult { let rhs = Series::new( PlSmallStr::from_static("decimal"), &[AnyValue::Decimal(rhs.0, rhs.1)], ); - let s = self.series.$method(&rhs).map_err(PyPolarsErr::from)?; + let s = py + .allow_threads(|| self.series.$method(&rhs)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } diff --git a/crates/polars-python/src/series/construction.rs b/crates/polars-python/src/series/construction.rs index 5935f1e7b0ce..e9dbdf264d8c 100644 --- a/crates/polars-python/src/series/construction.rs +++ b/crates/polars-python/src/series/construction.rs @@ -71,10 +71,11 @@ impl PySeries { if nan_is_null { let array = array.readonly(); let vals = array.as_slice().unwrap(); - let ca: Float32Chunked = vals - .iter() - .map(|&val| if f32::is_nan(val) { None } else { Some(val) }) - .collect_trusted(); + let ca: Float32Chunked = py.allow_threads(|| { + vals.iter() + .map(|&val| if f32::is_nan(val) { None } else { Some(val) }) + .collect_trusted() + }); ca.with_name(name.into()).into_series().into() } else { mmap_numpy_array(py, name, array) @@ -86,10 +87,11 @@ impl PySeries { if nan_is_null { let array = array.readonly(); let vals = array.as_slice().unwrap(); - let ca: Float64Chunked = vals - .iter() - .map(|&val| if f64::is_nan(val) { None } else { Some(val) }) - .collect_trusted(); + let ca: Float64Chunked = py.allow_threads(|| { + vals.iter() + .map(|&val| if f64::is_nan(val) { None } else { Some(val) }) + .collect_trusted() + }); ca.with_name(name.into()).into_series().into() } else { mmap_numpy_array(py, name, array) diff --git a/crates/polars-python/src/series/export.rs b/crates/polars-python/src/series/export.rs index 886b6114427a..959b2dd47293 100644 --- a/crates/polars-python/src/series/export.rs +++ b/crates/polars-python/src/series/export.rs @@ -147,17 +147,11 @@ impl PySeries { /// Return 
the underlying Arrow array. #[allow(clippy::wrong_self_convention)] - fn to_arrow(&mut self, compat_level: PyCompatLevel) -> PyResult { - self.rechunk(true); - Python::with_gil(|py| { - let pyarrow = py.import_bound("pyarrow")?; + fn to_arrow(&mut self, py: Python, compat_level: PyCompatLevel) -> PyResult { + self.rechunk(py, true); + let pyarrow = py.import_bound("pyarrow")?; - interop::arrow::to_py::to_py_array( - self.series.to_arrow(0, compat_level.0), - py, - &pyarrow, - ) - }) + interop::arrow::to_py::to_py_array(self.series.to_arrow(0, compat_level.0), py, &pyarrow) } #[allow(unused_variables)] diff --git a/crates/polars-python/src/series/general.rs b/crates/polars-python/src/series/general.rs index 7312995d7606..3134f5354f09 100644 --- a/crates/polars-python/src/series/general.rs +++ b/crates/polars-python/src/series/general.rs @@ -16,9 +16,9 @@ use crate::py_modules::POLARS; #[pymethods] impl PySeries { - fn struct_unnest(&self) -> PyResult { + fn struct_unnest(&self, py: Python) -> PyResult { let ca = self.series.struct_().map_err(PyPolarsErr::from)?; - let df: DataFrame = ca.clone().unnest(); + let df: DataFrame = py.allow_threads(|| ca.clone().unnest()); Ok(df.into()) } @@ -56,9 +56,9 @@ impl PySeries { Ok(ca.get_rev_map().is_local()) } - pub fn cat_to_local(&self) -> PyResult { + pub fn cat_to_local(&self, py: Python) -> PyResult { let ca = self.series.categorical().map_err(PyPolarsErr::from)?; - Ok(ca.to_local().into_series().into()) + Ok(py.allow_threads(|| ca.to_local().into_series().into())) } fn estimated_size(&self) -> usize { @@ -78,15 +78,14 @@ impl PySeries { } #[cfg(feature = "dtype-array")] - fn reshape(&self, dims: Vec) -> PyResult { + fn reshape(&self, py: Python, dims: Vec) -> PyResult { let dims = dims .into_iter() .map(ReshapeDimension::new) .collect::>(); - let out = self - .series - .reshape_array(&dims) + let out = py + .allow_threads(|| self.series.reshape_array(&dims)) .map_err(PyPolarsErr::from)?; Ok(out.into()) } @@ -114,8 +113,8 
@@ impl PySeries { } } - pub fn rechunk(&mut self, in_place: bool) -> Option { - let series = self.series.rechunk(); + pub fn rechunk(&mut self, py: Python, in_place: bool) -> Option { + let series = py.allow_threads(|| self.series.rechunk()); if in_place { self.series = series; None @@ -167,16 +166,23 @@ impl PySeries { self.get_index(py, index) } - fn bitand(&self, other: &PySeries) -> PyResult { - let out = (&self.series & &other.series).map_err(PyPolarsErr::from)?; + fn bitand(&self, py: Python, other: &PySeries) -> PyResult { + let out = py + .allow_threads(|| &self.series & &other.series) + .map_err(PyPolarsErr::from)?; Ok(out.into()) } - fn bitor(&self, other: &PySeries) -> PyResult { - let out = (&self.series | &other.series).map_err(PyPolarsErr::from)?; + + fn bitor(&self, py: Python, other: &PySeries) -> PyResult { + let out = py + .allow_threads(|| &self.series | &other.series) + .map_err(PyPolarsErr::from)?; Ok(out.into()) } - fn bitxor(&self, other: &PySeries) -> PyResult { - let out = (&self.series ^ &other.series).map_err(PyPolarsErr::from)?; + fn bitxor(&self, py: Python, other: &PySeries) -> PyResult { + let out = py + .allow_threads(|| &self.series ^ &other.series) + .map_err(PyPolarsErr::from)?; Ok(out.into()) } @@ -217,48 +223,58 @@ impl PySeries { Ok(()) } - fn extend(&mut self, other: &PySeries) -> PyResult<()> { - self.series - .extend(&other.series) + fn extend(&mut self, py: Python, other: &PySeries) -> PyResult<()> { + py.allow_threads(|| self.series.extend(&other.series)) .map_err(PyPolarsErr::from)?; Ok(()) } - fn new_from_index(&self, index: usize, length: usize) -> PyResult { + fn new_from_index(&self, py: Python, index: usize, length: usize) -> PyResult { if index >= self.series.len() { Err(PyValueError::new_err("index is out of bounds")) } else { - Ok(self.series.new_from_index(index, length).into()) + Ok(py.allow_threads(|| self.series.new_from_index(index, length).into())) } } - fn filter(&self, filter: &PySeries) -> PyResult { + 
fn filter(&self, py: Python, filter: &PySeries) -> PyResult { let filter_series = &filter.series; if let Ok(ca) = filter_series.bool() { - let series = self.series.filter(ca).map_err(PyPolarsErr::from)?; + let series = py + .allow_threads(|| self.series.filter(ca)) + .map_err(PyPolarsErr::from)?; Ok(PySeries { series }) } else { Err(PyRuntimeError::new_err("Expected a boolean mask")) } } - fn sort(&mut self, descending: bool, nulls_last: bool, multithreaded: bool) -> PyResult { - Ok(self - .series - .sort( - SortOptions::default() - .with_order_descending(descending) - .with_nulls_last(nulls_last) - .with_multithreaded(multithreaded), - ) + fn sort( + &mut self, + py: Python, + descending: bool, + nulls_last: bool, + multithreaded: bool, + ) -> PyResult { + Ok(py + .allow_threads(|| { + self.series.sort( + SortOptions::default() + .with_order_descending(descending) + .with_nulls_last(nulls_last) + .with_multithreaded(multithreaded), + ) + }) .map_err(PyPolarsErr::from)? .into()) } - fn gather_with_series(&self, indices: &PySeries) -> PyResult { - let indices = indices.series.idx().map_err(PyPolarsErr::from)?; - let s = self.series.take(indices).map_err(PyPolarsErr::from)?; - Ok(s.into()) + fn gather_with_series(&self, py: Python, indices: &PySeries) -> PyResult { + py.allow_threads(|| { + let indices = indices.series.idx().map_err(PyPolarsErr::from)?; + let s = self.series.take(indices).map_err(PyPolarsErr::from)?; + Ok(s.into()) + }) } fn null_count(&self) -> PyResult { @@ -271,6 +287,7 @@ impl PySeries { fn equals( &self, + py: Python, other: &PySeries, check_dtypes: bool, check_names: bool, @@ -283,9 +300,9 @@ impl PySeries { return false; } if null_equal { - self.series.equals_missing(&other.series) + py.allow_threads(|| self.series.equals_missing(&other.series)) } else { - self.series.equals(&other.series) + py.allow_threads(|| self.series.equals(&other.series)) } } @@ -300,8 +317,10 @@ impl PySeries { /// Rechunk and return a pointer to the start of the 
Series. /// Only implemented for numeric types - fn as_single_ptr(&mut self) -> PyResult { - let ptr = self.series.as_single_ptr().map_err(PyPolarsErr::from)?; + fn as_single_ptr(&mut self, py: Python) -> PyResult { + let ptr = py + .allow_threads(|| self.series.as_single_ptr()) + .map_err(PyPolarsErr::from)?; Ok(ptr) } @@ -309,20 +328,23 @@ impl PySeries { self.series.clone().into() } - fn zip_with(&self, mask: &PySeries, other: &PySeries) -> PyResult { + fn zip_with(&self, py: Python, mask: &PySeries, other: &PySeries) -> PyResult { let mask = mask.series.bool().map_err(PyPolarsErr::from)?; - let s = self - .series - .zip_with(mask, &other.series) + let s = py + .allow_threads(|| self.series.zip_with(mask, &other.series)) .map_err(PyPolarsErr::from)?; Ok(s.into()) } #[pyo3(signature = (separator, drop_first=false))] - fn to_dummies(&self, separator: Option<&str>, drop_first: bool) -> PyResult { - let df = self - .series - .to_dummies(separator, drop_first) + fn to_dummies( + &self, + py: Python, + separator: Option<&str>, + drop_first: bool, + ) -> PyResult { + let df = py + .allow_threads(|| self.series.to_dummies(separator, drop_first)) .map_err(PyPolarsErr::from)?; Ok(df.into()) } @@ -332,18 +354,22 @@ impl PySeries { Some(ca.get_as_series(index)?.into()) } - fn n_unique(&self) -> PyResult { - let n = self.series.n_unique().map_err(PyPolarsErr::from)?; + fn n_unique(&self, py: Python) -> PyResult { + let n = py + .allow_threads(|| self.series.n_unique()) + .map_err(PyPolarsErr::from)?; Ok(n) } - fn floor(&self) -> PyResult { - let s = self.series.floor().map_err(PyPolarsErr::from)?; + fn floor(&self, py: Python) -> PyResult { + let s = py + .allow_threads(|| self.series.floor()) + .map_err(PyPolarsErr::from)?; Ok(s.into()) } - fn shrink_to_fit(&mut self) { - self.series.shrink_to_fit(); + fn shrink_to_fit(&mut self, py: Python) { + py.allow_threads(|| self.series.shrink_to_fit()); } fn dot(&self, other: &PySeries, py: Python) -> PyResult { @@ -358,15 +384,11 
@@ impl PySeries { } let result: AnyValue = if lhs_dtype.is_float() || rhs_dtype.is_float() { - (&self.series * &other.series) - .map_err(PyPolarsErr::from)? - .sum::() + py.allow_threads(|| (&self.series * &other.series)?.sum::()) .map_err(PyPolarsErr::from)? .into() } else { - (&self.series * &other.series) - .map_err(PyPolarsErr::from)? - .sum::() + py.allow_threads(|| (&self.series * &other.series)?.sum::()) .map_err(PyPolarsErr::from)? .into() }; @@ -413,20 +435,27 @@ impl PySeries { } } - fn skew(&self, bias: bool) -> PyResult> { - let out = self.series.skew(bias).map_err(PyPolarsErr::from)?; + fn skew(&self, py: Python, bias: bool) -> PyResult> { + let out = py + .allow_threads(|| self.series.skew(bias)) + .map_err(PyPolarsErr::from)?; Ok(out) } - fn kurtosis(&self, fisher: bool, bias: bool) -> PyResult> { - let out = self - .series - .kurtosis(fisher, bias) + fn kurtosis(&self, py: Python, fisher: bool, bias: bool) -> PyResult> { + let out = py + .allow_threads(|| self.series.kurtosis(fisher, bias)) .map_err(PyPolarsErr::from)?; Ok(out) } - fn cast(&self, dtype: Wrap, strict: bool, wrap_numerical: bool) -> PyResult { + fn cast( + &self, + py: Python, + dtype: Wrap, + strict: bool, + wrap_numerical: bool, + ) -> PyResult { let options = if wrap_numerical { CastOptions::Overflowing } else if strict { @@ -436,7 +465,7 @@ impl PySeries { }; let dtype = dtype.0; - let out = self.series.cast_with_options(&dtype, options); + let out = py.allow_threads(|| self.series.cast_with_options(&dtype, options)); let out = out.map_err(PyPolarsErr::from)?; Ok(out.into()) } @@ -451,7 +480,7 @@ impl PySeries { }) } - fn is_sorted(&self, descending: bool, nulls_last: bool) -> PyResult { + fn is_sorted(&self, py: Python, descending: bool, nulls_last: bool) -> PyResult { let options = SortOptions { descending, nulls_last, @@ -459,31 +488,36 @@ impl PySeries { maintain_order: false, limit: None, }; - Ok(self.series.is_sorted(options).map_err(PyPolarsErr::from)?) 
+ Ok(py + .allow_threads(|| self.series.is_sorted(options)) + .map_err(PyPolarsErr::from)?) } fn clear(&self) -> Self { self.series.clear().into() } - fn head(&self, n: usize) -> Self { - self.series.head(Some(n)).into() + fn head(&self, py: Python, n: usize) -> Self { + py.allow_threads(|| self.series.head(Some(n))).into() } - fn tail(&self, n: usize) -> Self { - self.series.tail(Some(n)).into() + fn tail(&self, py: Python, n: usize) -> Self { + py.allow_threads(|| self.series.tail(Some(n))).into() } fn value_counts( &self, + py: Python, sort: bool, parallel: bool, name: String, normalize: bool, ) -> PyResult { - let out = self - .series - .value_counts(sort, parallel, name.into(), normalize) + let out = py + .allow_threads(|| { + self.series + .value_counts(sort, parallel, name.into(), normalize) + }) .map_err(PyPolarsErr::from)?; Ok(out.into()) } @@ -494,8 +528,10 @@ impl PySeries { self.series.slice(offset, length).into() } - pub fn not_(&self) -> PyResult { - let out = polars_ops::series::negate_bitwise(&self.series).map_err(PyPolarsErr::from)?; + pub fn not_(&self, py: Python) -> PyResult { + let out = py + .allow_threads(|| polars_ops::series::negate_bitwise(&self.series)) + .map_err(PyPolarsErr::from)?; Ok(out.into()) } } @@ -516,8 +552,15 @@ macro_rules! 
impl_set_with_mask { #[pymethods] impl PySeries { #[pyo3(signature = (filter, value))] - fn $name(&self, filter: &PySeries, value: Option<$native>) -> PyResult { - let series = $name(&self.series, filter, value).map_err(PyPolarsErr::from)?; + fn $name( + &self, + py: Python, + filter: &PySeries, + value: Option<$native>, + ) -> PyResult { + let series = py + .allow_threads(|| $name(&self.series, filter, value)) + .map_err(PyPolarsErr::from)?; Ok(Self::new(series)) } } diff --git a/crates/polars-python/src/series/scatter.rs b/crates/polars-python/src/series/scatter.rs index 97df60ef205b..798cd189a9b6 100644 --- a/crates/polars-python/src/series/scatter.rs +++ b/crates/polars-python/src/series/scatter.rs @@ -7,11 +7,12 @@ use crate::error::PyPolarsErr; #[pymethods] impl PySeries { - fn scatter(&mut self, idx: PySeries, values: PySeries) -> PyResult<()> { + fn scatter(&mut self, py: Python, idx: PySeries, values: PySeries) -> PyResult<()> { // we take the value because we want a ref count of 1 so that we can // have mutable access cheaply via _get_inner_mut(). 
let s = std::mem::take(&mut self.series); - match scatter(s, &idx.series, &values.series) { + let result = py.allow_threads(|| scatter(s, &idx.series, &values.series)); + match result { Ok(out) => { self.series = out; Ok(()) From 8cb78391619968b1f596938d9a734b20b58f544e Mon Sep 17 00:00:00 2001 From: Gijs Burghoorn Date: Wed, 13 Nov 2024 10:08:32 +0100 Subject: [PATCH 07/18] feat: Add IPC source node for new streaming engine (#19454) Co-authored-by: Orson Peters --- crates/polars-arrow/src/io/ipc/read/common.rs | 18 +- crates/polars-arrow/src/io/ipc/read/file.rs | 2 +- crates/polars-arrow/src/io/ipc/read/mod.rs | 1 + crates/polars-arrow/src/io/ipc/read/reader.rs | 90 ++- crates/polars-arrow/src/io/ipc/read/stream.rs | 17 +- crates/polars-arrow/src/record_batch.rs | 22 +- crates/polars-core/src/frame/mod.rs | 26 + .../polars-core/src/frame/upstream_traits.rs | 28 + crates/polars-core/src/scalar/from.rs | 3 + crates/polars-io/src/utils/other.rs | 8 +- crates/polars-plan/src/plans/optimizer/mod.rs | 3 +- .../src/plans/optimizer/slice_pushdown_lp.rs | 30 +- .../polars-stream/src/nodes/io_sources/ipc.rs | 557 ++++++++++++++++++ .../polars-stream/src/nodes/io_sources/mod.rs | 1 + crates/polars-stream/src/nodes/mod.rs | 1 + .../src/physical_plan/lower_ir.rs | 67 ++- .../src/physical_plan/to_graph.rs | 17 + py-polars/tests/unit/io/test_lazy_ipc.py | 1 + 18 files changed, 843 insertions(+), 49 deletions(-) create mode 100644 crates/polars-stream/src/nodes/io_sources/ipc.rs create mode 100644 crates/polars-stream/src/nodes/io_sources/mod.rs diff --git a/crates/polars-arrow/src/io/ipc/read/common.rs b/crates/polars-arrow/src/io/ipc/read/common.rs index 6b893c0e8ce3..0a1297bf1184 100644 --- a/crates/polars-arrow/src/io/ipc/read/common.rs +++ b/crates/polars-arrow/src/io/ipc/read/common.rs @@ -318,10 +318,14 @@ pub fn read_dictionary( Ok(()) } -pub fn prepare_projection( - schema: &ArrowSchema, - mut projection: Vec, -) -> (Vec, PlHashMap, ArrowSchema) { +#[derive(Clone)] +pub 
struct ProjectionInfo { + pub columns: Vec, + pub map: PlHashMap, + pub schema: ArrowSchema, +} + +pub fn prepare_projection(schema: &ArrowSchema, mut projection: Vec) -> ProjectionInfo { let schema = projection .iter() .map(|x| { @@ -355,7 +359,11 @@ pub fn prepare_projection( } } - (projection, map, schema) + ProjectionInfo { + columns: projection, + map, + schema, + } } pub fn apply_projection( diff --git a/crates/polars-arrow/src/io/ipc/read/file.rs b/crates/polars-arrow/src/io/ipc/read/file.rs index a83e1b758d80..e75fae36730e 100644 --- a/crates/polars-arrow/src/io/ipc/read/file.rs +++ b/crates/polars-arrow/src/io/ipc/read/file.rs @@ -305,7 +305,7 @@ fn get_message_from_block_offset<'a, R: Read + Seek>( .map_err(|err| polars_err!(oos = OutOfSpecKind::InvalidFlatbufferMessage(err))) } -fn get_message_from_block<'a, R: Read + Seek>( +pub(super) fn get_message_from_block<'a, R: Read + Seek>( reader: &mut R, block: &arrow_format::ipc::Block, message_scratch: &'a mut Vec, diff --git a/crates/polars-arrow/src/io/ipc/read/mod.rs b/crates/polars-arrow/src/io/ipc/read/mod.rs index 88411f9b905f..f4430db7dea2 100644 --- a/crates/polars-arrow/src/io/ipc/read/mod.rs +++ b/crates/polars-arrow/src/io/ipc/read/mod.rs @@ -19,6 +19,7 @@ mod schema; mod stream; pub(crate) use common::first_dict_field; +pub use common::{prepare_projection, ProjectionInfo}; pub use error::OutOfSpecKind; pub use file::{ deserialize_footer, get_row_count, read_batch, read_file_dictionaries, read_file_metadata, diff --git a/crates/polars-arrow/src/io/ipc/read/reader.rs b/crates/polars-arrow/src/io/ipc/read/reader.rs index 8369d2960233..e9523477fe39 100644 --- a/crates/polars-arrow/src/io/ipc/read/reader.rs +++ b/crates/polars-arrow/src/io/ipc/read/reader.rs @@ -1,9 +1,9 @@ use std::io::{Read, Seek}; use polars_error::PolarsResult; -use polars_utils::aliases::PlHashMap; use super::common::*; +use super::file::{get_message_from_block, get_record_batch}; use super::{read_batch, read_file_dictionaries, 
Dictionaries, FileMetadata}; use crate::array::Array; use crate::datatypes::ArrowSchema; @@ -16,7 +16,7 @@ pub struct FileReader { // the dictionaries are going to be read dictionaries: Option, current_block: usize, - projection: Option<(Vec, PlHashMap, ArrowSchema)>, + projection: Option, remaining: usize, data_scratch: Vec, message_scratch: Vec, @@ -32,10 +32,29 @@ impl FileReader { projection: Option>, limit: Option, ) -> Self { - let projection = projection.map(|projection| { - let (p, h, schema) = prepare_projection(&metadata.schema, projection); - (p, h, schema) - }); + let projection = + projection.map(|projection| prepare_projection(&metadata.schema, projection)); + Self { + reader, + metadata, + dictionaries: Default::default(), + projection, + remaining: limit.unwrap_or(usize::MAX), + current_block: 0, + data_scratch: Default::default(), + message_scratch: Default::default(), + } + } + + /// Creates a new [`FileReader`]. Use `projection` to only take certain columns. + /// # Panic + /// Panics iff the projection is not in increasing order (e.g. `[1, 0]` nor `[0, 1, 1]` are valid) + pub fn new_with_projection_info( + reader: R, + metadata: FileMetadata, + projection: Option, + limit: Option, + ) -> Self { Self { reader, metadata, @@ -52,7 +71,7 @@ impl FileReader { pub fn schema(&self) -> &ArrowSchema { self.projection .as_ref() - .map(|x| &x.2) + .map(|x| &x.schema) .unwrap_or(&self.metadata.schema) } @@ -66,9 +85,23 @@ impl FileReader { self.reader } + pub fn set_current_block(&mut self, idx: usize) { + self.current_block = idx; + } + + pub fn get_current_block(&self) -> usize { + self.current_block + } + + /// Take the projection info out of this reader so it can be reused in a new reader. + /// This can be utilized to save memory allocations for performance reasons. + pub fn take_projection_info(&mut self) -> Option { + std::mem::take(&mut self.projection) + } + /// Get the inner memory scratches so they can be reused in a new writer.
/// This can be utilized to save memory allocations for performance reasons. - pub fn get_scratches(&mut self) -> (Vec, Vec) { + pub fn take_scratches(&mut self) -> (Vec, Vec) { ( std::mem::take(&mut self.data_scratch), std::mem::take(&mut self.message_scratch), @@ -91,6 +124,43 @@ impl FileReader { }; Ok(()) } + + /// Skip over blocks until we have seen at most `offset` rows, returning how many rows we are + /// still to see. + /// + /// This will never go over the `offset`. Meaning that if the `offset < current_block.len()`, + /// the block will not be skipped. + pub fn skip_blocks_till_limit(&mut self, offset: u64) -> PolarsResult { + let mut remaining_offset = offset; + + for (i, block) in self.metadata.blocks.iter().enumerate() { + let message = + get_message_from_block(&mut self.reader, block, &mut self.message_scratch)?; + let record_batch = get_record_batch(message)?; + + let length = record_batch.length()?; + let length = length as u64; + + if length > remaining_offset { + self.current_block = i; + return Ok(remaining_offset); + } + + remaining_offset -= length; + } + + self.current_block = self.metadata.blocks.len(); + Ok(remaining_offset) + } + + pub fn next_record_batch( + &mut self, + ) -> Option>> { + let block = self.metadata.blocks.get(self.current_block)?; + self.current_block += 1; + let message = get_message_from_block(&mut self.reader, block, &mut self.message_scratch); + Some(message.and_then(|m| get_record_batch(m))) + } } impl Iterator for FileReader { @@ -114,7 +184,7 @@ impl Iterator for FileReader { &mut self.reader, self.dictionaries.as_ref().unwrap(), &self.metadata, - self.projection.as_ref().map(|x| x.0.as_ref()), + self.projection.as_ref().map(|x| x.columns.as_ref()), Some(self.remaining), block, &mut self.message_scratch, @@ -122,7 +192,7 @@ impl Iterator for FileReader { ); self.remaining -= chunk.as_ref().map(|x| x.len()).unwrap_or_default(); - let chunk = if let
Some(ProjectionInfo { map, .. }) = &self.projection { // re-order according to projection chunk.map(|chunk| apply_projection(chunk, map)) } else { diff --git a/crates/polars-arrow/src/io/ipc/read/stream.rs b/crates/polars-arrow/src/io/ipc/read/stream.rs index 87241596cdbe..b2cfb727b385 100644 --- a/crates/polars-arrow/src/io/ipc/read/stream.rs +++ b/crates/polars-arrow/src/io/ipc/read/stream.rs @@ -2,7 +2,6 @@ use std::io::Read; use arrow_format::ipc::planus::ReadAsRoot; use polars_error::{polars_bail, polars_err, PolarsError, PolarsResult}; -use polars_utils::aliases::PlHashMap; use super::super::CONTINUATION_MARKER; use super::common::*; @@ -93,7 +92,7 @@ fn read_next( dictionaries: &mut Dictionaries, message_buffer: &mut Vec, data_buffer: &mut Vec, - projection: &Option<(Vec, PlHashMap, ArrowSchema)>, + projection: &Option, scratch: &mut Vec, ) -> PolarsResult> { // determine metadata length @@ -169,7 +168,7 @@ fn read_next( batch, &metadata.schema, &metadata.ipc_schema, - projection.as_ref().map(|x| x.0.as_ref()), + projection.as_ref().map(|x| x.columns.as_ref()), None, dictionaries, metadata.version, @@ -179,7 +178,7 @@ fn read_next( scratch, ); - if let Some((_, map, _)) = projection { + if let Some(ProjectionInfo { map, .. }) = projection { // re-order according to projection chunk .map(|chunk| apply_projection(chunk, map)) @@ -238,7 +237,7 @@ pub struct StreamReader { finished: bool, data_buffer: Vec, message_buffer: Vec, - projection: Option<(Vec, PlHashMap, ArrowSchema)>, + projection: Option, scratch: Vec, } @@ -249,10 +248,8 @@ impl StreamReader { /// encounter a schema. 
/// To check if the reader is done, use `is_finished(self)` pub fn new(reader: R, metadata: StreamMetadata, projection: Option>) -> Self { - let projection = projection.map(|projection| { - let (p, h, schema) = prepare_projection(&metadata.schema, projection); - (p, h, schema) - }); + let projection = + projection.map(|projection| prepare_projection(&metadata.schema, projection)); Self { reader, @@ -275,7 +272,7 @@ impl StreamReader { pub fn schema(&self) -> &ArrowSchema { self.projection .as_ref() - .map(|x| &x.2) + .map(|x| &x.schema) .unwrap_or(&self.metadata.schema) } diff --git a/crates/polars-arrow/src/record_batch.rs b/crates/polars-arrow/src/record_batch.rs index f58d129831f1..2b0b8112ea9e 100644 --- a/crates/polars-arrow/src/record_batch.rs +++ b/crates/polars-arrow/src/record_batch.rs @@ -9,7 +9,7 @@ use crate::array::{Array, ArrayRef}; /// the same length, [`RecordBatchT::len`]. #[derive(Debug, Clone, PartialEq, Eq)] pub struct RecordBatchT> { - length: usize, + height: usize, arrays: Vec, } @@ -29,14 +29,14 @@ impl> RecordBatchT { /// /// # Error /// - /// I.f.f. the length does not match the length of any of the arrays - pub fn try_new(length: usize, arrays: Vec) -> PolarsResult { + /// I.f.f. 
the height does not match the length of any of the arrays + pub fn try_new(height: usize, arrays: Vec) -> PolarsResult { polars_ensure!( - arrays.iter().all(|arr| arr.as_ref().len() == length), + arrays.iter().all(|arr| arr.as_ref().len() == height), ComputeError: "RecordBatch requires all its arrays to have an equal number of rows", ); - Ok(Self { length, arrays }) + Ok(Self { height, arrays }) } /// returns the [`Array`]s in [`RecordBatchT`] @@ -51,7 +51,17 @@ impl> RecordBatchT { /// returns the number of rows of every array pub fn len(&self) -> usize { - self.length + self.height + } + + /// returns the number of rows of every array + pub fn height(&self) -> usize { + self.height + } + + /// returns the number of arrays + pub fn width(&self) -> usize { + self.arrays.len() } /// returns whether the columns have any rows diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 7e2d7b050dcf..0d8fef7f4c4a 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -3,6 +3,7 @@ use std::borrow::Cow; use std::{mem, ops}; +use polars_row::ArrayRef; use polars_utils::itertools::Itertools; use rayon::prelude::*; @@ -3334,6 +3335,31 @@ impl DataFrame { pub(crate) fn infer_height(cols: &[Column]) -> usize { cols.first().map_or(0, Column::len) } + + pub fn append_record_batch(&mut self, rb: RecordBatchT) -> PolarsResult<()> { + polars_ensure!( + rb.arrays().len() == self.width(), + InvalidOperation: "attempt to extend dataframe of width {} with record batch of width {}", + self.width(), + rb.arrays().len(), + ); + + if rb.height() == 0 { + return Ok(()); + } + + // SAFETY: + // - we don't adjust the names of the columns + // - each column gets appended the same number of rows, which is an invariant of + // record_batch. 
+ let columns = unsafe { self.get_columns_mut() }; + for (col, arr) in columns.iter_mut().zip(rb.into_arrays()) { + let arr_series = Series::from_arrow_chunks(PlSmallStr::EMPTY, vec![arr])?.into_column(); + col.append(&arr_series)?; + } + + Ok(()) + } } pub struct RecordBatchIter<'a> { diff --git a/crates/polars-core/src/frame/upstream_traits.rs b/crates/polars-core/src/frame/upstream_traits.rs index 38b346ace652..1392f87c052f 100644 --- a/crates/polars-core/src/frame/upstream_traits.rs +++ b/crates/polars-core/src/frame/upstream_traits.rs @@ -1,5 +1,7 @@ use std::ops::{Index, Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive}; +use arrow::record_batch::RecordBatchT; + use crate::prelude::*; impl FromIterator for DataFrame { @@ -22,6 +24,32 @@ impl FromIterator for DataFrame { } } +impl TryExtend>> for DataFrame { + fn try_extend>>>( + &mut self, + iter: I, + ) -> PolarsResult<()> { + for record_batch in iter { + self.append_record_batch(record_batch)?; + } + + Ok(()) + } +} + +impl TryExtend>>> for DataFrame { + fn try_extend>>>>( + &mut self, + iter: I, + ) -> PolarsResult<()> { + for record_batch in iter { + self.append_record_batch(record_batch?)?; + } + + Ok(()) + } +} + impl Index for DataFrame { type Output = Column; diff --git a/crates/polars-core/src/scalar/from.rs b/crates/polars-core/src/scalar/from.rs index 3af8671dadd1..c104c2ea8573 100644 --- a/crates/polars-core/src/scalar/from.rs +++ b/crates/polars-core/src/scalar/from.rs @@ -1,3 +1,5 @@ +use polars_utils::pl_str::PlSmallStr; + use super::{AnyValue, DataType, Scalar}; macro_rules! impl_from { @@ -25,4 +27,5 @@ impl_from! 
{ (u64, UInt64, UInt64) (f32, Float32, Float32) (f64, Float64, Float64) + (PlSmallStr, StringOwned, String) } diff --git a/crates/polars-io/src/utils/other.rs b/crates/polars-io/src/utils/other.rs index 4e039124933f..f4ef629821a9 100644 --- a/crates/polars-io/src/utils/other.rs +++ b/crates/polars-io/src/utils/other.rs @@ -45,7 +45,7 @@ pub fn get_reader_bytes( feature = "parquet", feature = "avro" ))] -pub(crate) fn apply_projection(schema: &ArrowSchema, projection: &[usize]) -> ArrowSchema { +pub fn apply_projection(schema: &ArrowSchema, projection: &[usize]) -> ArrowSchema { projection .iter() .map(|idx| schema.get_at_index(*idx).unwrap()) @@ -59,14 +59,14 @@ pub(crate) fn apply_projection(schema: &ArrowSchema, projection: &[usize]) -> Ar feature = "avro", feature = "parquet" ))] -pub(crate) fn columns_to_projection( - columns: &[String], +pub fn columns_to_projection>( + columns: &[T], schema: &ArrowSchema, ) -> PolarsResult> { let mut prj = Vec::with_capacity(columns.len()); for column in columns { - let i = schema.try_index_of(column)?; + let i = schema.try_index_of(column.as_ref())?; prj.push(i); } diff --git a/crates/polars-plan/src/plans/optimizer/mod.rs b/crates/polars-plan/src/plans/optimizer/mod.rs index 70880ca78359..dc0d330d8b86 100644 --- a/crates/polars-plan/src/plans/optimizer/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/mod.rs @@ -89,6 +89,7 @@ pub fn optimize( let simplify_expr = opt_state.contains(OptFlags::SIMPLIFY_EXPR); let slice_pushdown = opt_state.contains(OptFlags::SLICE_PUSHDOWN); let streaming = opt_state.contains(OptFlags::STREAMING); + let new_streaming = opt_state.contains(OptFlags::NEW_STREAMING); let fast_projection = opt_state.contains(OptFlags::FAST_PROJECTION); // Don't run optimizations that don't make sense on a single node. 
@@ -181,7 +182,7 @@ pub fn optimize( } if slice_pushdown { - let slice_pushdown_opt = SlicePushDown::new(streaming); + let slice_pushdown_opt = SlicePushDown::new(streaming, new_streaming); let alp = lp_arena.take(lp_top); let alp = slice_pushdown_opt.optimize(alp, lp_arena, expr_arena)?; diff --git a/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs b/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs index 9c2f8497fac8..a5ff806abae9 100644 --- a/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs +++ b/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs @@ -5,6 +5,7 @@ use crate::prelude::*; pub(super) struct SlicePushDown { streaming: bool, + new_streaming: bool, pub scratch: Vec, } @@ -59,9 +60,10 @@ fn can_pushdown_slice_past_projections(exprs: &[ExprIR], arena: &Arena) - } impl SlicePushDown { - pub(super) fn new(streaming: bool) -> Self { + pub(super) fn new(streaming: bool, new_streaming: bool) -> Self { Self { streaming, + new_streaming, scratch: vec![], } } @@ -211,6 +213,32 @@ impl SlicePushDown { Ok(lp) }, + + #[cfg(feature = "ipc")] + (Scan { + sources, + file_info, + hive_parts, + output_schema, + mut file_options, + predicate, + scan_type: scan_type @ FileScan::Ipc { .. }, + }, Some(state)) if self.new_streaming && predicate.is_none() => { + file_options.slice = Some((state.offset, state.len as usize)); + + let lp = Scan { + sources, + file_info, + hive_parts, + output_schema, + scan_type, + file_options, + predicate, + }; + + Ok(lp) + }, + // TODO! we currently skip slice pushdown if there is a predicate. 
(Scan { sources, diff --git a/crates/polars-stream/src/nodes/io_sources/ipc.rs b/crates/polars-stream/src/nodes/io_sources/ipc.rs new file mode 100644 index 000000000000..3a83c8e3132c --- /dev/null +++ b/crates/polars-stream/src/nodes/io_sources/ipc.rs @@ -0,0 +1,557 @@ +use std::cmp::Reverse; +use std::io::Cursor; +use std::ops::Range; +use std::sync::Arc; + +use polars_core::config; +use polars_core::frame::DataFrame; +use polars_core::prelude::{Column, DataType}; +use polars_core::scalar::Scalar; +use polars_core::utils::arrow::array::TryExtend; +use polars_core::utils::arrow::io::ipc::read::{ + prepare_projection, read_file_metadata, FileMetadata, FileReader, ProjectionInfo, +}; +use polars_error::{ErrString, PolarsError, PolarsResult}; +use polars_expr::prelude::PhysicalExpr; +use polars_expr::state::ExecutionState; +use polars_io::cloud::CloudOptions; +use polars_io::ipc::IpcScanOptions; +use polars_io::utils::columns_to_projection; +use polars_io::RowIndex; +use polars_plan::plans::hive::HivePartitions; +use polars_plan::plans::{FileInfo, ScanSources}; +use polars_plan::prelude::FileScanOptions; +use polars_utils::mmap::MemSlice; +use polars_utils::pl_str::PlSmallStr; +use polars_utils::priority::Priority; +use polars_utils::IdxSize; + +use crate::async_primitives::distributor_channel::distributor_channel; +use crate::async_primitives::linearizer::Linearizer; +use crate::morsel::{get_ideal_morsel_size, SourceToken}; +use crate::nodes::{ + ComputeNode, JoinHandle, Morsel, MorselSeq, PortState, TaskPriority, TaskScope, +}; +use crate::pipe::{RecvPort, SendPort}; +use crate::{DEFAULT_DISTRIBUTOR_BUFFER_SIZE, DEFAULT_LINEARIZER_BUFFER_SIZE}; + +const ROW_COUNT_OVERFLOW_ERR: PolarsError = PolarsError::ComputeError(ErrString::new_static( + "\ +IPC file produces more than 2^32 rows; \ +consider compiling with polars-bigidx feature (polars-u64-idx package on python)", +)); + +pub struct IpcSourceNode { + sources: ScanSources, + + config: IpcSourceNodeConfig, + 
num_pipelines: usize, + + /// Every phase we need to be able to continue from where we left off, so we save the state of + /// the Walker task. + state: IpcSourceNodeState, +} + +pub struct IpcSourceNodeConfig { + row_index: Option, + projection_info: Option, + + rechunk: bool, + include_file_paths: Option, + + first_metadata: FileMetadata, +} + +pub struct IpcSourceNodeState { + morsel_seq: u64, + row_idx_offset: IdxSize, + + slice: Range, + + source_idx: usize, + source: Option, +} + +pub struct Source { + file_path: Option>, + + memslice: Arc, + metadata: Arc, + + block_offset: usize, +} + +impl IpcSourceNode { + #[allow(clippy::too_many_arguments)] + pub fn new( + sources: ScanSources, + _file_info: FileInfo, + _hive_parts: Option>>, // @TODO + predicate: Option>, + options: IpcScanOptions, + _cloud_options: Option, + file_options: FileScanOptions, + mut first_metadata: Option, + ) -> PolarsResult { + // These should have all been removed during lower_ir + assert!(predicate.is_none()); + assert!(!sources.is_empty()); + + let IpcScanOptions = options; + + let FileScanOptions { + slice, + with_columns, + cache: _, // @TODO + row_index, + rechunk, + file_counter: _, // @TODO + hive_options: _, // @TODO + glob: _, // @TODO + include_file_paths, + allow_missing_columns: _, // @TODO + } = file_options; + + let first_metadata = match first_metadata.take() { + Some(md) => md, + None => { + let source = sources.iter().next().unwrap(); + let source = source.to_memslice()?; + read_file_metadata(&mut std::io::Cursor::new(&*source))? + }, + }; + + let projection = with_columns + .as_ref() + .map(|cols| columns_to_projection(cols, &first_metadata.schema)) + .transpose()?; + let projection_info = projection + .as_ref() + .map(|p| prepare_projection(&first_metadata.schema, p.clone())); + + let state = IpcSourceNodeState { + morsel_seq: 0, + row_idx_offset: row_index.as_ref().map_or(0, |ri| ri.offset), + + // Always create a slice. 
If no slice was given, just make the biggest slice possible. + slice: slice.map_or(0..usize::MAX, |(offset, length)| { + let offset = offset as usize; + offset..offset + length + }), + + source_idx: 0, + source: None, + }; + + Ok(IpcSourceNode { + sources, + + config: IpcSourceNodeConfig { + row_index, + projection_info, + + rechunk, + include_file_paths, + + first_metadata, + }, + + num_pipelines: 0, + + state, + }) + } +} + +/// Move `slice` forward by `n` and return the slice until then. +fn slice_take(slice: &mut Range, n: usize) -> Range { + let offset = slice.start; + let length = slice.len(); + + assert!(offset < n); + + let chunk_length = (n - offset).min(length); + let rng = offset..offset + chunk_length; + *slice = 0..length - chunk_length; + + rng +} + +fn get_max_morsel_size() -> usize { + std::env::var("POLARS_STREAMING_IPC_SOURCE_MAX_MORSEL_SIZE") + .map_or_else( + |_| get_ideal_morsel_size(), + |v| { + v.parse::().expect( + "POLARS_STREAMING_IPC_SOURCE_MAX_MORSEL_SIZE does not contain valid size", + ) + }, + ) + .max(1) +} + +impl ComputeNode for IpcSourceNode { + fn name(&self) -> &str { + "ipc_source" + } + + fn initialize(&mut self, num_pipelines: usize) { + self.num_pipelines = num_pipelines; + } + + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { + assert!(recv.is_empty()); + assert_eq!(send.len(), 1); + + if self.state.slice.is_empty() || self.state.source_idx >= self.sources.len() { + send[0] = PortState::Done; + } + + if send[0] != PortState::Done { + send[0] = PortState::Ready; + } + + Ok(()) + } + + fn spawn<'env, 's>( + &'env mut self, + scope: &'s TaskScope<'s, 'env>, + recv_ports: &mut [Option>], + send_ports: &mut [Option>], + _state: &'s ExecutionState, + join_handles: &mut Vec>>, + ) { + assert!(recv_ports.is_empty()); + assert_eq!(send_ports.len(), 1); + + // Split size for morsels. 
+ let max_morsel_size = get_max_morsel_size(); + let source_token = SourceToken::new(); + + let num_pipelines = self.num_pipelines; + let config = &self.config; + let sources = &self.sources; + let state = &mut self.state; + + /// Messages sent from Walker task to Decoder tasks. + struct BatchMessage { + memslice: Arc, + metadata: Arc, + file_path: Option>, + row_idx_offset: IdxSize, + slice: Range, + block_range: Range, + morsel_seq_base: u64, + } + + // Walker task -> Decoder tasks. + let (mut batch_tx, batch_rxs) = + distributor_channel::(num_pipelines, DEFAULT_DISTRIBUTOR_BUFFER_SIZE); + // Decoder tasks -> Distributor task. + let (mut decoded_rx, decoded_tx) = Linearizer::, Morsel>>::new( + num_pipelines, + DEFAULT_LINEARIZER_BUFFER_SIZE, + ); + // Distributor task -> output. + let mut sender = send_ports[0].take().unwrap().serial(); + + // Distributor task. + // + // Shuffles morsels from `n` producers amongst `n` consumers. + // + // If record batches in the source IPC file are large, one decoder might produce many + // morsels at the same time. At the same time, other decoders might not produce anything. + // Therefore, we would like to distribute the output of a single decoder task over the + // available output pipelines. + join_handles.push(scope.spawn_task(TaskPriority::High, async move { + while let Some(morsel) = decoded_rx.get().await { + if sender.send(morsel.1).await.is_err() { + break; + } + } + PolarsResult::Ok(()) + })); + + // Decoder tasks. + // + // Tasks a IPC file and certain number of blocks and decodes each block as a record batch. + // Then, all record batches are concatenated into a DataFrame. If the resulting DataFrame + // is too large, which happens when we have one very large block, the DataFrame is split + // into smaller pieces an spread among the pipelines. 
+ let decoder_tasks = decoded_tx.into_iter().zip(batch_rxs) + .map(|(mut send, mut rx)| { + let source_token = source_token.clone(); + scope.spawn_task(TaskPriority::Low, async move { + // Amortize allocations. + let mut data_scratch = Vec::new(); + let mut message_scratch = Vec::new(); + let mut projection_info = config.projection_info.clone(); + + let schema = projection_info.as_ref().map_or(config.first_metadata.schema.as_ref(), |ProjectionInfo { schema, .. }| schema); + let pl_schema = schema + .iter() + .map(|(n, f)| (n.clone(), DataType::from_arrow(&f.dtype, true))) + .collect(); + + while let Ok(m) = rx.recv().await { + let BatchMessage { + memslice: source, + metadata, + file_path, + row_idx_offset, + slice, + morsel_seq_base, + block_range, + } = m; + + let mut reader = FileReader::new_with_projection_info( + Cursor::new(source.as_ref()), + metadata.as_ref().clone(), + std::mem::take(&mut projection_info), + None, + ); + reader.set_current_block(block_range.start); + reader.set_scratches(( + std::mem::take(&mut data_scratch), + std::mem::take(&mut message_scratch), + )); + + // Create the DataFrame with the appropriate schema and append all the record + // batches to it. This will perform schema validation as well. + let mut df = DataFrame::empty_with_schema(&pl_schema); + df.try_extend(reader.by_ref().take(block_range.len()))?; + + df = df.slice(slice.start as i64, slice.len()); + + if config.rechunk { + df.rechunk_mut(); + } + + if let Some(RowIndex { name, offset: _ }) = &config.row_index { + let offset = row_idx_offset + slice.start as IdxSize; + df = df.with_row_index(name.clone(), Some(offset))?; + } + + if let Some(col) = config.include_file_paths.as_ref() { + let file_path = file_path.unwrap(); + let file_path = Scalar::from(PlSmallStr::from(file_path.as_ref())); + df.with_column(Column::new_scalar( + col.clone(), + file_path, + df.height(), + ))?; + } + + // If the block is very large, we want to split the block amongst the + // pipelines. 
That will at least allow some parallelism. + if df.height() > max_morsel_size && config::verbose() { + eprintln!("IPC source encountered a (too) large record batch of {} rows. Splitting and continuing.", df.height()); + } + for i in 0..df.height().div_ceil(max_morsel_size) { + let morsel = df.slice((i * max_morsel_size) as i64, max_morsel_size); + let seq = MorselSeq::new(morsel_seq_base + i as u64); + let morsel = Morsel::new( + morsel, + seq, + source_token.clone(), + ); + if send.insert(Priority(Reverse(seq), morsel)).await.is_err() { + break; + } + } + + (data_scratch, message_scratch) = reader.take_scratches(); + projection_info = reader.take_projection_info(); + } + + PolarsResult::Ok(()) + }) + }) + .collect::>(); + + // Walker task. + // + // Walks all the sources and supplies block ranges to the decoder tasks. + join_handles.push(scope.spawn_task(TaskPriority::Low, async move { + struct Batch { + row_idx_offset: IdxSize, + block_start: usize, + num_rows: usize, + } + + // Batch completion parameters + let batch_size_limit = get_ideal_morsel_size(); + let sliced_batch_size_limit = state.slice.len().div_ceil(num_pipelines); + let batch_block_limit = if sources.len() >= num_pipelines { + // If there are more files than decoder tasks, try to subdivide the files instead + // of the blocks. 
+ usize::MAX + } else { + config.first_metadata.blocks.len().div_ceil(num_pipelines) + }; + + // Amortize allocations + let mut data_scratch = Vec::new(); + let mut message_scratch = Vec::new(); + let mut projection_info = config.projection_info.clone(); + + 'source_loop: while !state.slice.is_empty() { + let source = match state.source { + Some(ref mut source) => source, + None => { + let Some(source) = sources.get(state.source_idx) else { + break; + }; + + let file_path: Option> = config + .include_file_paths + .as_ref() + .map(|_| source.to_include_path_name().into()); + let memslice = source.to_memslice()?; + let metadata = if state.source_idx == 0 { + config.first_metadata.clone() + } else { + read_file_metadata(&mut std::io::Cursor::new(memslice.as_ref()))? + }; + + state.source.insert(Source { + file_path, + memslice: Arc::new(memslice), + metadata: Arc::new(metadata), + block_offset: 0, + }) + }, + }; + + let mut reader = FileReader::new_with_projection_info( + Cursor::new(source.memslice.as_ref()), + source.metadata.as_ref().clone(), + std::mem::take(&mut projection_info), + None, + ); + reader.set_current_block(source.block_offset); + reader.set_scratches(( + std::mem::take(&mut data_scratch), + std::mem::take(&mut message_scratch), + )); + + if state.slice.start > 0 { + // Skip over all blocks that the slice would skip anyway. + let new_offset = reader.skip_blocks_till_limit(state.slice.start as u64)?; + + state.row_idx_offset += (state.slice.start as u64 - new_offset) as IdxSize; + state.slice = new_offset as usize..new_offset as usize + state.slice.len(); + + // If we skip the entire file. Don't even try to read from it. 
+ if reader.get_current_block() == reader.metadata().blocks.len() { + (data_scratch, message_scratch) = reader.take_scratches(); + projection_info = reader.take_projection_info(); + state.source.take(); + state.source_idx += 1; + continue; + } + } + + let mut batch = Batch { + row_idx_offset: state.row_idx_offset, + block_start: reader.get_current_block(), + num_rows: 0, + }; + + // We don't yet want to commit these values to the state in case this batch gets + // cancelled. + let mut uncommitted_slice = state.slice.clone(); + let mut uncommitted_row_idx_offset = state.row_idx_offset; + while !state.slice.is_empty() { + let mut is_batch_complete = false; + + match reader.next_record_batch() { + None if batch.num_rows == 0 => break, + + // If we have no more record batches available, we want to send what is + // left. + None => is_batch_complete = true, + Some(record_batch) => { + let rb_num_rows = record_batch?.length()? as usize; + batch.num_rows += rb_num_rows; + + // We need to ensure that we are not overflowing the IdxSize maximum + // capacity. + let rb_num_rows = IdxSize::try_from(rb_num_rows) + .map_err(|_| ROW_COUNT_OVERFLOW_ERR)?; + uncommitted_row_idx_offset = uncommitted_row_idx_offset + .checked_add(rb_num_rows) + .ok_or(ROW_COUNT_OVERFLOW_ERR)?; + }, + } + + let current_block = reader.get_current_block(); + + // Subdivide into batches for large files. + is_batch_complete |= batch.num_rows >= batch_size_limit; + // Subdivide into batches if the file is sliced. + is_batch_complete |= batch.num_rows >= sliced_batch_size_limit; + // Subdivide into batches for small files. + is_batch_complete |= current_block - batch.block_start >= batch_block_limit; + + // Batch blocks such that we send appropriately sized morsels. We guarantee a + // lower bound here, but not an upper bound. 
+ if is_batch_complete { + let batch_slice = slice_take(&mut uncommitted_slice, batch.num_rows); + let batch_slice_len = batch_slice.len(); + let block_range = batch.block_start..current_block; + + let message = BatchMessage { + memslice: source.memslice.clone(), + metadata: source.metadata.clone(), + file_path: source.file_path.clone(), + row_idx_offset: batch.row_idx_offset, + slice: batch_slice, + morsel_seq_base: state.morsel_seq, + block_range, + }; + + if source_token.stop_requested() { + break 'source_loop; + } + + if batch_tx.send(message).await.is_err() { + // This should only happen if the receiver of the decoder + // has broken off, meaning no further input will be needed. + break 'source_loop; + } + + // Commit the changes to the state. + // Now, we know that the a decoder will process it. + // + // This might generate several morsels if the record batch is very large. + state.morsel_seq += batch_slice_len.div_ceil(max_morsel_size) as u64; + state.slice = uncommitted_slice.clone(); + state.row_idx_offset = uncommitted_row_idx_offset; + source.block_offset = current_block; + + batch = Batch { + row_idx_offset: state.row_idx_offset, + block_start: current_block, + num_rows: 0, + }; + } + } + + (data_scratch, message_scratch) = reader.take_scratches(); + projection_info = reader.take_projection_info(); + + state.source.take(); + state.source_idx += 1; + } + + drop(batch_tx); // Inform decoder tasks to stop. 
+ for decoder_task in decoder_tasks { + decoder_task.await?; + } + + PolarsResult::Ok(()) + })); + } +} diff --git a/crates/polars-stream/src/nodes/io_sources/mod.rs b/crates/polars-stream/src/nodes/io_sources/mod.rs new file mode 100644 index 000000000000..ce14ad3b0f7a --- /dev/null +++ b/crates/polars-stream/src/nodes/io_sources/mod.rs @@ -0,0 +1 @@ +pub mod ipc; diff --git a/crates/polars-stream/src/nodes/mod.rs b/crates/polars-stream/src/nodes/mod.rs index 936c0ceb3ada..effebe67c34b 100644 --- a/crates/polars-stream/src/nodes/mod.rs +++ b/crates/polars-stream/src/nodes/mod.rs @@ -5,6 +5,7 @@ pub mod in_memory_sink; pub mod in_memory_source; pub mod input_independent_select; pub mod io_sinks; +pub mod io_sources; pub mod joins; pub mod map; pub mod multiplexer; diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index 95e3ae72224d..063c94081dbc 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -1,10 +1,11 @@ use std::sync::Arc; +use polars_core::frame::DataFrame; use polars_core::prelude::{InitHashMaps, PlHashMap, PlIndexMap}; use polars_core::schema::Schema; use polars_error::{polars_ensure, PolarsResult}; use polars_plan::plans::expr_ir::{ExprIR, OutputName}; -use polars_plan::plans::{AExpr, FunctionIR, IRAggExpr, IR}; +use polars_plan::plans::{AExpr, FileScan, FunctionIR, IRAggExpr, IR}; use polars_plan::prelude::{FileType, SinkType}; use polars_utils::arena::{Arena, Node}; use polars_utils::itertools::Itertools; @@ -314,23 +315,67 @@ pub fn lower_ir( sources: scan_sources, file_info, hive_parts, - output_schema, + output_schema: scan_output_schema, scan_type, - predicate, + mut predicate, file_options, } = v.clone() else { unreachable!(); }; - PhysNodeKind::FileScan { - scan_sources, - file_info, - hive_parts, - output_schema, - scan_type, - predicate, - file_options, + if scan_sources.is_empty() { + // If there are no 
sources, just provide an empty in-memory source with the right + // schema. + PhysNodeKind::InMemorySource { + df: Arc::new(DataFrame::empty_with_schema(output_schema.as_ref())), + } + } else { + if matches!(scan_type, FileScan::Ipc { .. }) { + // @TODO: All the things the IPC source does not support yet. + if hive_parts.is_some() + || scan_sources.is_cloud_url() + || file_options.allow_missing_columns + || file_options.slice.is_some_and(|(offset, _)| offset < 0) + { + todo!(); + } + } + + // If the node itself would just filter on the whole output then there is no real + // reason to do it in the source node itself. + let do_filter_in_separate_node = + predicate.is_some() && matches!(scan_type, FileScan::Ipc { .. }); + + if do_filter_in_separate_node { + assert!(file_options.slice.is_none()); // Invariant of the scan + let predicate = predicate.take().unwrap(); + + let input = phys_sm.insert(PhysNode::new( + output_schema.clone(), + PhysNodeKind::FileScan { + scan_sources, + file_info, + hive_parts, + output_schema: scan_output_schema, + scan_type, + predicate: None, + file_options, + }, + )); + + PhysNodeKind::Filter { input, predicate } + } else { + PhysNodeKind::FileScan { + scan_sources, + file_info, + hive_parts, + output_schema: scan_output_schema, + scan_type, + predicate, + file_options, + } + } } }, diff --git a/crates/polars-stream/src/physical_plan/to_graph.rs b/crates/polars-stream/src/physical_plan/to_graph.rs index befa9c3a93b9..b701696972a9 100644 --- a/crates/polars-stream/src/physical_plan/to_graph.rs +++ b/crates/polars-stream/src/physical_plan/to_graph.rs @@ -367,6 +367,23 @@ fn to_graph_rec<'a>( todo!() } }, + FileScan::Ipc { + options, + cloud_options, + metadata: first_metadata, + } => ctx.graph.add_node( + nodes::io_sources::ipc::IpcSourceNode::new( + scan_sources, + file_info, + hive_parts, + predicate, + options, + cloud_options, + file_options, + first_metadata, + )?, + [], + ), _ => todo!(), } } diff --git 
a/py-polars/tests/unit/io/test_lazy_ipc.py b/py-polars/tests/unit/io/test_lazy_ipc.py index 0d67b6b06f89..ec75d495ce8d 100644 --- a/py-polars/tests/unit/io/test_lazy_ipc.py +++ b/py-polars/tests/unit/io/test_lazy_ipc.py @@ -88,6 +88,7 @@ def test_ipc_list_arg(io_files_path: Path) -> None: assert df.row(0) == ("vegetables", 45, 0.5, 2) +@pytest.mark.may_fail_auto_streaming def test_scan_ipc_local_with_async( capfd: Any, monkeypatch: Any, From 1a8735f6e26abcf76b43e90ab91f9d99b76f2fe9 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 13 Nov 2024 15:18:01 +0100 Subject: [PATCH 08/18] feat(rust): Add dylib (#19759) --- Cargo.lock | 539 +++++++++++++++++++++++++-------- Cargo.toml | 1 + crates/Makefile | 3 +- crates/polars-dylib/Cargo.toml | 25 ++ crates/polars-dylib/README.md | 16 + crates/polars-dylib/src/lib.rs | 15 + crates/polars/Cargo.toml | 9 + 7 files changed, 487 insertions(+), 121 deletions(-) create mode 100644 crates/polars-dylib/Cargo.toml create mode 100644 crates/polars-dylib/README.md create mode 100644 crates/polars-dylib/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index f79742adafb7..281f2845ccfc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -62,9 +62,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.18" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" +checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" [[package]] name = "android-tzdata" @@ -89,15 +89,15 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstyle" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8365de52b16c035ff4fcafe0092ba9390540e3e352870ac09933bebcaa2c8c56" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anyhow" -version = "1.0.92" +version = 
"1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74f37166d7d48a0284b99dd824694c26119c700b53bf0d1540cdb147dbdaaf13" +checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" [[package]] name = "apache-avro" @@ -120,7 +120,7 @@ dependencies = [ "snap", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "typed-builder", "uuid", ] @@ -206,7 +206,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -217,7 +217,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -265,9 +265,9 @@ dependencies = [ [[package]] name = "aws-config" -version = "1.5.9" +version = "1.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d6448cfb224dd6a9b9ac734f58622dd0d4751f3589f3b777345745f46b2eb14" +checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" dependencies = [ "aws-credential-types", "aws-runtime", @@ -368,9 +368,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.48.0" +version = "1.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ded855583fa1d22e88fe39fd6062b062376e50a8211989e07cf5e38d52eb3453" +checksum = "09677244a9da92172c8dc60109b4a9658597d4d298b188dd0018b6a66b410ca4" dependencies = [ "aws-credential-types", "aws-runtime", @@ -390,9 +390,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.49.0" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9177ea1192e6601ae16c7273385690d88a7ed386a00b74a6bc894d12103cd933" +checksum = "81fea2f3a8bb3bd10932ae7ad59cc59f65f270fc9183a7e91f501dc5efbef7ee" dependencies = [ "aws-credential-types", "aws-runtime", @@ -412,9 +412,9 @@ dependencies = [ [[package]] name = 
"aws-sdk-sts" -version = "1.48.0" +version = "1.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "823ef553cf36713c97453e2ddff1eb8f62be7f4523544e2a5db64caf80100f0a" +checksum = "53dcf5e7d9bd1517b8b998e170e650047cea8a2b85fe1835abe3210713e541b7" dependencies = [ "aws-credential-types", "aws-runtime", @@ -574,9 +574,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.7.2" +version = "1.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e086682a53d3aa241192aa110fa8dfce98f2f5ac2ead0de84d41582c7e8fdb96" +checksum = "92165296a47a812b267b4f41032ff8069ab7ff783696d217f0994a0d7ab585cd" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -591,9 +591,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.8" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07c9cdc179e6afbf5d391ab08c85eac817b51c87e1892a5edb5f7bbdc64314b4" +checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" dependencies = [ "base64-simd", "bytes", @@ -794,7 +794,7 @@ checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -845,9 +845,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.31" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2e7962b54006dcfcc61cb72735f4d89bb97061dd6a7ed882ec6b8ee53714c6f" +checksum = "1aeb932158bd710538c73702db6945cb68a8fb08c519e6e12706b94263b36db8" dependencies = [ "jobserver", "libc", @@ -1037,9 +1037,9 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0" +checksum = "0ca741a962e1b0bff6d724a1a0958b686406e853bb14061f218562e1896f95e6" 
dependencies = [ "libc", ] @@ -1252,6 +1252,17 @@ dependencies = [ "subtle", ] +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "doc-comment" version = "0.3.3" @@ -1314,7 +1325,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -1369,9 +1380,9 @@ checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" [[package]] name = "fastrand" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" +checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" [[package]] name = "ff" @@ -1432,9 +1443,9 @@ dependencies = [ [[package]] name = "fs4" -version = "0.11.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adc91b3da7f1a7968b00f9f65a4971252f6a927d3cb9eec05d91cbeaff678f9a" +checksum = "e871a4cfa68bb224863b53149d973df1ac8d1ed2fa1d1bfc37ac1bb65dd37207" dependencies = [ "rustix", "windows-sys 0.52.0", @@ -1496,7 +1507,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -1663,9 +1674,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.0" +version = "0.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" +checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" dependencies = [ "allocator-api2", "equivalent", @@ -1916,14 +1927,143 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = "1.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + 
"zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "idna" -version = "0.5.0" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" dependencies = [ - "unicode-bidi", - "unicode-normalization", + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +dependencies = [ + "icu_normalizer", + "icu_properties", ] [[package]] @@ -1933,7 +2073,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "serde", ] @@ -2053,9 +2193,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.161" +version = "0.2.162" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" +checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398" [[package]] name = "libflate" @@ -2143,6 +2283,12 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "litemap" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" + [[package]] name = "lock_api" version = "0.4.12" @@ -2165,7 +2311,7 @@ version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" dependencies = [ - "hashbrown 0.15.0", + "hashbrown 0.15.1", ] [[package]] @@ -2754,7 +2900,7 @@ dependencies = [ "flate2", "futures", "getrandom", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "hex", "indexmap", "itoa", @@ -2826,7 +2972,7 @@ dependencies = [ "comfy-table", "either", "hashbrown 0.14.5", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "indexmap", "ndarray", "num-traits", @@ -2844,7 +2990,7 @@ dependencies = [ "serde", "serde_json", "strum_macros", - "thiserror", + "thiserror 1.0.69", "version_check", "xxhash-rust", ] @@ -2863,6 +3009,20 @@ dependencies = [ "tokio", ] +[[package]] +name = "polars-dylib" +version = "0.44.2" +dependencies = [ + "polars", + "polars-arrow", + "polars-core", + "polars-expr", + "polars-lazy", + "polars-mem-engine", + "polars-plan", + "polars-python", +] + [[package]] name = "polars-error" version = "0.44.2" @@ -2872,7 +3032,7 @@ dependencies = [ "polars-arrow-format", "regex", "simdutf8", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -2881,7 +3041,7 @@ version = "0.44.2" dependencies = [ "ahash", "bitflags", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "num-traits", "once_cell", 
"polars-arrow", @@ -2922,7 +3082,7 @@ dependencies = [ "fs4", "futures", "glob", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "home", "itoa", "memchr", @@ -2963,7 +3123,7 @@ dependencies = [ "chrono", "chrono-tz", "fallible-streaming-iterator", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "indexmap", "itoa", "num-traits", @@ -3036,7 +3196,7 @@ dependencies = [ "chrono", "chrono-tz", "either", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "hex", "indexmap", "jsonpath_lib_polars_vendor", @@ -3074,7 +3234,7 @@ dependencies = [ "fallible-streaming-iterator", "flate2", "futures", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "lz4", "lz4_flex", "num-traits", @@ -3110,7 +3270,7 @@ dependencies = [ "crossbeam-queue", "enum_dispatch", "futures", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "num-traits", "polars-arrow", "polars-compute", @@ -3140,7 +3300,7 @@ dependencies = [ "ciborium", "either", "futures", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "libloading", "memmap2", "num-traits", @@ -3196,7 +3356,7 @@ dependencies = [ "pyo3", "recursive", "serde_json", - "thiserror", + "thiserror 1.0.69", "version_check", ] @@ -3297,7 +3457,7 @@ dependencies = [ "bytemuck", "bytes", "compact_str", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "indexmap", "libc", "memmap2", @@ -3385,9 +3545,9 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa37f80ca58604976033fae9515a8a2989fc13797d953f7c04fb8fa36a11f205" +checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" dependencies = [ "cc", ] @@ -3453,7 +3613,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -3466,14 +3626,14 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] name = "quad-rand" -version = "0.2.2" +version = "0.2.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b76f1009795ca44bb5aaae8fd3f18953e209259c33d9b059b1f53d58ab7511db" +checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" [[package]] name = "quick-xml" @@ -3498,9 +3658,9 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684" +checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef" dependencies = [ "bytes", "pin-project-lite", @@ -3509,33 +3669,36 @@ dependencies = [ "rustc-hash 2.0.0", "rustls 0.23.16", "socket2", - "thiserror", + "thiserror 2.0.3", "tokio", "tracing", ] [[package]] name = "quinn-proto" -version = "0.11.8" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6" +checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" dependencies = [ "bytes", + "getrandom", "rand", "ring", "rustc-hash 2.0.0", "rustls 0.23.16", + "rustls-pki-types", "slab", - "thiserror", + "thiserror 2.0.3", "tinyvec", "tracing", + "web-time", ] [[package]] name = "quinn-udp" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e346e016eacfff12233c243718197ca12f148c84e1e84268a896699b41c71780" +checksum = "7d5a626c6807713b15cac82a6acaccd6043c9a5408c24baae07611fec3f243da" dependencies = [ "cfg_aliases", "libc", @@ -3665,7 +3828,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -3694,7 +3857,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 
2.0.87", ] [[package]] @@ -3711,9 +3874,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -3845,9 +4008,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.38" +version = "0.38.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa260229e6538e52293eeb577aabd09945a09d6d9cc0fc550ed7529056c2e32a" +checksum = "99e4ea3e1cdc4b559b8e5650f9c8e5998e3e5c1343b4eaf034565f32318d63c0" dependencies = [ "bitflags", "errno", @@ -3930,6 +4093,9 @@ name = "rustls-pki-types" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" +dependencies = [ + "web-time", +] [[package]] name = "rustls-webpki" @@ -4073,9 +4239,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.12.0" +version = "2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea4a292869320c0272d7bc55a5a6aafaff59b4f63404a003887b679a2e05b4b6" +checksum = "fa39c7303dc58b5543c94d22c1766b0d31f2ee58306363ea622b10bbc075eaa2" dependencies = [ "core-foundation-sys", "libc", @@ -4089,9 +4255,9 @@ checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" [[package]] name = "serde" -version = "1.0.214" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] @@ -4107,13 +4273,13 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.214" +version = "1.0.215" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -4304,6 +4470,12 @@ dependencies = [ "log", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "stacker" version = "0.1.17" @@ -4360,7 +4532,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -4382,9 +4554,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.86" +version = "2.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e89275301d38033efb81a6e60e3497e734dfcc62571f2854bf4b16690398824c" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" dependencies = [ "proc-macro2", "quote", @@ -4400,6 +4572,17 @@ dependencies = [ "futures-core", ] +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "sysinfo" version = "0.32.0" @@ -4427,9 +4610,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.13.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b" +checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" dependencies = [ "cfg-if", "fastrand", @@ -4440,22 +4623,42 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.66" 
+version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa" +dependencies = [ + "thiserror-impl 2.0.3", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d171f59dbaa811dbbb1aee1e73db92ec2b122911a48e1390dfe327a821ddede" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ - "thiserror-impl", + "proc-macro2", + "quote", + "syn 2.0.87", ] [[package]] name = "thiserror-impl" -version = "1.0.66" +version = "2.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b08be0f17bd307950653ce45db00cd31200d82b624b36e181337d9c7d92765b5" +checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -4488,6 +4691,16 @@ dependencies = [ "time-core", ] +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -4515,9 +4728,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.41.0" +version = "1.41.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb" +checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" dependencies = [ "backtrace", "bytes", @@ 
-4538,7 +4751,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -4601,7 +4814,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -4646,7 +4859,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -4661,27 +4874,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" -[[package]] -name = "unicode-bidi" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" - [[package]] name = "unicode-ident" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" -[[package]] -name = "unicode-normalization" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" -dependencies = [ - "tinyvec", -] - [[package]] name = "unicode-reverse" version = "1.0.9" @@ -4717,9 +4915,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.2" +version = "2.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" +checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada" dependencies = [ "form_urlencoded", "idna", @@ -4732,6 +4930,18 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "uuid" version = "1.11.0" @@ -4813,7 +5023,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", "wasm-bindgen-shared", ] @@ -4847,7 +5057,7 @@ checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4881,6 +5091,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "winapi" version = "0.3.9" @@ -4951,7 +5171,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -4962,7 +5182,7 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -5143,6 +5363,18 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = 
"writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + [[package]] name = "x11rb" version = "0.13.1" @@ -5172,6 +5404,30 @@ version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a5cbf750400958819fb6178eaa83bee5cd9c29a26a40cc241df8c70fdd46984" +[[package]] +name = "yoke" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.7.35" @@ -5190,7 +5446,28 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", +] + +[[package]] +name = "zerofrom" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", + "synstructure", ] [[package]] @@ -5199,6 +5476,28 @@ version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +[[package]] +name = "zerovec" +version = "0.10.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "zstd" version = "0.13.2" diff --git a/Cargo.toml b/Cargo.toml index 35595086f981..34502bb5e9ee 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,6 +94,7 @@ zstd = "0.13" polars = { version = "0.44.2", path = "crates/polars", default-features = false } polars-compute = { version = "0.44.2", path = "crates/polars-compute", default-features = false } polars-core = { version = "0.44.2", path = "crates/polars-core", default-features = false } +polars-dylib = { version = "0.44.2", path = "crates/polars-dyn", default-features = false } polars-error = { version = "0.44.2", path = "crates/polars-error", default-features = false } polars-expr = { version = "0.44.2", path = "crates/polars-expr", default-features = false } polars-ffi = { version = "0.44.2", path = "crates/polars-ffi", default-features = false } diff --git a/crates/Makefile b/crates/Makefile index 28622ee061f5..6b3cf3372149 100644 --- a/crates/Makefile +++ b/crates/Makefile @@ -152,5 +152,6 @@ check-wasm: ## Check wasm build without supported features --exclude-features parquet \ --exclude-features performant \ --exclude-features streaming \ - --exclude-features http \ + --exclude-features http \ + --exclude-features full \ --exclude-features test diff --git a/crates/polars-dylib/Cargo.toml b/crates/polars-dylib/Cargo.toml new file mode 100644 index 000000000000..5cc963f2d701 --- /dev/null +++ b/crates/polars-dylib/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "polars-dylib" +version.workspace = true +authors.workspace = true 
+edition.workspace = true +homepage.workspace = true +license.workspace = true +repository.workspace = true + +[lib] +crate-type = ["dylib", "rlib"] + +[dependencies] +arrow = { workspace = true, optional = true, features = ["io_flight"] } +polars = { workspace = true, features = ["full"] } +polars-core = { workspace = true, optional = true } +polars-expr = { workspace = true, optional = true } +polars-lazy = { workspace = true, optional = true } +polars-mem-engine = { workspace = true, optional = true } +polars-plan = { workspace = true, optional = true } +polars-python = { workspace = true, optional = true, default-features = true } + +[features] +private = ["polars-plan", "arrow", "polars-core", "polars-lazy", "polars-expr", "polars-mem-engine"] +python = ["polars-plan?/python", "polars-python", "polars-lazy?/python"] diff --git a/crates/polars-dylib/README.md b/crates/polars-dylib/README.md new file mode 100644 index 000000000000..3fd4b30de8f7 --- /dev/null +++ b/crates/polars-dylib/README.md @@ -0,0 +1,16 @@ +# Polars dynamic library + +```toml +# Cargo.toml +[workspace.dependencies.polars] +package = "polars-dylib" +``` + +```toml +# .cargo/config.toml +[build] +rustflags = [ + "-C", + "prefer-dynamic", +] +``` diff --git a/crates/polars-dylib/src/lib.rs b/crates/polars-dylib/src/lib.rs new file mode 100644 index 000000000000..907ce175aec8 --- /dev/null +++ b/crates/polars-dylib/src/lib.rs @@ -0,0 +1,15 @@ +#[cfg(feature = "private")] +pub use arrow as _arrow; +pub use polars::*; +#[cfg(feature = "private")] +pub use polars_core as _core; +#[cfg(feature = "private")] +pub use polars_expr as _expr; +#[cfg(feature = "private")] +pub use polars_lazy as _lazy; +#[cfg(feature = "private")] +pub use polars_mem_engine as _mem_engine; +#[cfg(feature = "private")] +pub use polars_plan as _plan; +#[cfg(feature = "python")] +pub use polars_python as _python; diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 9ff45610a3c7..7c054c21f59b 100644 --- 
a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -417,12 +417,21 @@ docs-selection = [ "replace", "approx_unique", "unique_counts", + "polars_cloud", + "serde", + "ir_serde", + "cloud", + "async", + "cloud_write", ] bench = [ "lazy", ] +# All features except python +full = ["docs-selection", "performant", "fmt"] + [package.metadata.docs.rs] # all-features = true features = ["docs-selection"] From 9f791007ae702cd1c63c36f702523df0626e3793 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 13 Nov 2024 19:33:34 +0100 Subject: [PATCH 09/18] Python Polars 1.13.1 (#19768) --- Cargo.lock | 2 +- py-polars/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 281f2845ccfc..d00cfa7ff0a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3554,7 +3554,7 @@ dependencies = [ [[package]] name = "py-polars" -version = "1.13.0" +version = "1.13.1" dependencies = [ "jemallocator", "libc", diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index d17218a3b6cd..a2ff3d9882da 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-polars" -version = "1.13.0" +version = "1.13.1" edition = "2021" [lib] From 420c0d99b210816e89a01c041354d322424e2137 Mon Sep 17 00:00:00 2001 From: Alisa Petrova <60570090+sn0rkmaiden@users.noreply.github.com> Date: Thu, 14 Nov 2024 09:55:35 +0300 Subject: [PATCH 10/18] fix(python): Fixed typo in file lazy.py (#19769) --- py-polars/polars/functions/lazy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index dfd3f607791c..30185cc6a586 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -1030,7 +1030,7 @@ def map_groups( The output for group `1` can be understood as follows: - - group `1` contains Series `'a': [1, 3]` and `'b': [4, 5]` + - group `1` contains Series `'a': [1, 3]` and `'b': [5, 6]` - applying the function to
those lists of Series, one gets the output `[1 / 4 + 5, 3 / 4 + 6]`, i.e. `[5.25, 6.75]` """ From b7fce25bdeef0b0a7fddc708763568d25ba166b1 Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Thu, 14 Nov 2024 21:28:16 +1100 Subject: [PATCH 11/18] fix: Fix incorrect lazy schema for explode on array columns (#19776) --- .../polars-plan/src/dsl/function_expr/array.rs | 6 +++++- crates/polars-plan/src/plans/aexpr/schema.rs | 13 ++++++++----- .../simplify_expr/simplify_functions.rs | 6 ------ py-polars/tests/unit/operations/test_join.py | 10 ++++++++++ py-polars/tests/unit/test_schema.py | 18 ++++++++++++++++++ 5 files changed, 41 insertions(+), 12 deletions(-) diff --git a/crates/polars-plan/src/dsl/function_expr/array.rs b/crates/polars-plan/src/dsl/function_expr/array.rs index 2ecd016981e3..08333beb3893 100644 --- a/crates/polars-plan/src/dsl/function_expr/array.rs +++ b/crates/polars-plan/src/dsl/function_expr/array.rs @@ -132,7 +132,7 @@ impl From for SpecialEq> { #[cfg(feature = "array_count")] CountMatches => map_as_slice!(count_matches), Shift => map_as_slice!(shift), - Explode => unreachable!(), + Explode => map_as_slice!(explode), } } } @@ -253,3 +253,7 @@ pub(super) fn shift(s: &[Column]) -> PolarsResult { ca.array_shift(n.as_materialized_series()).map(Column::from) } + +fn explode(c: &[Column]) -> PolarsResult { + c[0].explode() +} diff --git a/crates/polars-plan/src/plans/aexpr/schema.rs b/crates/polars-plan/src/plans/aexpr/schema.rs index 6c1b675b2bd8..6547c391eaae 100644 --- a/crates/polars-plan/src/plans/aexpr/schema.rs +++ b/crates/polars-plan/src/plans/aexpr/schema.rs @@ -84,11 +84,14 @@ impl AExpr { .get(*expr) .to_field_impl(schema, ctx, arena, &mut false)?; - if let List(inner) = field.dtype() { - Ok(Field::new(field.name().clone(), *inner.clone())) - } else { - Ok(field) - } + let field = match field.dtype() { + List(inner) => Field::new(field.name().clone(), *inner.clone()), + #[cfg(feature = "dtype-array")] + Array(inner, ..) 
=> Field::new(field.name().clone(), *inner.clone()), + _ => field, + }; + + Ok(field) }, Alias(expr, name) => Ok(Field::new( name.clone(), diff --git a/crates/polars-plan/src/plans/optimizer/simplify_expr/simplify_functions.rs b/crates/polars-plan/src/plans/optimizer/simplify_expr/simplify_functions.rs index 03f274e5211a..2b5493c62e6b 100644 --- a/crates/polars-plan/src/plans/optimizer/simplify_expr/simplify_functions.rs +++ b/crates/polars-plan/src/plans/optimizer/simplify_expr/simplify_functions.rs @@ -7,12 +7,6 @@ pub(super) fn optimize_functions( expr_arena: &mut Arena, ) -> PolarsResult> { let out = match function { - #[cfg(feature = "dtype-array")] - // arr.explode() -> explode() - FunctionExpr::ArrayExpr(ArrayFunction::Explode) => { - let input_node = input[0].node(); - Some(AExpr::Explode(input_node)) - }, // is_null().any() -> null_count() > 0 // is_not_null().any() -> null_count() < len() // CORRECTNESS: we can ignore 'ignore_nulls' since is_null/is_not_null never produces NULLS diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index c65be5ad61c0..93395fafbdd5 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -1113,3 +1113,13 @@ def test_join_key_type_coercion_19597() -> None: left.join( right, left_on=pl.col("a") * 2, right_on=pl.col("a") * 2 ).collect_schema() + + +def test_array_explode_join_19763() -> None: + q = pl.LazyFrame().select( + pl.lit(pl.Series([[1], [2]], dtype=pl.Array(pl.Int64, 1))).explode().alias("k") + ) + + q = q.join(pl.LazyFrame({"k": [1, 2]}), on="k") + + assert_frame_equal(q.collect().sort("k"), pl.DataFrame({"k": [1, 2]})) diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py index a8f9e43d84c0..43e8840458d3 100644 --- a/py-polars/tests/unit/test_schema.py +++ b/py-polars/tests/unit/test_schema.py @@ -278,3 +278,21 @@ def test_lf_window_schema(expr: pl.Expr, mapping_strategy: str) 
-> None: ) assert q.collect_schema() == q.collect().collect_schema() + + +def test_lf_explode_schema() -> None: + lf = pl.LazyFrame({"k": [1], "x": pl.Series([[1]], dtype=pl.Array(pl.Int64, 1))}) + + q = lf.select(pl.col("x").explode()) + assert q.collect_schema() == {"x": pl.Int64} + + q = lf.select(pl.col("x").arr.explode()) + assert q.collect_schema() == {"x": pl.Int64} + + lf = pl.LazyFrame({"k": [1], "x": pl.Series([[1]], dtype=pl.List(pl.Int64))}) + + q = lf.select(pl.col("x").explode()) + assert q.collect_schema() == {"x": pl.Int64} + + q = lf.select(pl.col("x").list.explode()) + assert q.collect_schema() == {"x": pl.Int64} From 9af7ccdb1b3b22f11a000d5003e15bc29af2a654 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Thu, 14 Nov 2024 05:31:06 -0500 Subject: [PATCH 12/18] feat: A different approach to warning users of fork() issues with Polars (#19197) Co-authored-by: Itamar Turner-Trauring --- py-polars/polars/__init__.py | 28 +++++++++++++++++++ py-polars/tests/unit/test_polars_import.py | 31 ++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 83ea52acc822..eb33f23bf53f 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -429,3 +429,31 @@ def __getattr__(name: str) -> Any: msg = f"module {__name__!r} has no attribute {name!r}" raise AttributeError(msg) + + +# fork() breaks Polars thread pool, so warn users who might be doing this. +def __install_postfork_hook() -> None: + message = """\ +Using fork() can cause Polars to deadlock in the child process. +In addition, using fork() with Python in general is a recipe for mysterious +deadlocks and crashes. + +The most likely reason you are seeing this error is because you are using the +multiprocessing module on Linux, which uses fork() by default. This will be +fixed in Python 3.14. Until then, you want to use the "spawn" context instead. 
+ +See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details. +""" + + def before_hook() -> None: + import warnings + + warnings.warn(message, RuntimeWarning, stacklevel=2) + + import os + + if hasattr(os, "register_at_fork"): + os.register_at_fork(before=before_hook) + + +__install_postfork_hook() diff --git a/py-polars/tests/unit/test_polars_import.py b/py-polars/tests/unit/test_polars_import.py index fa1779de3478..2686c094999b 100644 --- a/py-polars/tests/unit/test_polars_import.py +++ b/py-polars/tests/unit/test_polars_import.py @@ -1,6 +1,8 @@ from __future__ import annotations import compileall +import multiprocessing +import os import subprocess import sys from pathlib import Path @@ -97,3 +99,32 @@ def test_polars_import() -> None: import_time_ms = polars_import_time // 1_000 msg = f"Possible import speed regression; took {import_time_ms}ms\n{df_import}" raise AssertionError(msg) + + +def run_in_child() -> int: + return 123 + + +@pytest.mark.skipif(not hasattr(os, "fork"), reason="Requires fork()") +def test_fork_safety(recwarn: pytest.WarningsRecorder) -> None: + def get_num_fork_warnings() -> int: + fork_warnings = 0 + for warning in recwarn: + if issubclass(warning.category, RuntimeWarning) and str( + warning.message + ).startswith("Using fork() can cause Polars"): + fork_warnings += 1 + return fork_warnings + + assert get_num_fork_warnings() == 0 + + # Using forkserver and spawn context should not do any of our warning: + for context in ["spawn", "forkserver"]: + with multiprocessing.get_context(context).Pool(1) as pool: + assert pool.apply(run_in_child) == 123 + assert get_num_fork_warnings() == 0 + + # Using fork()-based multiprocessing should raise a warning: + with multiprocessing.get_context("fork").Pool(1) as pool: + assert pool.apply(run_in_child) == 123 + assert get_num_fork_warnings() == 1 From 1e262ba431bfbc1f7000cf037a894a71d5b30aa7 Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Thu, 14 Nov 2024 23:09:10 +1100 Subject: [PATCH 
13/18] fix: Fix incorrect filter after right-join on LazyFrame (#19775) --- .../optimizer/predicate_pushdown/join.rs | 33 ++++++++++--------- py-polars/tests/unit/test_predicates.py | 15 +++++++++ 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/join.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/join.rs index b79de0e2b959..7e4710e709a1 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/join.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/join.rs @@ -66,22 +66,25 @@ fn should_block_join_specific( } } +/// Returns a tuple indicating whether predicates should be blocked for either side based on the +/// join type. +/// +/// * `true` indicates that predicates must not be pushed to that side fn join_produces_null(how: &JoinType) -> LeftRight { - #[cfg(feature = "asof_join")] - { - match how { - JoinType::Left => LeftRight(false, true), - JoinType::Full { .. } | JoinType::Cross | JoinType::AsOf(_) => LeftRight(true, true), - _ => LeftRight(false, false), - } - } - #[cfg(not(feature = "asof_join"))] - { - match how { - JoinType::Left => LeftRight(false, true), - JoinType::Full { .. } | JoinType::Cross => LeftRight(true, true), - _ => LeftRight(false, false), - } + match how { + JoinType::Left => LeftRight(false, true), + JoinType::Right => LeftRight(true, false), + + JoinType::Full { .. } => LeftRight(true, true), + JoinType::Cross => LeftRight(true, true), + #[cfg(feature = "asof_join")] + JoinType::AsOf(_) => LeftRight(true, true), + + JoinType::Inner => LeftRight(false, false), + #[cfg(feature = "semi_anti_join")] + JoinType::Semi | JoinType::Anti => LeftRight(false, false), + #[cfg(feature = "iejoin")] + JoinType::IEJoin(..) 
=> LeftRight(false, false), } } diff --git a/py-polars/tests/unit/test_predicates.py b/py-polars/tests/unit/test_predicates.py index e752bacdf81d..e8f0be927cb9 100644 --- a/py-polars/tests/unit/test_predicates.py +++ b/py-polars/tests/unit/test_predicates.py @@ -553,3 +553,18 @@ def test_predicate_pushdown_struct_unnest_19632() -> None: q.collect(), pl.DataFrame({"a": 1, "count": 1}, schema={"a": pl.Int64, "count": pl.UInt32}), ) + + +def test_predicate_pushdown_right_join_19772() -> None: + left = pl.LazyFrame({"k": [1], "v": [7]}) + right = pl.LazyFrame({"k": [1, 2]}) + + q = left.join(right, on="k", how="right").filter(pl.col("v") == 7) + + plan = q.explain() + assert plan.startswith("FILTER") + + expect = pl.DataFrame({"v": 7, "k": 1}) + + assert_frame_equal(q.collect(no_optimization=True), expect) + assert_frame_equal(q.collect(), expect) From 869d1b93b52bdde17bc8fede4d81c1aee4324a87 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Thu, 14 Nov 2024 07:09:28 -0500 Subject: [PATCH 14/18] fix(python): Release the GIL in Python APIs, part 2 of 2 (#19762) Co-authored-by: Itamar Turner-Trauring --- crates/polars-python/src/cloud.rs | 4 +- .../src/dataframe/construction.rs | 4 +- crates/polars-python/src/dataframe/export.rs | 28 +- crates/polars-python/src/dataframe/general.rs | 330 +++++++++++------- crates/polars-python/src/functions/range.rs | 3 +- .../src/interop/arrow/to_rust.rs | 26 +- .../src/interop/numpy/to_numpy_df.rs | 1 + .../src/interop/numpy/to_numpy_series.rs | 7 +- crates/polars-python/src/map/mod.rs | 13 +- crates/polars-python/src/map/series.rs | 16 +- 10 files changed, 259 insertions(+), 173 deletions(-) diff --git a/crates/polars-python/src/cloud.rs b/crates/polars-python/src/cloud.rs index 39410a6fa7a1..19d4f6dfda07 100644 --- a/crates/polars-python/src/cloud.rs +++ b/crates/polars-python/src/cloud.rs @@ -49,9 +49,7 @@ pub fn _execute_ir_plan_with_gpu(ir_plan_ser: Vec, py: Python) -> PyResult

>) -> PyResult { - let df = interop::arrow::to_rust::to_rust_df(&rb)?; + pub fn from_arrow_record_batches(py: Python, rb: Vec>) -> PyResult { + let df = interop::arrow::to_rust::to_rust_df(py, &rb)?; Ok(Self::from(df)) } } diff --git a/crates/polars-python/src/dataframe/export.rs b/crates/polars-python/src/dataframe/export.rs index b32ad3d6afb3..36037865feb2 100644 --- a/crates/polars-python/src/dataframe/export.rs +++ b/crates/polars-python/src/dataframe/export.rs @@ -79,19 +79,17 @@ impl PyDataFrame { } #[allow(clippy::wrong_self_convention)] - pub fn to_arrow(&mut self, compat_level: PyCompatLevel) -> PyResult> { - self.df.align_chunks_par(); - Python::with_gil(|py| { - let pyarrow = py.import_bound("pyarrow")?; - let names = self.df.get_column_names_str(); + pub fn to_arrow(&mut self, py: Python, compat_level: PyCompatLevel) -> PyResult> { + py.allow_threads(|| self.df.align_chunks_par()); + let pyarrow = py.import_bound("pyarrow")?; + let names = self.df.get_column_names_str(); - let rbs = self - .df - .iter_chunks(compat_level.0, true) - .map(|rb| interop::arrow::to_py::to_py_rb(&rb, &names, py, &pyarrow)) - .collect::>()?; - Ok(rbs) - }) + let rbs = self + .df + .iter_chunks(compat_level.0, true) + .map(|rb| interop::arrow::to_py::to_py_rb(&rb, &names, py, &pyarrow)) + .collect::>()?; + Ok(rbs) } /// Create a `Vec` of PyArrow RecordBatch instances. @@ -100,8 +98,8 @@ impl PyDataFrame { /// since those can't be converted correctly via PyArrow. The calling Python /// code should make sure these are not included. 
#[allow(clippy::wrong_self_convention)] - pub fn to_pandas(&mut self) -> PyResult> { - self.df.as_single_chunk_par(); + pub fn to_pandas(&mut self, py: Python) -> PyResult> { + py.allow_threads(|| self.df.as_single_chunk_par()); Python::with_gil(|py| { let pyarrow = py.import_bound("pyarrow")?; let names = self.df.get_column_names_str(); @@ -154,7 +152,7 @@ impl PyDataFrame { py: Python<'py>, requested_schema: Option, ) -> PyResult> { - self.df.align_chunks_par(); + py.allow_threads(|| self.df.align_chunks_par()); dataframe_to_stream(&self.df, py) } } diff --git a/crates/polars-python/src/dataframe/general.rs b/crates/polars-python/src/dataframe/general.rs index 8f2321d103fd..e866e7db1004 100644 --- a/crates/polars-python/src/dataframe/general.rs +++ b/crates/polars-python/src/dataframe/general.rs @@ -45,67 +45,87 @@ impl PyDataFrame { .collect() } - pub fn add(&self, s: &PySeries) -> PyResult { - let df = (&self.df + &s.series).map_err(PyPolarsErr::from)?; + pub fn add(&self, py: Python, s: &PySeries) -> PyResult { + let df = py + .allow_threads(|| &self.df + &s.series) + .map_err(PyPolarsErr::from)?; Ok(df.into()) } - pub fn sub(&self, s: &PySeries) -> PyResult { - let df = (&self.df - &s.series).map_err(PyPolarsErr::from)?; + pub fn sub(&self, py: Python, s: &PySeries) -> PyResult { + let df = py + .allow_threads(|| &self.df - &s.series) + .map_err(PyPolarsErr::from)?; Ok(df.into()) } - pub fn div(&self, s: &PySeries) -> PyResult { - let df = (&self.df / &s.series).map_err(PyPolarsErr::from)?; + pub fn div(&self, py: Python, s: &PySeries) -> PyResult { + let df = py + .allow_threads(|| &self.df / &s.series) + .map_err(PyPolarsErr::from)?; Ok(df.into()) } - pub fn mul(&self, s: &PySeries) -> PyResult { - let df = (&self.df * &s.series).map_err(PyPolarsErr::from)?; + pub fn mul(&self, py: Python, s: &PySeries) -> PyResult { + let df = py + .allow_threads(|| &self.df * &s.series) + .map_err(PyPolarsErr::from)?; Ok(df.into()) } - pub fn rem(&self, s: &PySeries) -> 
PyResult { - let df = (&self.df % &s.series).map_err(PyPolarsErr::from)?; + pub fn rem(&self, py: Python, s: &PySeries) -> PyResult { + let df = py + .allow_threads(|| &self.df % &s.series) + .map_err(PyPolarsErr::from)?; Ok(df.into()) } - pub fn add_df(&self, s: &Self) -> PyResult { - let df = (&self.df + &s.df).map_err(PyPolarsErr::from)?; + pub fn add_df(&self, py: Python, s: &Self) -> PyResult { + let df = py + .allow_threads(|| &self.df + &s.df) + .map_err(PyPolarsErr::from)?; Ok(df.into()) } - pub fn sub_df(&self, s: &Self) -> PyResult { - let df = (&self.df - &s.df).map_err(PyPolarsErr::from)?; + pub fn sub_df(&self, py: Python, s: &Self) -> PyResult { + let df = py + .allow_threads(|| &self.df - &s.df) + .map_err(PyPolarsErr::from)?; Ok(df.into()) } - pub fn div_df(&self, s: &Self) -> PyResult { - let df = (&self.df / &s.df).map_err(PyPolarsErr::from)?; + pub fn div_df(&self, py: Python, s: &Self) -> PyResult { + let df = py + .allow_threads(|| &self.df / &s.df) + .map_err(PyPolarsErr::from)?; Ok(df.into()) } - pub fn mul_df(&self, s: &Self) -> PyResult { - let df = (&self.df * &s.df).map_err(PyPolarsErr::from)?; + pub fn mul_df(&self, py: Python, s: &Self) -> PyResult { + let df = py + .allow_threads(|| &self.df * &s.df) + .map_err(PyPolarsErr::from)?; Ok(df.into()) } - pub fn rem_df(&self, s: &Self) -> PyResult { - let df = (&self.df % &s.df).map_err(PyPolarsErr::from)?; + pub fn rem_df(&self, py: Python, s: &Self) -> PyResult { + let df = py + .allow_threads(|| &self.df % &s.df) + .map_err(PyPolarsErr::from)?; Ok(df.into()) } #[pyo3(signature = (n, with_replacement, shuffle, seed=None))] pub fn sample_n( &self, + py: Python, n: &PySeries, with_replacement: bool, shuffle: bool, seed: Option, ) -> PyResult { - let df = self - .df - .sample_n(&n.series, with_replacement, shuffle, seed) + let df = py + .allow_threads(|| self.df.sample_n(&n.series, with_replacement, shuffle, seed)) .map_err(PyPolarsErr::from)?; Ok(df.into()) } @@ -113,14 +133,17 @@ impl 
PyDataFrame { #[pyo3(signature = (frac, with_replacement, shuffle, seed=None))] pub fn sample_frac( &self, + py: Python, frac: &PySeries, with_replacement: bool, shuffle: bool, seed: Option, ) -> PyResult { - let df = self - .df - .sample_frac(&frac.series, with_replacement, shuffle, seed) + let df = py + .allow_threads(|| { + self.df + .sample_frac(&frac.series, with_replacement, shuffle, seed) + }) .map_err(PyPolarsErr::from)?; Ok(df.into()) } @@ -183,34 +206,41 @@ impl PyDataFrame { self.df.is_empty() } - pub fn hstack(&self, columns: Vec) -> PyResult { + pub fn hstack(&self, py: Python, columns: Vec) -> PyResult { let columns = columns.to_series(); // @scalar-opt let columns = columns.into_iter().map(Into::into).collect::>(); - let df = self.df.hstack(&columns).map_err(PyPolarsErr::from)?; + let df = py + .allow_threads(|| self.df.hstack(&columns)) + .map_err(PyPolarsErr::from)?; Ok(df.into()) } - pub fn hstack_mut(&mut self, columns: Vec) -> PyResult<()> { + pub fn hstack_mut(&mut self, py: Python, columns: Vec) -> PyResult<()> { let columns = columns.to_series(); // @scalar-opt let columns = columns.into_iter().map(Into::into).collect::>(); - self.df.hstack_mut(&columns).map_err(PyPolarsErr::from)?; + py.allow_threads(|| self.df.hstack_mut(&columns)) + .map_err(PyPolarsErr::from)?; Ok(()) } - pub fn vstack(&self, other: &PyDataFrame) -> PyResult { - let df = self.df.vstack(&other.df).map_err(PyPolarsErr::from)?; + pub fn vstack(&self, py: Python, other: &PyDataFrame) -> PyResult { + let df = py + .allow_threads(|| self.df.vstack(&other.df)) + .map_err(PyPolarsErr::from)?; Ok(df.into()) } - pub fn vstack_mut(&mut self, other: &PyDataFrame) -> PyResult<()> { - self.df.vstack_mut(&other.df).map_err(PyPolarsErr::from)?; + pub fn vstack_mut(&mut self, py: Python, other: &PyDataFrame) -> PyResult<()> { + py.allow_threads(|| self.df.vstack_mut(&other.df)) + .map_err(PyPolarsErr::from)?; Ok(()) } - pub fn extend(&mut self, other: &PyDataFrame) -> PyResult<()> { - 
self.df.extend(&other.df).map_err(PyPolarsErr::from)?; + pub fn extend(&mut self, py: Python, other: &PyDataFrame) -> PyResult<()> { + py.allow_threads(|| self.df.extend(&other.df)) + .map_err(PyPolarsErr::from)?; Ok(()) } @@ -254,10 +284,9 @@ impl PyDataFrame { Ok(series) } - pub fn select(&self, columns: Vec) -> PyResult { - let df = self - .df - .select(columns.iter().map(|x| &**x)) + pub fn select(&self, py: Python, columns: Vec) -> PyResult { + let df = py + .allow_threads(|| self.df.select(columns.iter().map(|x| &**x))) .map_err(PyPolarsErr::from)?; Ok(PyDataFrame::new(df)) } @@ -297,46 +326,55 @@ impl PyDataFrame { } #[pyo3(signature = (offset, length=None))] - pub fn slice(&self, offset: i64, length: Option) -> Self { - let df = self - .df - .slice(offset, length.unwrap_or_else(|| self.df.height())); + pub fn slice(&self, py: Python, offset: i64, length: Option) -> Self { + let df = py.allow_threads(|| { + self.df + .slice(offset, length.unwrap_or_else(|| self.df.height())) + }); df.into() } - pub fn head(&self, n: usize) -> Self { - let df = self.df.head(Some(n)); + pub fn head(&self, py: Python, n: usize) -> Self { + let df = py.allow_threads(|| self.df.head(Some(n))); PyDataFrame::new(df) } - pub fn tail(&self, n: usize) -> Self { - let df = self.df.tail(Some(n)); + pub fn tail(&self, py: Python, n: usize) -> Self { + let df = py.allow_threads(|| self.df.tail(Some(n))); PyDataFrame::new(df) } - pub fn is_unique(&self) -> PyResult { - let mask = self.df.is_unique().map_err(PyPolarsErr::from)?; + pub fn is_unique(&self, py: Python) -> PyResult { + let mask = py + .allow_threads(|| self.df.is_unique()) + .map_err(PyPolarsErr::from)?; Ok(mask.into_series().into()) } - pub fn is_duplicated(&self) -> PyResult { - let mask = self.df.is_duplicated().map_err(PyPolarsErr::from)?; + pub fn is_duplicated(&self, py: Python) -> PyResult { + let mask = py + .allow_threads(|| self.df.is_duplicated()) + .map_err(PyPolarsErr::from)?; Ok(mask.into_series().into()) } - pub 
fn equals(&self, other: &PyDataFrame, null_equal: bool) -> bool { + pub fn equals(&self, py: Python, other: &PyDataFrame, null_equal: bool) -> bool { if null_equal { - self.df.equals_missing(&other.df) + py.allow_threads(|| self.df.equals_missing(&other.df)) } else { - self.df.equals(&other.df) + py.allow_threads(|| self.df.equals(&other.df)) } } #[pyo3(signature = (name, offset=None))] - pub fn with_row_index(&self, name: &str, offset: Option) -> PyResult { - let df = self - .df - .with_row_index(name.into(), offset) + pub fn with_row_index( + &self, + py: Python, + name: &str, + offset: Option, + ) -> PyResult { + let df = py + .allow_threads(|| self.df.with_row_index(name.into(), offset)) .map_err(PyPolarsErr::from)?; Ok(df.into()) } @@ -398,6 +436,7 @@ impl PyDataFrame { #[pyo3(signature = (on, index, value_name=None, variable_name=None))] pub fn unpivot( &self, + py: Python, on: Vec, index: Vec, value_name: Option<&str>, @@ -411,7 +450,9 @@ impl PyDataFrame { variable_name: variable_name.map(|s| s.into()), }; - let df = self.df.unpivot2(args).map_err(PyPolarsErr::from)?; + let df = py + .allow_threads(|| self.df.unpivot2(args)) + .map_err(PyPolarsErr::from)?; Ok(PyDataFrame::new(df)) } @@ -419,6 +460,7 @@ impl PyDataFrame { #[pyo3(signature = (on, index, values, maintain_order, sort_columns, aggregate_expr, separator))] pub fn pivot_expr( &self, + py: Python, on: Vec, index: Option>, values: Option>, @@ -429,31 +471,38 @@ impl PyDataFrame { ) -> PyResult { let fun = if maintain_order { pivot_stable } else { pivot }; let agg_expr = aggregate_expr.map(|expr| expr.inner); - let df = fun( - &self.df, - on, - index, - values, - sort_columns, - agg_expr, - separator, - ) - .map_err(PyPolarsErr::from)?; + let df = py + .allow_threads(|| { + fun( + &self.df, + on, + index, + values, + sort_columns, + agg_expr, + separator, + ) + }) + .map_err(PyPolarsErr::from)?; Ok(PyDataFrame::new(df)) } pub fn partition_by( &self, + py: Python, by: Vec, maintain_order: bool, 
include_key: bool, ) -> PyResult> { - let out = if maintain_order { - self.df.partition_by_stable(by, include_key) - } else { - self.df.partition_by(by, include_key) - } - .map_err(PyPolarsErr::from)?; + let out = py + .allow_threads(|| { + if maintain_order { + self.df.partition_by_stable(by, include_key) + } else { + self.df.partition_by(by, include_key) + } + }) + .map_err(PyPolarsErr::from)?; // SAFETY: PyDataFrame is a repr(transparent) DataFrame. Ok(unsafe { std::mem::transmute::, Vec>(out) }) @@ -463,38 +512,40 @@ impl PyDataFrame { self.df.clone().lazy().into() } - pub fn max_horizontal(&self) -> PyResult> { - let s = self.df.max_horizontal().map_err(PyPolarsErr::from)?; + pub fn max_horizontal(&self, py: Python) -> PyResult> { + let s = py + .allow_threads(|| self.df.max_horizontal()) + .map_err(PyPolarsErr::from)?; Ok(s.map(|s| s.take_materialized_series().into())) } - pub fn min_horizontal(&self) -> PyResult> { - let s = self.df.min_horizontal().map_err(PyPolarsErr::from)?; + pub fn min_horizontal(&self, py: Python) -> PyResult> { + let s = py + .allow_threads(|| self.df.min_horizontal()) + .map_err(PyPolarsErr::from)?; Ok(s.map(|s| s.take_materialized_series().into())) } - pub fn sum_horizontal(&self, ignore_nulls: bool) -> PyResult> { + pub fn sum_horizontal(&self, py: Python, ignore_nulls: bool) -> PyResult> { let null_strategy = if ignore_nulls { NullStrategy::Ignore } else { NullStrategy::Propagate }; - let s = self - .df - .sum_horizontal(null_strategy) + let s = py + .allow_threads(|| self.df.sum_horizontal(null_strategy)) .map_err(PyPolarsErr::from)?; Ok(s.map(|s| s.into())) } - pub fn mean_horizontal(&self, ignore_nulls: bool) -> PyResult> { + pub fn mean_horizontal(&self, py: Python, ignore_nulls: bool) -> PyResult> { let null_strategy = if ignore_nulls { NullStrategy::Ignore } else { NullStrategy::Propagate }; - let s = self - .df - .mean_horizontal(null_strategy) + let s = py + .allow_threads(|| self.df.mean_horizontal(null_strategy)) 
.map_err(PyPolarsErr::from)?; Ok(s.map(|s| s.into())) } @@ -502,24 +553,26 @@ impl PyDataFrame { #[pyo3(signature = (columns, separator, drop_first=false))] pub fn to_dummies( &self, + py: Python, columns: Option>, separator: Option<&str>, drop_first: bool, ) -> PyResult { - let df = match columns { - Some(cols) => self.df.columns_to_dummies( - cols.iter().map(|x| x as &str).collect(), - separator, - drop_first, - ), - None => self.df.to_dummies(separator, drop_first), - } - .map_err(PyPolarsErr::from)?; + let df = py + .allow_threads(|| match columns { + Some(cols) => self.df.columns_to_dummies( + cols.iter().map(|x| x as &str).collect(), + separator, + drop_first, + ), + None => self.df.to_dummies(separator, drop_first), + }) + .map_err(PyPolarsErr::from)?; Ok(df.into()) } - pub fn null_count(&self) -> Self { - let df = self.df.null_count(); + pub fn null_count(&self, py: Python) -> Self { + let df = py.allow_threads(|| self.df.null_count()); df.into() } @@ -555,19 +608,29 @@ impl PyDataFrame { }) } - pub fn shrink_to_fit(&mut self) { - self.df.shrink_to_fit(); + pub fn shrink_to_fit(&mut self, py: Python) { + py.allow_threads(|| self.df.shrink_to_fit()); } - pub fn hash_rows(&mut self, k0: u64, k1: u64, k2: u64, k3: u64) -> PyResult { + pub fn hash_rows( + &mut self, + py: Python, + k0: u64, + k1: u64, + k2: u64, + k3: u64, + ) -> PyResult { let hb = PlRandomState::with_seeds(k0, k1, k2, k3); - let hash = self.df.hash_rows(Some(hb)).map_err(PyPolarsErr::from)?; + let hash = py + .allow_threads(|| self.df.hash_rows(Some(hb))) + .map_err(PyPolarsErr::from)?; Ok(hash.into_series().into()) } #[pyo3(signature = (keep_names_as, column_names))] pub fn transpose( &mut self, + py: Python, keep_names_as: Option<&str>, column_names: &Bound, ) -> PyResult { @@ -578,54 +641,61 @@ impl PyDataFrame { } else { None }; - Ok(self - .df - .transpose(keep_names_as, new_col_names) + Ok(py + .allow_threads(|| self.df.transpose(keep_names_as, new_col_names)) 
.map_err(PyPolarsErr::from)? .into()) } + pub fn upsample( &self, + py: Python, by: Vec, index_column: &str, every: &str, stable: bool, ) -> PyResult { let every = Duration::try_parse(every).map_err(PyPolarsErr::from)?; - let out = if stable { - self.df.upsample_stable(by, index_column, every) - } else { - self.df.upsample(by, index_column, every) - }; + let out = py.allow_threads(|| { + if stable { + self.df.upsample_stable(by, index_column, every) + } else { + self.df.upsample(by, index_column, every) + } + }); let out = out.map_err(PyPolarsErr::from)?; Ok(out.into()) } - pub fn to_struct(&self, name: &str, invalid_indices: Vec) -> PySeries { - let ca = self.df.clone().into_struct(name.into()); - - if !invalid_indices.is_empty() { - let mut validity = MutableBitmap::with_capacity(ca.len()); - validity.extend_constant(ca.len(), true); - for i in invalid_indices { - validity.set(i, false); + pub fn to_struct(&self, py: Python, name: &str, invalid_indices: Vec) -> PySeries { + py.allow_threads(|| { + let ca = self.df.clone().into_struct(name.into()); + + if !invalid_indices.is_empty() { + let mut validity = MutableBitmap::with_capacity(ca.len()); + validity.extend_constant(ca.len(), true); + for i in invalid_indices { + validity.set(i, false); + } + let ca = ca.rechunk(); + ca.with_outer_validity(Some(validity.freeze())) + .into_series() + .into() + } else { + ca.into_series().into() } - let ca = ca.rechunk(); - ca.with_outer_validity(Some(validity.freeze())) - .into_series() - .into() - } else { - ca.into_series().into() - } + }) } - pub fn unnest(&self, columns: Vec) -> PyResult { - let df = self.df.unnest(columns).map_err(PyPolarsErr::from)?; + pub fn unnest(&self, py: Python, columns: Vec) -> PyResult { + let df = py + .allow_threads(|| self.df.unnest(columns)) + .map_err(PyPolarsErr::from)?; Ok(df.into()) } - pub fn clear(&self) -> Self { - self.df.clear().into() + pub fn clear(&self, py: Python) -> Self { + py.allow_threads(|| self.df.clear()).into() } 
#[allow(clippy::wrong_self_convention)] diff --git a/crates/polars-python/src/functions/range.rs b/crates/polars-python/src/functions/range.rs index b6eae4400dd8..11ff3864fdfa 100644 --- a/crates/polars-python/src/functions/range.rs +++ b/crates/polars-python/src/functions/range.rs @@ -17,6 +17,7 @@ pub fn int_range(start: PyExpr, end: PyExpr, step: i64, dtype: Wrap) - /// Eager version of `int_range` to avoid overhead from the expression engine. #[pyfunction] pub fn eager_int_range( + py: Python, lower: &Bound<'_, PyAny>, upper: &Bound<'_, PyAny>, step: &Bound<'_, PyAny>, @@ -34,7 +35,7 @@ pub fn eager_int_range( let start_v: <$T as PolarsNumericType>::Native = lower.extract()?; let end_v: <$T as PolarsNumericType>::Native = upper.extract()?; let step: i64 = step.extract()?; - new_int_range::<$T>(start_v, end_v, step, PlSmallStr::from_static("literal")) + py.allow_threads(|| new_int_range::<$T>(start_v, end_v, step, PlSmallStr::from_static("literal"))) }); let s = ret.map_err(PyPolarsErr::from)?; diff --git a/crates/polars-python/src/interop/arrow/to_rust.rs b/crates/polars-python/src/interop/arrow/to_rust.rs index 1add88c96fd8..ee741c4279cc 100644 --- a/crates/polars-python/src/interop/arrow/to_rust.rs +++ b/crates/polars-python/src/interop/arrow/to_rust.rs @@ -46,7 +46,7 @@ pub fn array_to_rust(obj: &Bound) -> PyResult { } } -pub fn to_rust_df(rb: &[Bound]) -> PyResult { +pub fn to_rust_df(py: Python, rb: &[Bound]) -> PyResult { let schema = rb .first() .ok_or_else(|| PyPolarsErr::Other("empty table".into()))? @@ -79,17 +79,19 @@ pub fn to_rust_df(rb: &[Bound]) -> PyResult { // for instance string -> large-utf8 // dict encoded to categorical let columns = if run_parallel { - POOL.install(|| { - columns - .into_par_iter() - .enumerate() - .map(|(i, arr)| { - let s = Series::try_from((names[i].clone(), arr)) - .map_err(PyPolarsErr::from)? 
- .into_column(); - Ok(s) - }) - .collect::>>() + py.allow_threads(|| { + POOL.install(|| { + columns + .into_par_iter() + .enumerate() + .map(|(i, arr)| { + let s = Series::try_from((names[i].clone(), arr)) + .map_err(PyPolarsErr::from)? + .into_column(); + Ok(s) + }) + .collect::>>() + }) }) } else { columns diff --git a/crates/polars-python/src/interop/numpy/to_numpy_df.rs b/crates/polars-python/src/interop/numpy/to_numpy_df.rs index c14753bdc7a3..887d218f5fe0 100644 --- a/crates/polars-python/src/interop/numpy/to_numpy_df.rs +++ b/crates/polars-python/src/interop/numpy/to_numpy_df.rs @@ -251,6 +251,7 @@ fn try_df_to_numpy_numeric_supertype( }; Some(np_array) } + fn df_columns_to_numpy( py: Python, df: &DataFrame, diff --git a/crates/polars-python/src/interop/numpy/to_numpy_series.rs b/crates/polars-python/src/interop/numpy/to_numpy_series.rs index 12f71e2a551d..e2a6c439caad 100644 --- a/crates/polars-python/src/interop/numpy/to_numpy_series.rs +++ b/crates/polars-python/src/interop/numpy/to_numpy_series.rs @@ -85,20 +85,21 @@ fn try_series_to_numpy_view( if !allow_nulls && series_contains_null(s) { return None; } - let (s_owned, writable_flag) = handle_chunks(s, allow_rechunk)?; + let (s_owned, writable_flag) = handle_chunks(py, s, allow_rechunk)?; let array = series_to_numpy_view_recursive(py, s_owned, writable_flag); Some((array, writable_flag)) } + /// Rechunk the Series if required. /// /// NumPy arrays are always contiguous, so we may have to rechunk before creating a view. /// If we do so, we can flag the resulting array as writable. 
-fn handle_chunks(s: &Series, allow_rechunk: bool) -> Option<(Series, bool)> { +fn handle_chunks(py: Python, s: &Series, allow_rechunk: bool) -> Option<(Series, bool)> { let is_chunked = s.n_chunks() > 1; match (is_chunked, allow_rechunk) { (true, false) => None, - (true, true) => Some((s.rechunk(), true)), + (true, true) => Some((py.allow_threads(|| s.rechunk()), true)), (false, _) => Some((s.clone(), false)), } } diff --git a/crates/polars-python/src/map/mod.rs b/crates/polars-python/src/map/mod.rs index 3bf96f91e631..9ffc74961302 100644 --- a/crates/polars-python/src/map/mod.rs +++ b/crates/polars-python/src/map/mod.rs @@ -32,6 +32,7 @@ impl PyArrowPrimitiveType for Float32Type {} impl PyArrowPrimitiveType for Float64Type {} fn iterator_to_struct<'a>( + py: Python, it: impl Iterator>>, init_null_count: usize, first_value: AnyValue<'a>, @@ -115,11 +116,13 @@ fn iterator_to_struct<'a>( } } - let fields = POOL.install(|| { - field_names_ordered - .par_iter() - .map(|name| Series::new(name.clone(), struct_fields.get(name).unwrap())) - .collect::>() + let fields = py.allow_threads(|| { + POOL.install(|| { + field_names_ordered + .par_iter() + .map(|name| Series::new(name.clone(), struct_fields.get(name).unwrap())) + .collect::>() + }) }); Ok( diff --git a/crates/polars-python/src/map/series.rs b/crates/polars-python/src/map/series.rs index cb731e7c03f8..16d6212b8d1e 100644 --- a/crates/polars-python/src/map/series.rs +++ b/crates/polars-python/src/map/series.rs @@ -271,6 +271,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { .skip(init_null_count + skip) .map(|val| call_lambda(py, lambda, val).ok()); iterator_to_struct( + py, it, init_null_count, first_value, @@ -283,6 +284,7 @@ impl<'a> ApplyLambda<'a> for BooleanChunked { .skip(init_null_count + skip) .map(|opt_val| opt_val.and_then(|val| call_lambda(py, lambda, val).ok())); iterator_to_struct( + py, it, init_null_count, first_value, @@ -576,6 +578,7 @@ where .skip(init_null_count + skip) .map(|val| 
call_lambda(py, lambda, val).ok()); iterator_to_struct( + py, it, init_null_count, first_value, @@ -588,6 +591,7 @@ where .skip(init_null_count + skip) .map(|opt_val| opt_val.and_then(|val| call_lambda(py, lambda, val).ok())); iterator_to_struct( + py, it, init_null_count, first_value, @@ -874,6 +878,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { .skip(init_null_count + skip) .map(|val| call_lambda(py, lambda, val).ok()); iterator_to_struct( + py, it, init_null_count, first_value, @@ -886,6 +891,7 @@ impl<'a> ApplyLambda<'a> for StringChunked { .skip(init_null_count + skip) .map(|opt_val| opt_val.and_then(|val| call_lambda(py, lambda, val).ok())); iterator_to_struct( + py, it, init_null_count, first_value, @@ -1221,6 +1227,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { call_lambda(py, lambda, python_series_wrapper).ok() }); iterator_to_struct( + py, it, init_null_count, first_value, @@ -1245,6 +1252,7 @@ impl<'a> ApplyLambda<'a> for ListChunked { }) }); iterator_to_struct( + py, it, init_null_count, first_value, @@ -1648,6 +1656,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { call_lambda(py, lambda, python_series_wrapper).ok() }); iterator_to_struct( + py, it, init_null_count, first_value, @@ -1672,6 +1681,7 @@ impl<'a> ApplyLambda<'a> for ArrayChunked { }) }); iterator_to_struct( + py, it, init_null_count, first_value, @@ -2042,7 +2052,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { fn apply_into_struct( &'a self, - _py: Python, + py: Python, lambda: &Bound<'a, PyAny>, init_null_count: usize, first_value: AnyValue<'a>, @@ -2056,6 +2066,7 @@ impl<'a> ApplyLambda<'a> for ObjectChunked { Some(out) }); iterator_to_struct( + py, it, init_null_count, first_value, @@ -2329,7 +2340,7 @@ impl<'a> ApplyLambda<'a> for StructChunked { fn apply_into_struct( &'a self, - _py: Python, + py: Python, lambda: &Bound<'a, PyAny>, init_null_count: usize, first_value: AnyValue<'a>, @@ -2340,6 +2351,7 @@ impl<'a> ApplyLambda<'a> for StructChunked { Some(out) }); 
iterator_to_struct( + py, it, init_null_count, first_value, From 058491f60bb7cc4b51dc0429abc724512ff78878 Mon Sep 17 00:00:00 2001 From: Gijs Burghoorn Date: Thu, 14 Nov 2024 13:45:09 +0100 Subject: [PATCH 15/18] refactor: Migrate polars-expr AggregationContext to use `Column` (#19736) --- crates/polars-core/src/frame/column/mod.rs | 154 +++++++++++++++--- .../src/frame/column/partitioned.rs | 2 +- crates/polars-core/src/frame/column/scalar.rs | 5 + crates/polars-core/src/frame/column/series.rs | 71 ++++++++ crates/polars-core/src/frame/mod.rs | 14 +- crates/polars-core/src/scalar/mod.rs | 9 + .../src/expressions/aggregation.rs | 116 +++++++------ crates/polars-expr/src/expressions/alias.rs | 12 +- crates/polars-expr/src/expressions/apply.rs | 76 ++++----- crates/polars-expr/src/expressions/binary.rs | 30 ++-- crates/polars-expr/src/expressions/cast.rs | 10 +- crates/polars-expr/src/expressions/column.rs | 45 ++--- crates/polars-expr/src/expressions/count.rs | 4 +- crates/polars-expr/src/expressions/filter.rs | 2 +- crates/polars-expr/src/expressions/gather.rs | 28 ++-- .../polars-expr/src/expressions/group_iter.rs | 28 ++-- crates/polars-expr/src/expressions/literal.rs | 5 +- crates/polars-expr/src/expressions/mod.rs | 130 +++++++-------- crates/polars-expr/src/expressions/slice.rs | 16 +- crates/polars-expr/src/expressions/sort.rs | 2 +- crates/polars-expr/src/expressions/sortby.rs | 20 ++- crates/polars-expr/src/expressions/ternary.rs | 36 ++-- crates/polars-expr/src/expressions/window.rs | 22 +-- crates/polars-lazy/src/dsl/list.rs | 2 +- crates/polars-lazy/src/frame/pivot.rs | 2 +- .../polars-mem-engine/src/executors/filter.rs | 20 ++- .../src/executors/group_by.rs | 2 +- .../polars-mem-engine/src/executors/stack.rs | 14 +- crates/polars-ops/src/series/ops/index.rs | 11 +- 29 files changed, 531 insertions(+), 357 deletions(-) create mode 100644 crates/polars-core/src/frame/column/series.rs diff --git a/crates/polars-core/src/frame/column/mod.rs 
b/crates/polars-core/src/frame/column/mod.rs index d2eec86c1b15..cea56a2e87b7 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -1,5 +1,7 @@ use std::borrow::Cow; +use arrow::bitmap::MutableBitmap; +use arrow::trusted_len::TrustMyLength; use num_traits::{Num, NumCast}; use polars_error::PolarsResult; use polars_utils::index::check_bounds; @@ -8,6 +10,7 @@ pub use scalar::ScalarColumn; use self::gather::check_bounds_ca; use self::partitioned::PartitionedColumn; +use self::series::SeriesColumn; use crate::chunked_array::cast::CastOptions; use crate::chunked_array::metadata::{MetadataFlags, MetadataTrait}; use crate::datatypes::ReshapeDimension; @@ -20,6 +23,7 @@ mod arithmetic; mod compare; mod partitioned; mod scalar; +mod series; /// A column within a [`DataFrame`]. /// @@ -35,7 +39,7 @@ mod scalar; #[cfg_attr(feature = "serde", serde(from = "Series"))] #[cfg_attr(feature = "serde", serde(into = "_SerdeSeries"))] pub enum Column { - Series(Series), + Series(SeriesColumn), Partitioned(PartitionedColumn), Scalar(ScalarColumn), } @@ -47,12 +51,13 @@ pub trait IntoColumn: Sized { impl Column { #[inline] + #[track_caller] pub fn new(name: PlSmallStr, values: T) -> Self where Phantom: ?Sized, Series: NamedFrom, { - Self::Series(NamedFrom::new(name, values)) + Self::Series(SeriesColumn::new(NamedFrom::new(name, values))) } #[inline] @@ -95,7 +100,7 @@ impl Column { PartitionedColumn::new_empty(PlSmallStr::EMPTY, DataType::Null), ) .take_materialized_series(); - *self = Column::Series(series); + *self = Column::Series(series.into()); let Column::Series(s) = self else { unreachable!(); }; @@ -107,7 +112,7 @@ impl Column { ScalarColumn::new_empty(PlSmallStr::EMPTY, DataType::Null), ) .take_materialized_series(); - *self = Column::Series(series); + *self = Column::Series(series.into()); let Column::Series(s) = self else { unreachable!(); }; @@ -121,7 +126,7 @@ impl Column { #[inline] pub fn 
take_materialized_series(self) -> Series { match self { - Column::Series(s) => s, + Column::Series(s) => s.take(), Column::Partitioned(s) => s.take_materialized_series(), Column::Scalar(s) => s.take_materialized_series(), } @@ -586,13 +591,86 @@ impl Column { } } + /// General implementation for aggregation where a non-missing scalar would map to itself. + #[inline(always)] + #[cfg(any(feature = "algorithm_group_by", feature = "bitwise"))] + fn agg_with_unit_scalar( + &self, + groups: &GroupsProxy, + series_agg: impl Fn(&Series, &GroupsProxy) -> Series, + ) -> Column { + match self { + Column::Series(s) => series_agg(s, groups).into_column(), + // @partition-opt + Column::Partitioned(s) => series_agg(s.as_materialized_series(), groups).into_column(), + Column::Scalar(s) => { + if s.is_empty() { + return self.clone(); + } + + // We utilize the aggregation on Series to see: + // 1. the output datatype of the aggregation + // 2. whether this aggregation is even defined + let series_aggregation = series_agg( + &s.as_single_value_series(), + &GroupsProxy::Slice { + // @NOTE: this group is always valid since s is non-empty. + groups: vec![[0, 1]], + rolling: false, + }, + ); + + // If the aggregation is not defined, just return all nulls. + if series_aggregation.has_nulls() { + return Self::new_scalar( + series_aggregation.name().clone(), + Scalar::new(series_aggregation.dtype().clone(), AnyValue::Null), + groups.len(), + ); + } + + let mut scalar_col = s.resize(groups.len()); + // The aggregation might change the type (e.g. mean changes int -> float), so we do + // a cast here to the output type. + if series_aggregation.dtype() != s.dtype() { + scalar_col = scalar_col.cast(series_aggregation.dtype()).unwrap(); + } + + let Some(first_empty_idx) = groups.iter().position(|g| g.is_empty()) else { + // Fast path: no empty groups. keep the scalar intact. + return scalar_col.into_column(); + }; + + // All empty groups produce a *missing* or `null` value. 
+ let mut validity = MutableBitmap::with_capacity(groups.len()); + validity.extend_constant(first_empty_idx, true); + // SAFETY: We trust the length of this iterator. + let iter = unsafe { + TrustMyLength::new( + groups.iter().skip(first_empty_idx).map(|g| !g.is_empty()), + groups.len() - first_empty_idx, + ) + }; + validity.extend_from_trusted_len_iter(iter); + let validity = validity.freeze(); + + let mut s = scalar_col.take_materialized_series().rechunk(); + // SAFETY: We perform a compute_len afterwards. + let chunks = unsafe { s.chunks_mut() }; + chunks[0].with_validity(Some(validity)); + s.compute_len(); + + s.into_column() + }, + } + } + /// # Safety /// /// Does no bounds checks, groups must be correct. #[cfg(feature = "algorithm_group_by")] pub unsafe fn agg_min(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_min(groups) }.into() + self.agg_with_unit_scalar(groups, |s, g| unsafe { s.agg_min(g) }) } /// # Safety @@ -600,8 +678,7 @@ impl Column { /// Does no bounds checks, groups must be correct. #[cfg(feature = "algorithm_group_by")] pub unsafe fn agg_max(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_max(groups) }.into() + self.agg_with_unit_scalar(groups, |s, g| unsafe { s.agg_max(g) }) } /// # Safety @@ -609,8 +686,7 @@ impl Column { /// Does no bounds checks, groups must be correct. #[cfg(feature = "algorithm_group_by")] pub unsafe fn agg_mean(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_mean(groups) }.into() + self.agg_with_unit_scalar(groups, |s, g| unsafe { s.agg_mean(g) }) } /// # Safety @@ -627,8 +703,7 @@ impl Column { /// Does no bounds checks, groups must be correct. 
#[cfg(feature = "algorithm_group_by")] pub unsafe fn agg_first(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_first(groups) }.into() + self.agg_with_unit_scalar(groups, |s, g| unsafe { s.agg_first(g) }) } /// # Safety @@ -636,8 +711,7 @@ impl Column { /// Does no bounds checks, groups must be correct. #[cfg(feature = "algorithm_group_by")] pub unsafe fn agg_last(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_last(groups) }.into() + self.agg_with_unit_scalar(groups, |s, g| unsafe { s.agg_last(g) }) } /// # Safety @@ -672,8 +746,7 @@ impl Column { /// Does no bounds checks, groups must be correct. #[cfg(feature = "algorithm_group_by")] pub unsafe fn agg_median(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_median(groups) }.into() + self.agg_with_unit_scalar(groups, |s, g| unsafe { s.agg_median(g) }) } /// # Safety @@ -689,7 +762,7 @@ impl Column { /// /// Does no bounds checks, groups must be correct. #[cfg(feature = "algorithm_group_by")] - pub(crate) unsafe fn agg_std(&self, groups: &GroupsProxy, ddof: u8) -> Self { + pub unsafe fn agg_std(&self, groups: &GroupsProxy, ddof: u8) -> Self { // @scalar-opt unsafe { self.as_materialized_series().agg_std(groups, ddof) }.into() } @@ -713,6 +786,30 @@ impl Column { unsafe { self.as_materialized_series().agg_valid_count(groups) }.into() } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "bitwise")] + pub fn agg_and(&self, groups: &GroupsProxy) -> Self { + self.agg_with_unit_scalar(groups, |s, g| unsafe { s.agg_and(g) }) + } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "bitwise")] + pub fn agg_or(&self, groups: &GroupsProxy) -> Self { + self.agg_with_unit_scalar(groups, |s, g| unsafe { s.agg_or(g) }) + } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. 
+ #[cfg(feature = "bitwise")] + pub fn agg_xor(&self, groups: &GroupsProxy) -> Self { + // @partition-opt + // @scalar-opt + unsafe { self.as_materialized_series().agg_xor(groups) }.into() + } + pub fn full_null(name: PlSmallStr, size: usize, dtype: &DataType) -> Self { Self::new_scalar(name, Scalar::new(dtype.clone(), AnyValue::Null), size) } @@ -877,6 +974,13 @@ impl Column { } } + /// Packs every element into a list. + pub fn as_list(&self) -> ListChunked { + // @scalar-opt + // @partition-opt + self.as_materialized_series().as_list() + } + pub fn is_sorted_flag(&self) -> IsSorted { // @scalar-opt self.as_materialized_series().is_sorted_flag() @@ -1105,19 +1209,25 @@ impl Column { pub fn try_add_owned(self, other: Self) -> PolarsResult { match (self, other) { - (Column::Series(lhs), Column::Series(rhs)) => lhs.try_add_owned(rhs).map(Column::from), + (Column::Series(lhs), Column::Series(rhs)) => { + lhs.take().try_add_owned(rhs.take()).map(Column::from) + }, (lhs, rhs) => lhs + rhs, } } pub fn try_sub_owned(self, other: Self) -> PolarsResult { match (self, other) { - (Column::Series(lhs), Column::Series(rhs)) => lhs.try_sub_owned(rhs).map(Column::from), + (Column::Series(lhs), Column::Series(rhs)) => { + lhs.take().try_sub_owned(rhs.take()).map(Column::from) + }, (lhs, rhs) => lhs - rhs, } } pub fn try_mul_owned(self, other: Self) -> PolarsResult { match (self, other) { - (Column::Series(lhs), Column::Series(rhs)) => lhs.try_mul_owned(rhs).map(Column::from), + (Column::Series(lhs), Column::Series(rhs)) => { + lhs.take().try_mul_owned(rhs.take()).map(Column::from) + }, (lhs, rhs) => lhs * rhs, } } @@ -1443,7 +1553,7 @@ impl From for Column { return Self::Scalar(ScalarColumn::unit_scalar_from_series(series)); } - Self::Series(series) + Self::Series(SeriesColumn::new(series)) } } diff --git a/crates/polars-core/src/frame/column/partitioned.rs b/crates/polars-core/src/frame/column/partitioned.rs index 16d4e9538634..93471c662d72 100644 --- 
a/crates/polars-core/src/frame/column/partitioned.rs +++ b/crates/polars-core/src/frame/column/partitioned.rs @@ -124,7 +124,7 @@ impl PartitionedColumn { fn _to_series(name: PlSmallStr, values: &Series, ends: &[IdxSize]) -> Series { let dtype = values.dtype(); - let mut column = Column::Series(Series::new_empty(name, dtype)); + let mut column = Column::Series(Series::new_empty(name, dtype).into()); let mut prev_offset = 0; for (i, &offset) in ends.iter().enumerate() { diff --git a/crates/polars-core/src/frame/column/scalar.rs b/crates/polars-core/src/frame/column/scalar.rs index e3d8105362c4..c08a9e3cfee0 100644 --- a/crates/polars-core/src/frame/column/scalar.rs +++ b/crates/polars-core/src/frame/column/scalar.rs @@ -284,6 +284,11 @@ impl ScalarColumn { self.scalar.update(AnyValue::Null); self } + + pub fn map_scalar(&mut self, map_scalar: impl Fn(Scalar) -> Scalar) { + self.scalar = map_scalar(std::mem::take(&mut self.scalar)); + self.materialized.take(); + } } impl IntoColumn for ScalarColumn { diff --git a/crates/polars-core/src/frame/column/series.rs b/crates/polars-core/src/frame/column/series.rs new file mode 100644 index 000000000000..c7f79906ea0d --- /dev/null +++ b/crates/polars-core/src/frame/column/series.rs @@ -0,0 +1,71 @@ +use std::ops::{Deref, DerefMut}; + +use super::Series; + +/// A very thin wrapper around [`Series`] that represents a [`Column`]ized version of [`Series`]. +/// +/// At the moment this just conditionally tracks where it was created so that materialization +/// problems can be tracked down. 
+#[derive(Debug, Clone)] +pub struct SeriesColumn { + inner: Series, + + #[cfg(debug_assertions)] + materialized_at: Option>, +} + +impl SeriesColumn { + #[track_caller] + pub fn new(series: Series) -> Self { + Self { + inner: series, + + #[cfg(debug_assertions)] + materialized_at: if std::env::var("POLARS_TRACK_SERIES_MATERIALIZATION").as_deref() + == Ok("1") + { + Some(std::sync::Arc::new( + std::backtrace::Backtrace::force_capture(), + )) + } else { + None + }, + } + } + + pub fn materialized_at(&self) -> Option<&std::backtrace::Backtrace> { + #[cfg(debug_assertions)] + { + self.materialized_at.as_ref().map(|v| v.as_ref()) + } + + #[cfg(not(debug_assertions))] + None + } + + pub fn take(self) -> Series { + self.inner + } +} + +impl From for SeriesColumn { + #[track_caller] + #[inline(always)] + fn from(value: Series) -> Self { + Self::new(value) + } +} + +impl Deref for SeriesColumn { + type Target = Series; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl DerefMut for SeriesColumn { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 0d8fef7f4c4a..6fed5c25071c 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -538,7 +538,7 @@ impl DataFrame { // Don't parallelize this. 
Memory overhead for s in &mut self.columns { if let Column::Series(s) = s { - *s = s.rechunk(); + *s = s.rechunk().into(); } } self @@ -2085,6 +2085,8 @@ impl DataFrame { let mut max_value_ca = StringChunkedBuilder::new(PlSmallStr::from_static("max_value"), num_columns); let mut distinct_count_ca: Vec> = Vec::with_capacity(num_columns); + let mut materialized_at_ca = + StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns); for col in &self.columns { let metadata = col.get_metadata(); @@ -2099,10 +2101,10 @@ impl DataFrame { ) }); - let repr = match col { - Column::Series(_) => "series", - Column::Partitioned(_) => "partitioned", - Column::Scalar(_) => "scalar", + let (repr, materialized_at) = match col { + Column::Series(s) => ("series", s.materialized_at()), + Column::Partitioned(_) => ("partitioned", None), + Column::Scalar(_) => ("scalar", None), }; let sorted_asc = flags.contains(MetadataFlags::SORTED_ASC); let sorted_dsc = flags.contains(MetadataFlags::SORTED_DSC); @@ -2116,6 +2118,7 @@ impl DataFrame { min_value_ca.append_option(min_value.map(|v| v.as_any_value().to_string())); max_value_ca.append_option(max_value.map(|v| v.as_any_value().to_string())); distinct_count_ca.push(distinct_count); + materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}"))); } unsafe { @@ -2134,6 +2137,7 @@ impl DataFrame { &distinct_count_ca[..], ) .into_column(), + materialized_at_ca.finish().into_column(), ], ) } diff --git a/crates/polars-core/src/scalar/mod.rs b/crates/polars-core/src/scalar/mod.rs index 3e456837e534..7487603ff998 100644 --- a/crates/polars-core/src/scalar/mod.rs +++ b/crates/polars-core/src/scalar/mod.rs @@ -15,6 +15,15 @@ pub struct Scalar { value: AnyValue<'static>, } +impl Default for Scalar { + fn default() -> Self { + Self { + dtype: DataType::Null, + value: AnyValue::Null, + } + } +} + impl Scalar { #[inline(always)] pub fn new(dtype: DataType, value: AnyValue<'static>) -> Self { diff --git 
a/crates/polars-expr/src/expressions/aggregation.rs b/crates/polars-expr/src/expressions/aggregation.rs index fb691d746715..883598789622 100644 --- a/crates/polars-expr/src/expressions/aggregation.rs +++ b/crates/polars-expr/src/expressions/aggregation.rs @@ -206,7 +206,7 @@ impl PhysicalExpr for AggregationExpr { ) -> PolarsResult> { let mut ac = self.input.evaluate_on_groups(df, groups, state)?; // don't change names by aggregations as is done in polars-core - let keep_name = ac.series().name().clone(); + let keep_name = ac.get_values().name().clone(); polars_ensure!(!matches!(ac.agg_state(), AggState::Literal(_)), ComputeError: "cannot aggregate a literal"); if let AggregatedScalar(_) = ac.agg_state() { @@ -223,37 +223,37 @@ impl PhysicalExpr for AggregationExpr { let out = unsafe { match self.agg_type.groupby { GroupByMethod::Min => { - let (s, groups) = ac.get_final_aggregation(); - let agg_s = s.agg_min(&groups); - AggregatedScalar(agg_s.with_name(keep_name)) + let (c, groups) = ac.get_final_aggregation(); + let agg_c = c.agg_min(&groups); + AggregatedScalar(agg_c.with_name(keep_name)) }, GroupByMethod::Max => { - let (s, groups) = ac.get_final_aggregation(); - let agg_s = s.agg_max(&groups); - AggregatedScalar(agg_s.with_name(keep_name)) + let (c, groups) = ac.get_final_aggregation(); + let agg_c = c.agg_max(&groups); + AggregatedScalar(agg_c.with_name(keep_name)) }, GroupByMethod::Median => { - let (s, groups) = ac.get_final_aggregation(); - let agg_s = s.agg_median(&groups); - AggregatedScalar(agg_s.with_name(keep_name)) + let (c, groups) = ac.get_final_aggregation(); + let agg_c = c.agg_median(&groups); + AggregatedScalar(agg_c.with_name(keep_name)) }, GroupByMethod::Mean => { - let (s, groups) = ac.get_final_aggregation(); - let agg_s = s.agg_mean(&groups); - AggregatedScalar(agg_s.with_name(keep_name)) + let (c, groups) = ac.get_final_aggregation(); + let agg_c = c.agg_mean(&groups); + AggregatedScalar(agg_c.with_name(keep_name)) }, GroupByMethod::Sum 
=> { - let (s, groups) = ac.get_final_aggregation(); - let agg_s = s.agg_sum(&groups); - AggregatedScalar(agg_s.with_name(keep_name)) + let (c, groups) = ac.get_final_aggregation(); + let agg_c = c.agg_sum(&groups); + AggregatedScalar(agg_c.with_name(keep_name)) }, GroupByMethod::Count { include_nulls } => { - if include_nulls || ac.series().null_count() == 0 { + if include_nulls || ac.get_values().null_count() == 0 { // a few fast paths that prevent materializing new groups match ac.update_groups { UpdateGroups::WithSeriesLen => { let list = ac - .series() + .get_values() .list() .expect("impl error, should be a list at this point"); @@ -288,7 +288,7 @@ impl PhysicalExpr for AggregationExpr { }, }; s.rename(keep_name); - AggregatedScalar(s.into_series()) + AggregatedScalar(s.into_column()) }, UpdateGroups::WithGroupsLen => { // no need to update the groups @@ -296,20 +296,20 @@ impl PhysicalExpr for AggregationExpr { // not the correct order let mut ca = ac.groups.group_count(); ca.rename(keep_name); - AggregatedScalar(ca.into_series()) + AggregatedScalar(ca.into_column()) }, // materialize groups _ => { let mut ca = ac.groups().group_count(); ca.rename(keep_name); - AggregatedScalar(ca.into_series()) + AggregatedScalar(ca.into_column()) }, } } else { // TODO: optimize this/and write somewhere else. 
match ac.agg_state() { AggState::Literal(s) | AggState::AggregatedScalar(s) => { - AggregatedScalar(Series::new( + AggregatedScalar(Column::new( keep_name, [(s.len() as IdxSize - s.null_count() as IdxSize)], )) @@ -323,7 +323,7 @@ impl PhysicalExpr for AggregationExpr { .map(|s| s.len() as IdxSize - s.null_count() as IdxSize) }) .collect(); - AggregatedScalar(out.into_series().with_name(keep_name)) + AggregatedScalar(out.into_column().with_name(keep_name)) }, AggState::NotAggregated(s) => { let s = s.clone(); @@ -334,7 +334,9 @@ impl PhysicalExpr for AggregationExpr { match groups.as_ref() { GroupsProxy::Idx(idx) => { let s = s.rechunk(); - let array = &s.chunks()[0]; + // @scalar-opt + // @partition-opt + let array = &s.as_materialized_series().chunks()[0]; let validity = array.validity().unwrap(); idx.iter() .map(|(_, g)| { @@ -365,7 +367,7 @@ impl PhysicalExpr for AggregationExpr { }, } }; - AggregatedScalar(out.into_series()) + AggregatedScalar(out.into_column()) }, } } @@ -392,10 +394,10 @@ impl PhysicalExpr for AggregationExpr { // // if it is not, we traverse the groups and create // a list per group. 
- let s = match ac.agg_state() { + let c = match ac.agg_state() { // mean agg: // -> f64 -> list - AggState::AggregatedScalar(s) => s + AggState::AggregatedScalar(c) => c .reshape_list(&[ ReshapeDimension::Infer, ReshapeDimension::new_dimension(1), @@ -403,25 +405,25 @@ impl PhysicalExpr for AggregationExpr { .unwrap(), _ => { let agg = ac.aggregated(); - agg.as_list().into_series() + agg.as_list().into_column() }, }; - AggregatedList(s.with_name(keep_name)) + AggregatedList(c.with_name(keep_name)) }, GroupByMethod::Groups => { let mut column: ListChunked = ac.groups().as_list_chunked(); column.rename(keep_name); - AggregatedScalar(column.into_series()) + AggregatedScalar(column.into_column()) }, GroupByMethod::Std(ddof) => { - let (s, groups) = ac.get_final_aggregation(); - let agg_s = s.agg_std(&groups, ddof); - AggregatedScalar(agg_s.with_name(keep_name)) + let (c, groups) = ac.get_final_aggregation(); + let agg_c = c.agg_std(&groups, ddof); + AggregatedScalar(agg_c.with_name(keep_name)) }, GroupByMethod::Var(ddof) => { - let (s, groups) = ac.get_final_aggregation(); - let agg_s = s.agg_var(&groups, ddof); - AggregatedScalar(agg_s.with_name(keep_name)) + let (c, groups) = ac.get_final_aggregation(); + let agg_c = c.agg_var(&groups, ddof); + AggregatedScalar(agg_c.with_name(keep_name)) }, GroupByMethod::Quantile(_, _) => { // implemented explicitly in AggQuantile struct @@ -429,24 +431,28 @@ impl PhysicalExpr for AggregationExpr { }, #[cfg(feature = "bitwise")] GroupByMethod::Bitwise(f) => { - let (s, groups) = ac.get_final_aggregation(); - let agg_s = match f { - GroupByBitwiseMethod::And => s.agg_and(&groups), - GroupByBitwiseMethod::Or => s.agg_or(&groups), - GroupByBitwiseMethod::Xor => s.agg_xor(&groups), + let (c, groups) = ac.get_final_aggregation(); + let agg_c = match f { + GroupByBitwiseMethod::And => c.agg_and(&groups), + GroupByBitwiseMethod::Or => c.agg_or(&groups), + GroupByBitwiseMethod::Xor => c.agg_xor(&groups), }; - 
AggregatedScalar(agg_s.with_name(keep_name)) + AggregatedScalar(agg_c.with_name(keep_name)) }, GroupByMethod::NanMin => { #[cfg(feature = "propagate_nans")] { - let (s, groups) = ac.get_final_aggregation(); - let agg_s = if s.dtype().is_float() { - nan_propagating_aggregate::group_agg_nan_min_s(&s, &groups) + let (c, groups) = ac.get_final_aggregation(); + let agg_c = if c.dtype().is_float() { + nan_propagating_aggregate::group_agg_nan_min_s( + c.as_materialized_series(), + &groups, + ) + .into_column() } else { - s.agg_min(&groups) + c.agg_min(&groups) }; - AggregatedScalar(agg_s.with_name(keep_name)) + AggregatedScalar(agg_c.with_name(keep_name)) } #[cfg(not(feature = "propagate_nans"))] { @@ -456,13 +462,17 @@ impl PhysicalExpr for AggregationExpr { GroupByMethod::NanMax => { #[cfg(feature = "propagate_nans")] { - let (s, groups) = ac.get_final_aggregation(); - let agg_s = if s.dtype().is_float() { - nan_propagating_aggregate::group_agg_nan_max_s(&s, &groups) + let (c, groups) = ac.get_final_aggregation(); + let agg_c = if c.dtype().is_float() { + nan_propagating_aggregate::group_agg_nan_max_s( + c.as_materialized_series(), + &groups, + ) + .into_column() } else { - s.agg_max(&groups) + c.agg_max(&groups) }; - AggregatedScalar(agg_s.with_name(keep_name)) + AggregatedScalar(agg_c.with_name(keep_name)) } #[cfg(not(feature = "propagate_nans"))] { @@ -757,7 +767,7 @@ impl PhysicalExpr for AggQuantileExpr { ) -> PolarsResult> { let mut ac = self.input.evaluate_on_groups(df, groups, state)?; // don't change names by aggregations as is done in polars-core - let keep_name = ac.series().name().clone(); + let keep_name = ac.get_values().name().clone(); let quantile = self.get_quantile(df, state)?; diff --git a/crates/polars-expr/src/expressions/alias.rs b/crates/polars-expr/src/expressions/alias.rs index f2065289e1ae..131d2ca2f16c 100644 --- a/crates/polars-expr/src/expressions/alias.rs +++ b/crates/polars-expr/src/expressions/alias.rs @@ -48,17 +48,13 @@ impl 
PhysicalExpr for AliasExpr { state: &ExecutionState, ) -> PolarsResult> { let mut ac = self.physical_expr.evaluate_on_groups(df, groups, state)?; - let s = ac.take(); - let s = self.finish(s.into()); + let c = ac.take(); + let c = self.finish(c); if ac.is_literal() { - ac.with_literal(s.take_materialized_series()); + ac.with_literal(c); } else { - ac.with_series( - s.take_materialized_series(), - ac.is_aggregated(), - Some(&self.expr), - )?; + ac.with_values(c, ac.is_aggregated(), Some(&self.expr))?; } Ok(ac) } diff --git a/crates/polars-expr/src/expressions/apply.rs b/crates/polars-expr/src/expressions/apply.rs index f8e2619c4153..c03511a64734 100644 --- a/crates/polars-expr/src/expressions/apply.rs +++ b/crates/polars-expr/src/expressions/apply.rs @@ -92,11 +92,11 @@ impl ApplyExpr { let all_unit_len = all_unit_length(&ca); if all_unit_len && self.function_returns_scalar { ac.with_agg_state(AggState::AggregatedScalar( - ca.explode().unwrap().into_series(), + ca.explode().unwrap().into_column(), )); ac.with_update_groups(UpdateGroups::No); } else { - ac.with_series(ca.into_series(), true, Some(&self.expr))?; + ac.with_values(ca.into_column(), true, Some(&self.expr))?; ac.with_update_groups(UpdateGroups::WithSeriesLen); } @@ -120,7 +120,7 @@ impl ApplyExpr { &self, mut ac: AggregationContext<'a>, ) -> PolarsResult> { - let s = ac.series(); + let s = ac.get_values(); polars_ensure!( !matches!(ac.agg_state(), AggState::AggregatedScalar(_)), @@ -131,7 +131,7 @@ impl ApplyExpr { let name = s.name().clone(); let agg = ac.aggregated(); // Collection of empty list leads to a null dtype. See: #3687. - if agg.len() == 0 { + if agg.is_empty() { // Create input for the function to determine the output dtype, see #3946. 
let agg = agg.list().unwrap(); let input_dtype = agg.inner_dtype(); @@ -199,35 +199,28 @@ impl ApplyExpr { &self, mut ac: AggregationContext<'a>, ) -> PolarsResult> { - let (s, aggregated) = match ac.agg_state() { - AggState::AggregatedList(s) => { - let ca = s.list().unwrap(); + let (c, aggregated) = match ac.agg_state() { + AggState::AggregatedList(c) => { + let ca = c.list().unwrap(); let out = ca.apply_to_inner(&|s| { - self.eval_and_flatten(&mut [s.into()]) - .map(|c| c.as_materialized_series().clone()) + Ok(self + .eval_and_flatten(&mut [s.into_column()])? + .take_materialized_series()) })?; - (out.into_series(), true) + (out.into_column(), true) }, - AggState::NotAggregated(s) => { - let (out, aggregated) = ( - self.eval_and_flatten(&mut [s.clone().into()])? - .as_materialized_series() - .clone(), - false, - ); - check_map_output_len(s.len(), out.len(), &self.expr)?; + AggState::NotAggregated(c) => { + let (out, aggregated) = (self.eval_and_flatten(&mut [c.clone()])?, false); + check_map_output_len(c.len(), out.len(), &self.expr)?; (out, aggregated) }, agg_state => { - ac.with_agg_state(agg_state.try_map(|s| { - self.eval_and_flatten(&mut [s.clone().into()]) - .map(|c| c.as_materialized_series().clone()) - })?); + ac.with_agg_state(agg_state.try_map(|s| self.eval_and_flatten(&mut [s.clone()]))?); return Ok(ac); }, }; - ac.with_series_and_args(s, aggregated, Some(&self.expr), true)?; + ac.with_values_and_args(c, aggregated, Some(&self.expr), true)?; Ok(ac) } fn apply_multiple_group_aware<'a>( @@ -385,11 +378,8 @@ impl PhysicalExpr for ApplyExpr { match self.collect_groups { ApplyOptions::ApplyList => { - let s = self - .eval_and_flatten(&mut [ac.aggregated().into()])? 
- .as_materialized_series() - .clone(); - ac.with_series(s, true, Some(&self.expr))?; + let c = self.eval_and_flatten(&mut [ac.aggregated()])?; + ac.with_values(c, true, Some(&self.expr))?; Ok(ac) }, ApplyOptions::GroupWise => self.apply_single_group_aware(ac), @@ -400,18 +390,12 @@ impl PhysicalExpr for ApplyExpr { match self.collect_groups { ApplyOptions::ApplyList => { - let mut s = acs - .iter_mut() - .map(|ac| ac.aggregated().into()) - .collect::>(); - let s = self - .eval_and_flatten(&mut s)? - .as_materialized_series() - .clone(); + let mut c = acs.iter_mut().map(|ac| ac.aggregated()).collect::>(); + let c = self.eval_and_flatten(&mut c)?; // take the first aggregation context that as that is the input series let mut ac = acs.swap_remove(0); ac.with_update_groups(UpdateGroups::WithGroupsLen); - ac.with_series(s, true, Some(&self.expr))?; + ac.with_values(c, true, Some(&self.expr))?; Ok(ac) }, ApplyOptions::GroupWise => self.apply_multiple_group_aware(acs, df), @@ -487,7 +471,7 @@ fn apply_multiple_elementwise<'a>( let other = acs[1..] .iter() - .map(|ac| ac.flat_naive().into_owned().into()) + .map(|ac| ac.flat_naive().into_owned()) .collect::>(); let out = ca.apply_to_inner(&|s| { @@ -501,14 +485,14 @@ fn apply_multiple_elementwise<'a>( .clone()) })?; let mut ac = acs.swap_remove(0); - ac.with_series(out.into_series(), true, None)?; + ac.with_values(out.into_column(), true, None)?; Ok(ac) }, first_as => { let check_lengths = check_lengths && !matches!(first_as, AggState::Literal(_)); let aggregated = acs.iter().all(|ac| ac.is_aggregated() | ac.is_literal()) && acs.iter().any(|ac| ac.is_aggregated()); - let mut s = acs + let mut c = acs .iter_mut() .enumerate() .map(|(i, ac)| { @@ -523,19 +507,15 @@ fn apply_multiple_elementwise<'a>( .map(Column::from) .collect::>(); - let input_len = s[0].len(); - let s = function - .call_udf(&mut s)? 
- .unwrap() - .as_materialized_series() - .clone(); + let input_len = c[0].len(); + let c = function.call_udf(&mut c)?.unwrap(); if check_lengths { - check_map_output_len(input_len, s.len(), expr)?; + check_map_output_len(input_len, c.len(), expr)?; } // Take the first aggregation context that as that is the input series. let mut ac = acs.swap_remove(0); - ac.with_series_and_args(s, aggregated, None, true)?; + ac.with_values_and_args(c, aggregated, None, true)?; Ok(ac) }, } diff --git a/crates/polars-expr/src/expressions/binary.rs b/crates/polars-expr/src/expressions/binary.rs index 10f217844ab1..0976afc5e608 100644 --- a/crates/polars-expr/src/expressions/binary.rs +++ b/crates/polars-expr/src/expressions/binary.rs @@ -121,14 +121,14 @@ impl BinaryExpr { aggregated: bool, ) -> PolarsResult> { // We want to be able to mutate in place, so we take the lhs to make sure that we drop. - let lhs = ac_l.series().clone(); - let rhs = ac_r.series().clone(); + let lhs = ac_l.get_values().clone(); + let rhs = ac_r.get_values().clone(); // Drop lhs so that we might operate in place. 
drop(ac_l.take()); - let out = apply_operator_owned(lhs.into_column(), rhs.into_column(), self.op)?; - ac_l.with_series(out.take_materialized_series(), aggregated, Some(&self.expr))?; + let out = apply_operator_owned(lhs, rhs, self.op)?; + ac_l.with_values(out, aggregated, Some(&self.expr))?; Ok(ac_l) } @@ -137,20 +137,20 @@ impl BinaryExpr { mut ac_l: AggregationContext<'a>, mut ac_r: AggregationContext<'a>, ) -> PolarsResult> { - let name = ac_l.series().name().clone(); + let name = ac_l.get_values().name().clone(); ac_l.groups(); ac_r.groups(); polars_ensure!(ac_l.groups.len() == ac_r.groups.len(), ComputeError: "lhs and rhs should have same group length"); - let left_s = ac_l.series().rechunk().into_column(); - let right_s = ac_r.series().rechunk().into_column(); - let res_s = apply_operator(&left_s, &right_s, self.op)?; + let left_c = ac_l.get_values().rechunk().into_column(); + let right_c = ac_r.get_values().rechunk().into_column(); + let res_c = apply_operator(&left_c, &right_c, self.op)?; ac_l.with_update_groups(UpdateGroups::WithSeriesLen); - let res_s = if res_s.len() == 1 { - res_s.new_from_index(0, ac_l.groups.len()) + let res_s = if res_c.len() == 1 { + res_c.new_from_index(0, ac_l.groups.len()) } else { - ListChunked::full(name, res_s.as_materialized_series(), ac_l.groups.len()).into_column() + ListChunked::full(name, res_c.as_materialized_series(), ac_l.groups.len()).into_column() }; - ac_l.with_series(res_s.take_materialized_series(), true, Some(&self.expr))?; + ac_l.with_values(res_s, true, Some(&self.expr))?; Ok(ac_l) } @@ -159,7 +159,7 @@ impl BinaryExpr { mut ac_l: AggregationContext<'a>, mut ac_r: AggregationContext<'a>, ) -> PolarsResult> { - let name = ac_l.series().name().clone(); + let name = ac_l.get_values().name().clone(); let ca = ac_l .iter_groups(false) .zip(ac_r.iter_groups(false)) @@ -175,7 +175,7 @@ impl BinaryExpr { .with_name(name); ac_l.with_update_groups(UpdateGroups::WithSeriesLen); - 
ac_l.with_agg_state(AggState::AggregatedList(ca.into_series())); + ac_l.with_agg_state(AggState::AggregatedList(ca.into_column())); Ok(ac_l) } } @@ -260,7 +260,7 @@ impl PhysicalExpr for BinaryExpr { apply_operator(&lhs.into_column(), &rhs.get_inner().into_column(), self.op) .map(|c| c.take_materialized_series()) })?; - ac_l.with_series(out.into_series(), true, Some(&self.expr))?; + ac_l.with_values(out.into_column(), true, Some(&self.expr))?; Ok(ac_l) }, _ => self.apply_group_aware(ac_l, ac_r), diff --git a/crates/polars-expr/src/expressions/cast.rs b/crates/polars-expr/src/expressions/cast.rs index 95f0c9eebee5..1bc230ceab8f 100644 --- a/crates/polars-expr/src/expressions/cast.rs +++ b/crates/polars-expr/src/expressions/cast.rs @@ -59,14 +59,14 @@ impl PhysicalExpr for CastExpr { self.finish(&s.into_column()) .map(|c| c.take_materialized_series()) })?; - ac.with_series(casted.into_series(), true, None)?; + ac.with_values(casted.into_column(), true, None)?; }, AggState::AggregatedScalar(s) => { let s = self.finish(&s.clone().into_column())?; if ac.is_literal() { - ac.with_literal(s.take_materialized_series()); + ac.with_literal(s); } else { - ac.with_series(s.take_materialized_series(), true, None)?; + ac.with_values(s, true, None)?; } }, _ => { @@ -77,9 +77,9 @@ impl PhysicalExpr for CastExpr { let s = self.finish(&s.as_ref().clone().into_column())?; if ac.is_literal() { - ac.with_literal(s.take_materialized_series()); + ac.with_literal(s); } else { - ac.with_series(s.take_materialized_series(), false, None)?; + ac.with_values(s, false, None)?; } }, } diff --git a/crates/polars-expr/src/expressions/column.rs b/crates/polars-expr/src/expressions/column.rs index 2142d22df6d9..99b5ba9fe262 100644 --- a/crates/polars-expr/src/expressions/column.rs +++ b/crates/polars-expr/src/expressions/column.rs @@ -21,9 +21,9 @@ impl ColumnExpr { impl ColumnExpr { fn check_external_context( &self, - out: PolarsResult, + out: PolarsResult, state: &ExecutionState, - ) -> 
PolarsResult { + ) -> PolarsResult { match out { Ok(col) => Ok(col), Err(e) => { @@ -33,7 +33,7 @@ impl ColumnExpr { for df in state.ext_contexts.as_ref() { let out = df.column(&self.name); if out.is_ok() { - return out.map(Column::as_materialized_series).cloned(); + return out.cloned(); } } Err(e) @@ -44,12 +44,12 @@ impl ColumnExpr { fn process_by_idx( &self, - out: &Series, + out: &Column, _state: &ExecutionState, _schema: &Schema, df: &DataFrame, check_state_schema: bool, - ) -> PolarsResult { + ) -> PolarsResult { if out.name() != &*self.name { if check_state_schema { if let Some(schema) = _state.get_schema() { @@ -75,9 +75,7 @@ impl ColumnExpr { // in release we fallback to linear search #[allow(unreachable_code)] { - df.column(&self.name) - .map(Column::as_materialized_series) - .cloned() + df.column(&self.name).cloned() } } else { Ok(out.clone()) @@ -88,7 +86,7 @@ impl ColumnExpr { df: &DataFrame, _state: &ExecutionState, _panic_during_test: bool, - ) -> PolarsResult { + ) -> PolarsResult { #[cfg(feature = "panic_on_schema")] { if _panic_during_test @@ -100,9 +98,7 @@ impl ColumnExpr { } // in release we fallback to linear search #[allow(unreachable_code)] - df.column(&self.name) - .map(Column::as_materialized_series) - .cloned() + df.column(&self.name).cloned() } fn process_from_state_schema( @@ -110,19 +106,17 @@ impl ColumnExpr { df: &DataFrame, state: &ExecutionState, schema: &Schema, - ) -> PolarsResult { + ) -> PolarsResult { match schema.get_full(&self.name) { None => self.process_by_linear_search(df, state, true), Some((idx, _, _)) => match df.get_columns().get(idx) { - Some(out) => { - self.process_by_idx(out.as_materialized_series(), state, schema, df, false) - }, + Some(out) => self.process_by_idx(out, state, schema, df, false), None => self.process_by_linear_search(df, state, true), }, } } - fn process_cse(&self, df: &DataFrame, schema: &Schema) -> PolarsResult { + fn process_cse(&self, df: &DataFrame, schema: &Schema) -> PolarsResult { // The 
CSE columns are added on the rhs. let offset = schema.len(); let columns = &df.get_columns()[offset..]; @@ -131,7 +125,6 @@ impl ColumnExpr { .iter() .find(|s| s.name() == &self.name) .unwrap() - .as_materialized_series() .clone()) } } @@ -146,13 +139,7 @@ impl PhysicalExpr for ColumnExpr { // check if the schema was correct // if not do O(n) search match df.get_columns().get(idx) { - Some(out) => self.process_by_idx( - out.as_materialized_series(), - state, - &self.schema, - df, - true, - ), + Some(out) => self.process_by_idx(out, state, &self.schema, df, true), None => { // partitioned group_by special case if let Some(schema) = state.get_schema() { @@ -183,12 +170,8 @@ impl PhysicalExpr for ColumnExpr { groups: &'a GroupsProxy, state: &ExecutionState, ) -> PolarsResult> { - let s = self.evaluate(df, state)?; - Ok(AggregationContext::new( - s.take_materialized_series(), - Cow::Borrowed(groups), - false, - )) + let c = self.evaluate(df, state)?; + Ok(AggregationContext::new(c, Cow::Borrowed(groups), false)) } fn as_partitioned_aggregator(&self) -> Option<&dyn PartitionedAggregation> { diff --git a/crates/polars-expr/src/expressions/count.rs b/crates/polars-expr/src/expressions/count.rs index 6102caf5a354..db25f0d9e73b 100644 --- a/crates/polars-expr/src/expressions/count.rs +++ b/crates/polars-expr/src/expressions/count.rs @@ -32,8 +32,8 @@ impl PhysicalExpr for CountExpr { _state: &ExecutionState, ) -> PolarsResult> { let ca = groups.group_count().with_name(PlSmallStr::from_static(LEN)); - let s = ca.into_series(); - Ok(AggregationContext::new(s, Cow::Borrowed(groups), true)) + let c = ca.into_column(); + Ok(AggregationContext::new(c, Cow::Borrowed(groups), true)) } fn to_field(&self, _input_schema: &Schema) -> PolarsResult { diff --git a/crates/polars-expr/src/expressions/filter.rs b/crates/polars-expr/src/expressions/filter.rs index 6f847a7fa8ed..f2b1383059ee 100644 --- a/crates/polars-expr/src/expressions/filter.rs +++ 
b/crates/polars-expr/src/expressions/filter.rs @@ -73,7 +73,7 @@ impl PhysicalExpr for FilterExpr { .with_name(s.name().clone()) } }; - ac_s.with_series(out.into_series(), true, Some(&self.expr))?; + ac_s.with_values(out.into_column(), true, Some(&self.expr))?; ac_s.update_groups = WithSeriesLen; Ok(ac_s) } else { diff --git a/crates/polars-expr/src/expressions/gather.rs b/crates/polars-expr/src/expressions/gather.rs index 19a0e35ff315..5c0ccae4f2bc 100644 --- a/crates/polars-expr/src/expressions/gather.rs +++ b/crates/polars-expr/src/expressions/gather.rs @@ -2,7 +2,7 @@ use arrow::legacy::utils::CustomIterTools; use polars_core::chunked_array::builder::get_list_builder; use polars_core::prelude::*; use polars_core::utils::NoNull; -use polars_ops::prelude::{convert_to_unsigned_index, is_positive_idx_uncertain}; +use polars_ops::prelude::{convert_to_unsigned_index, is_positive_idx_uncertain_col}; use super::*; use crate::expressions::{AggState, AggregationContext, PhysicalExpr, UpdateGroups}; @@ -33,14 +33,14 @@ impl PhysicalExpr for GatherExpr { let mut ac = self.phys_expr.evaluate_on_groups(df, groups, state)?; let mut idx = self.idx.evaluate_on_groups(df, groups, state)?; - let s_idx = idx.series(); - match s_idx.dtype() { + let c_idx = idx.get_values(); + match c_idx.dtype() { DataType::List(inner) => { polars_ensure!(inner.is_integer(), InvalidOperation: "expected numeric dtype as index, got {:?}", inner) }, dt if dt.is_integer() => { // Unsigned integers will fall through and will use faster paths. - if !is_positive_idx_uncertain(s_idx) { + if !is_positive_idx_uncertain_col(c_idx) { return self.process_negative_indices_agg(ac, idx, groups); } }, @@ -80,10 +80,10 @@ impl PhysicalExpr for GatherExpr { .map(|(s, idx)| Some(s?.as_ref().take(idx?.as_ref().idx().unwrap()))) .map(|opt_res| opt_res.transpose()) .collect::>()? 
- .with_name(ac.series().name().clone()) + .with_name(ac.get_values().name().clone()) }; - ac.with_series(taken.into_series(), true, Some(&self.expr))?; + ac.with_values(taken.into_column(), true, Some(&self.expr))?; ac.with_update_groups(UpdateGroups::WithSeriesLen); Ok(ac) } @@ -162,10 +162,10 @@ impl GatherExpr { let taken = if self.returns_scalar { taken } else { - taken.as_list().into_series() + taken.as_list().into_column() }; - ac.with_series(taken, true, Some(&self.expr))?; + ac.with_values(taken, true, Some(&self.expr))?; Ok(ac) } else { self.gather_aggregated_expensive(ac, idx) @@ -183,7 +183,7 @@ impl GatherExpr { .unwrap() .try_apply_amortized(|s| s.as_ref().take(idx))?; - ac.with_series(out.into_series(), true, Some(&self.expr))?; + ac.with_values(out.into_column(), true, Some(&self.expr))?; ac.with_update_groups(UpdateGroups::WithGroupsLen); Ok(ac) } @@ -228,10 +228,10 @@ impl GatherExpr { let taken = if self.returns_scalar { taken } else { - taken.as_list().into_series() + taken.as_list().into_column() }; - ac.with_series(taken, true, Some(&self.expr))?; + ac.with_values(taken, true, Some(&self.expr))?; ac.with_update_groups(UpdateGroups::WithGroupsLen); Ok(ac) }, @@ -249,9 +249,9 @@ impl GatherExpr { ) -> PolarsResult> { let mut builder = get_list_builder( &ac.dtype(), - idx.series().len(), + idx.get_values().len(), groups.len(), - ac.series().name().clone(), + ac.get_values().name().clone(), ); let iter = ac.iter_groups(false).zip(idx.iter_groups(false)); @@ -265,7 +265,7 @@ impl GatherExpr { _ => builder.append_null(), }; } - let out = builder.finish().into_series(); + let out = builder.finish().into_column(); ac.with_agg_state(AggState::AggregatedList(out)); Ok(ac) } diff --git a/crates/polars-expr/src/expressions/group_iter.rs b/crates/polars-expr/src/expressions/group_iter.rs index b42851e49d2a..31a694fe4a86 100644 --- a/crates/polars-expr/src/expressions/group_iter.rs +++ b/crates/polars-expr/src/expressions/group_iter.rs @@ -12,45 +12,45 @@ 
impl AggregationContext<'_> { match self.agg_state() { AggState::Literal(_) => { self.groups(); - let s = self.series().rechunk(); + let c = self.get_values().rechunk(); let name = if keep_names { - s.name().clone() + c.name().clone() } else { PlSmallStr::EMPTY }; // SAFETY: dtype is correct unsafe { Box::new(LitIter::new( - s.array_ref(0).clone(), + c.as_materialized_series().array_ref(0).clone(), self.groups.len(), - s._dtype(), + c.dtype(), name, )) } }, AggState::AggregatedScalar(_) => { self.groups(); - let s = self.series(); + let c = self.get_values(); let name = if keep_names { - s.name().clone() + c.name().clone() } else { PlSmallStr::EMPTY }; // SAFETY: dtype is correct unsafe { Box::new(FlatIter::new( - s.chunks(), + c.as_materialized_series().chunks(), self.groups.len(), - s.dtype(), + c.dtype(), name, )) } }, AggState::AggregatedList(_) => { - let s = self.series(); - let list = s.list().unwrap(); + let c = self.get_values(); + let list = c.list().unwrap(); let name = if keep_names { - s.name().clone() + c.name().clone() } else { PlSmallStr::EMPTY }; @@ -59,10 +59,10 @@ impl AggregationContext<'_> { AggState::NotAggregated(_) => { // we don't take the owned series as we want a reference let _ = self.aggregated(); - let s = self.series(); - let list = s.list().unwrap(); + let c = self.get_values(); + let list = c.list().unwrap(); let name = if keep_names { - s.name().clone() + c.name().clone() } else { PlSmallStr::EMPTY }; diff --git a/crates/polars-expr/src/expressions/literal.rs b/crates/polars-expr/src/expressions/literal.rs index 0ab9ad9872b3..e2ea2f6d0f90 100644 --- a/crates/polars-expr/src/expressions/literal.rs +++ b/crates/polars-expr/src/expressions/literal.rs @@ -139,10 +139,7 @@ impl PhysicalExpr for LiteralExpr { state: &ExecutionState, ) -> PolarsResult> { let s = self.evaluate(df, state)?; - Ok(AggregationContext::from_literal( - s.take_materialized_series(), - Cow::Borrowed(groups), - )) + Ok(AggregationContext::from_literal(s, 
Cow::Borrowed(groups))) } fn as_partitioned_aggregator(&self) -> Option<&dyn PartitionedAggregation> { diff --git a/crates/polars-expr/src/expressions/mod.rs b/crates/polars-expr/src/expressions/mod.rs index 277afddb41f2..70963dde7eec 100644 --- a/crates/polars-expr/src/expressions/mod.rs +++ b/crates/polars-expr/src/expressions/mod.rs @@ -48,28 +48,28 @@ use crate::state::ExecutionState; #[derive(Clone, Debug)] pub enum AggState { - /// Already aggregated: `.agg_list(group_tuples`) is called + /// Already aggregated: `.agg_list(group_tuples)` is called /// and produced a `Series` of dtype `List` - AggregatedList(Series), + AggregatedList(Column), /// Already aggregated: `.agg` is called on an aggregation /// that produces a scalar. /// think of `sum`, `mean`, `variance` like aggregations. - AggregatedScalar(Series), + AggregatedScalar(Column), /// Not yet aggregated: `agg_list` still has to be called. - NotAggregated(Series), - Literal(Series), + NotAggregated(Column), + Literal(Column), } impl AggState { fn try_map(&self, func: F) -> PolarsResult where - F: FnOnce(&Series) -> PolarsResult, + F: FnOnce(&Column) -> PolarsResult, { Ok(match self { - AggState::AggregatedList(s) => AggState::AggregatedList(func(s)?), - AggState::AggregatedScalar(s) => AggState::AggregatedScalar(func(s)?), - AggState::Literal(s) => AggState::Literal(func(s)?), - AggState::NotAggregated(s) => AggState::NotAggregated(func(s)?), + AggState::AggregatedList(c) => AggState::AggregatedList(func(c)?), + AggState::AggregatedScalar(c) => AggState::AggregatedScalar(func(c)?), + AggState::Literal(c) => AggState::Literal(func(c)?), + AggState::NotAggregated(c) => AggState::NotAggregated(func(c)?), }) } } @@ -152,14 +152,14 @@ impl<'a> AggregationContext<'a> { self.update_groups = UpdateGroups::No; }, UpdateGroups::WithSeriesLen => { - let s = self.series().clone(); - self.det_groups_from_list(&s); + let s = self.get_values().clone(); + self.det_groups_from_list(s.as_materialized_series()); }, } 
&self.groups } - pub(crate) fn series(&self) -> &Series { + pub(crate) fn get_values(&self) -> &Column { match &self.state { AggState::NotAggregated(s) | AggState::AggregatedScalar(s) @@ -191,20 +191,20 @@ impl<'a> AggregationContext<'a> { /// - `aggregated` sets if the Series is a list due to aggregation (could also be a list because its /// the columns dtype) fn new( - series: Series, + column: Column, groups: Cow<'a, GroupsProxy>, aggregated: bool, ) -> AggregationContext<'a> { - let series = match (aggregated, series.dtype()) { + let series = match (aggregated, column.dtype()) { (true, &DataType::List(_)) => { - assert_eq!(series.len(), groups.len()); - AggState::AggregatedList(series) + assert_eq!(column.len(), groups.len()); + AggState::AggregatedList(column) }, (true, _) => { - assert_eq!(series.len(), groups.len()); - AggState::AggregatedScalar(series) + assert_eq!(column.len(), groups.len()); + AggState::AggregatedScalar(column) }, - _ => AggState::NotAggregated(series), + _ => AggState::NotAggregated(column), }; Self { @@ -230,7 +230,7 @@ impl<'a> AggregationContext<'a> { } } - fn from_literal(lit: Series, groups: Cow<'a, GroupsProxy>) -> AggregationContext<'a> { + fn from_literal(lit: Column, groups: Cow<'a, GroupsProxy>) -> AggregationContext<'a> { Self { state: AggState::Literal(lit), groups, @@ -283,7 +283,7 @@ impl<'a> AggregationContext<'a> { }, _ => { let groups = { - self.series() + self.get_values() .list() .expect("impl error, should be a list at this point") .amortized_iter() @@ -312,27 +312,27 @@ impl<'a> AggregationContext<'a> { /// # Arguments /// - `aggregated` sets if the Series is a list due to aggregation (could also be a list because its /// the columns dtype) - pub(crate) fn with_series( + pub(crate) fn with_values( &mut self, - series: Series, + column: Column, aggregated: bool, expr: Option<&Expr>, ) -> PolarsResult<&mut Self> { - self.with_series_and_args(series, aggregated, expr, false) + self.with_values_and_args(column, 
aggregated, expr, false) } - pub(crate) fn with_series_and_args( + pub(crate) fn with_values_and_args( &mut self, - series: Series, + column: Column, aggregated: bool, expr: Option<&Expr>, // if the applied function was a `map` instead of an `apply` // this will keep functions applied over literals as literals: F(lit) = lit mapped: bool, ) -> PolarsResult<&mut Self> { - self.state = match (aggregated, series.dtype()) { + self.state = match (aggregated, column.dtype()) { (true, &DataType::List(_)) => { - if series.len() != self.groups.len() { + if column.len() != self.groups.len() { let fmt_expr = if let Some(e) = expr { format!("'{e:?}' ") } else { @@ -342,30 +342,30 @@ impl<'a> AggregationContext<'a> { ComputeError: "aggregation expression '{}' produced a different number of elements: {} \ than the number of groups: {} (this is likely invalid)", - fmt_expr, series.len(), self.groups.len(), + fmt_expr, column.len(), self.groups.len(), ); } - AggState::AggregatedList(series) + AggState::AggregatedList(column) }, - (true, _) => AggState::AggregatedScalar(series), + (true, _) => AggState::AggregatedScalar(column), _ => { match self.state { // already aggregated to sum, min even this series was flattened it never could // retrieve the length before grouping, so it stays in this state. 
- AggState::AggregatedScalar(_) => AggState::AggregatedScalar(series), + AggState::AggregatedScalar(_) => AggState::AggregatedScalar(column), // applying a function on a literal, keeps the literal state - AggState::Literal(_) if series.len() == 1 && mapped => { - AggState::Literal(series) + AggState::Literal(_) if column.len() == 1 && mapped => { + AggState::Literal(column) }, - _ => AggState::NotAggregated(series), + _ => AggState::NotAggregated(column.into_column()), } }, }; Ok(self) } - pub(crate) fn with_literal(&mut self, series: Series) -> &mut Self { - self.state = AggState::Literal(series); + pub(crate) fn with_literal(&mut self, column: Column) -> &mut Self { + self.state = AggState::Literal(column); self } @@ -373,7 +373,7 @@ impl<'a> AggregationContext<'a> { pub(crate) fn with_groups(&mut self, groups: GroupsProxy) -> &mut Self { if let AggState::AggregatedList(_) = self.agg_state() { // In case of new groups, a series always needs to be flattened - self.with_series(self.flat_naive().into_owned(), false, None) + self.with_values(self.flat_naive().into_owned(), false, None) .unwrap(); } self.groups = Cow::Owned(groups); @@ -383,7 +383,7 @@ impl<'a> AggregationContext<'a> { } /// Get the aggregated version of the series. - pub fn aggregated(&mut self) -> Series { + pub fn aggregated(&mut self) -> Column { // we clone, because we only want to call `self.groups()` if needed. // self groups may instantiate new groups and thus can be expensive. 
match self.state.clone() { @@ -409,7 +409,7 @@ impl<'a> AggregationContext<'a> { self.update_groups = UpdateGroups::WithGroupsLen; out }, - AggState::AggregatedList(s) | AggState::AggregatedScalar(s) => s, + AggState::AggregatedList(s) | AggState::AggregatedScalar(s) => s.into_column(), AggState::Literal(s) => { self.groups(); let rows = self.groups.len(); @@ -421,21 +421,21 @@ impl<'a> AggregationContext<'a> { ]) .unwrap(); self.state = AggState::AggregatedList(out.clone()); - out + out.into_column() }, } } /// Get the final aggregated version of the series. - pub fn finalize(&mut self) -> Series { + pub fn finalize(&mut self) -> Column { // we clone, because we only want to call `self.groups()` if needed. // self groups may instantiate new groups and thus can be expensive. match &self.state { - AggState::Literal(s) => { - let s = s.clone(); + AggState::Literal(c) => { + let c = c.clone(); self.groups(); let rows = self.groups.len(); - s.new_from_index(0, rows) + c.new_from_index(0, rows) }, _ => self.aggregated(), } @@ -452,15 +452,15 @@ impl<'a> AggregationContext<'a> { } } - pub fn get_final_aggregation(mut self) -> (Series, Cow<'a, GroupsProxy>) { + pub fn get_final_aggregation(mut self) -> (Column, Cow<'a, GroupsProxy>) { let _ = self.groups(); let groups = self.groups; match self.state { - AggState::NotAggregated(s) => (s, groups), - AggState::AggregatedScalar(s) => (s, groups), - AggState::Literal(s) => (s, groups), - AggState::AggregatedList(s) => { - let flattened = s.explode().unwrap(); + AggState::NotAggregated(c) => (c, groups), + AggState::AggregatedScalar(c) => (c, groups), + AggState::Literal(c) => (c, groups), + AggState::AggregatedList(c) => { + let flattened = c.explode().unwrap(); let groups = groups.into_owned(); // unroll the possible flattened state // say we have groups with overlapping windows: @@ -496,10 +496,10 @@ impl<'a> AggregationContext<'a> { /// Note that we call it naive, because if a previous expr /// has filtered or sorted this, 
this information is in the /// group tuples not the flattened series. - pub(crate) fn flat_naive(&self) -> Cow<'_, Series> { + pub(crate) fn flat_naive(&self) -> Cow<'_, Column> { match &self.state { - AggState::NotAggregated(s) => Cow::Borrowed(s), - AggState::AggregatedList(s) => { + AggState::NotAggregated(c) => Cow::Borrowed(c), + AggState::AggregatedList(c) => { #[cfg(debug_assertions)] { // panic so we find cases where we accidentally explode overlapping groups @@ -509,22 +509,22 @@ impl<'a> AggregationContext<'a> { } } - Cow::Owned(s.explode().unwrap()) + Cow::Owned(c.explode().unwrap()) }, - AggState::AggregatedScalar(s) => Cow::Borrowed(s), - AggState::Literal(s) => Cow::Borrowed(s), + AggState::AggregatedScalar(c) => Cow::Borrowed(c), + AggState::Literal(c) => Cow::Borrowed(c), } } /// Take the series. - pub(crate) fn take(&mut self) -> Series { - let s = match &mut self.state { - AggState::NotAggregated(s) - | AggState::AggregatedScalar(s) - | AggState::AggregatedList(s) => s, - AggState::Literal(s) => s, + pub(crate) fn take(&mut self) -> Column { + let c = match &mut self.state { + AggState::NotAggregated(c) + | AggState::AggregatedScalar(c) + | AggState::AggregatedList(c) => c, + AggState::Literal(c) => c, }; - std::mem::take(s) + std::mem::take(c) } } diff --git a/crates/polars-expr/src/expressions/slice.rs b/crates/polars-expr/src/expressions/slice.rs index 2b805edd1bb0..0c2688d7999a 100644 --- a/crates/polars-expr/src/expressions/slice.rs +++ b/crates/polars-expr/src/expressions/slice.rs @@ -1,5 +1,5 @@ use polars_core::prelude::*; -use polars_core::utils::{slice_offsets, Container, CustomIterTools}; +use polars_core::utils::{slice_offsets, CustomIterTools}; use polars_core::POOL; use rayon::prelude::*; use AnyValue::Null; @@ -14,7 +14,7 @@ pub struct SliceExpr { pub(crate) expr: Expr, } -fn extract_offset(offset: &Series, expr: &Expr) -> PolarsResult { +fn extract_offset(offset: &Column, expr: &Expr) -> PolarsResult { polars_ensure!( offset.len() 
<= 1, expr = expr, ComputeError: "invalid argument to slice; expected an offset literal, got series of length {}", @@ -25,7 +25,7 @@ fn extract_offset(offset: &Series, expr: &Expr) -> PolarsResult { ) } -fn extract_length(length: &Series, expr: &Expr) -> PolarsResult { +fn extract_length(length: &Column, expr: &Expr) -> PolarsResult { polars_ensure!( length.len() <= 1, expr = expr, ComputeError: "invalid argument to slice; expected a length literal, got series of length {}", @@ -39,11 +39,11 @@ fn extract_length(length: &Series, expr: &Expr) -> PolarsResult { } } -fn extract_args(offset: &Series, length: &Series, expr: &Expr) -> PolarsResult<(i64, usize)> { +fn extract_args(offset: &Column, length: &Column, expr: &Expr) -> PolarsResult<(i64, usize)> { Ok((extract_offset(offset, expr)?, extract_length(length, expr)?)) } -fn check_argument(arg: &Series, groups: &GroupsProxy, name: &str, expr: &Expr) -> PolarsResult<()> { +fn check_argument(arg: &Column, groups: &GroupsProxy, name: &str, expr: &Expr) -> PolarsResult<()> { polars_ensure!( !matches!(arg.dtype(), DataType::List(_)), expr = expr, ComputeError: "invalid slice argument: cannot use an array as {} argument", name, @@ -92,11 +92,7 @@ impl PhysicalExpr for SliceExpr { let offset = &results[0]; let length = &results[1]; let series = &results[2]; - let (offset, length) = extract_args( - offset.as_materialized_series(), - length.as_materialized_series(), - &self.expr, - )?; + let (offset, length) = extract_args(offset, length, &self.expr)?; Ok(series.slice(offset, length)) } diff --git a/crates/polars-expr/src/expressions/sort.rs b/crates/polars-expr/src/expressions/sort.rs index be9fe57e29ce..df816f9b48e7 100644 --- a/crates/polars-expr/src/expressions/sort.rs +++ b/crates/polars-expr/src/expressions/sort.rs @@ -63,7 +63,7 @@ impl PhysicalExpr for SortExpr { AggState::AggregatedList(s) => { let ca = s.list().unwrap(); let out = ca.lst_sort(self.options)?; - ac.with_series(out.into_series(), true, 
Some(&self.expr))?; + ac.with_values(out.into_column(), true, Some(&self.expr))?; }, _ => { let series = ac.flat_naive().into_owned(); diff --git a/crates/polars-expr/src/expressions/sortby.rs b/crates/polars-expr/src/expressions/sortby.rs index fad081cb49ed..ed34ed6414cd 100644 --- a/crates/polars-expr/src/expressions/sortby.rs +++ b/crates/polars-expr/src/expressions/sortby.rs @@ -133,8 +133,8 @@ fn sort_by_groups_no_match_single<'a>( }) .collect_ca_with_dtype(PlSmallStr::EMPTY, dtype) }); - let s = ca?.with_name(s_in.name().clone()).into_series(); - ac_in.with_series(s, true, Some(expr))?; + let c = ca?.with_name(s_in.name().clone()).into_column(); + ac_in.with_values(c, true, Some(expr))?; Ok(ac_in) } @@ -281,12 +281,16 @@ impl PhysicalExpr for SortByExpr { .collect::>>()?; let mut sort_by_s = ac_sort_by .iter() - .map(|s| { - let s = s.flat_naive(); - match s.dtype() { + .map(|c| { + let c = c.flat_naive(); + match c.dtype() { #[cfg(feature = "dtype-categorical")] - DataType::Categorical(_, _) | DataType::Enum(_, _) => s.into_owned(), - _ => s.to_physical_repr().into_owned(), + DataType::Categorical(_, _) | DataType::Enum(_, _) => { + c.as_materialized_series().clone() + }, + // @scalar-opt + // @partition-opt + _ => c.to_physical_repr().take_materialized_series(), } }) .collect::>(); @@ -363,7 +367,7 @@ impl PhysicalExpr for SortByExpr { // group_by operation - we must ensure that we are as well. if ordered_by_group_operation { let s = ac_in.aggregated(); - ac_in.with_series(s.explode().unwrap(), false, None)?; + ac_in.with_values(s.explode().unwrap(), false, None)?; } ac_in.with_groups(groups); diff --git a/crates/polars-expr/src/expressions/ternary.rs b/crates/polars-expr/src/expressions/ternary.rs index 2d1035c22eb7..bbd0c5f7d936 100644 --- a/crates/polars-expr/src/expressions/ternary.rs +++ b/crates/polars-expr/src/expressions/ternary.rs @@ -56,13 +56,13 @@ fn finish_as_iters<'a>( .transpose() }) .collect::>()? 
- .with_name(ac_truthy.series().name().clone()); + .with_name(ac_truthy.get_values().name().clone()); // Aggregation leaves only a single chunk. let arr = ca.downcast_iter().next().unwrap(); let list_vals_len = arr.values().len(); - let mut out = ca.into_series(); + let mut out = ca.into_column(); if ac_truthy.arity_should_explode() && ac_falsy.arity_should_explode() && ac_mask.arity_should_explode() && // Exploded list should be equal to groups length. list_vals_len == ac_truthy.groups.len() @@ -70,7 +70,7 @@ fn finish_as_iters<'a>( out = out.explode()? } - ac_truthy.with_series(out, true, None)?; + ac_truthy.with_values(out, true, None)?; Ok(ac_truthy) } @@ -168,8 +168,8 @@ impl PhysicalExpr for TernaryExpr { } let out = ac_truthy - .series() - .zip_with(ac_mask.series().bool()?, ac_falsy.series())?; + .get_values() + .zip_with(ac_mask.get_values().bool()?, ac_falsy.get_values())?; for ac in [&ac_mask, &ac_truthy, &ac_falsy].into_iter() { if matches!(ac.agg_state(), NotAggregated(_)) { @@ -257,21 +257,21 @@ impl PhysicalExpr for TernaryExpr { } let truthy = if let AggregatedList(s) = ac_truthy.agg_state() { - s.list().unwrap().get_inner() + s.list().unwrap().get_inner().into_column() } else { - ac_truthy.series().clone() + ac_truthy.get_values().clone() }; let falsy = if let AggregatedList(s) = ac_falsy.agg_state() { - s.list().unwrap().get_inner() + s.list().unwrap().get_inner().into_column() } else { - ac_falsy.series().clone() + ac_falsy.get_values().clone() }; let mask = if let AggregatedList(s) = ac_mask.agg_state() { - s.list().unwrap().get_inner() + s.list().unwrap().get_inner().into_column() } else { - ac_mask.series().clone() + ac_mask.get_values().clone() }; let out = truthy.zip_with(mask.bool()?, &falsy)?; @@ -280,8 +280,10 @@ impl PhysicalExpr for TernaryExpr { // offsets buffer of the result, so we construct the result // ListChunked directly from the 2. 
let out = out.rechunk(); - let values = out.array_ref(0); - let offsets = ac_target.series().list().unwrap().offsets()?; + // @scalar-opt + // @partition-opt + let values = out.as_materialized_series().array_ref(0); + let offsets = ac_target.get_values().list().unwrap().offsets()?; let inner_type = out.dtype(); let dtype = LargeListArray::default_datatype(values.dtype().clone()); @@ -291,11 +293,11 @@ impl PhysicalExpr for TernaryExpr { let mut out = ListChunked::with_chunk(truthy.name().clone(), out); unsafe { out.to_logical(inner_type.clone()) }; - if ac_target.series().list().unwrap()._can_fast_explode() { + if ac_target.get_values().list().unwrap()._can_fast_explode() { out.set_fast_explode(); }; - let out = out.into_series(); + let out = out.into_column(); AggregatedList(out) }, @@ -305,8 +307,8 @@ impl PhysicalExpr for TernaryExpr { } let out = ac_truthy - .series() - .zip_with(ac_mask.series().bool()?, ac_falsy.series())?; + .get_values() + .zip_with(ac_mask.get_values().bool()?, ac_falsy.get_values())?; AggregatedScalar(out) }, _ => { diff --git a/crates/polars-expr/src/expressions/window.rs b/crates/polars-expr/src/expressions/window.rs index e15a301f68b4..bbb9a1cface1 100644 --- a/crates/polars-expr/src/expressions/window.rs +++ b/crates/polars-expr/src/expressions/window.rs @@ -45,13 +45,13 @@ enum MapStrategy { impl WindowExpr { fn map_list_agg_by_arg_sort( &self, - out_column: Series, - flattened: Series, + out_column: Column, + flattened: Column, mut ac: AggregationContext, gb: GroupBy, state: &ExecutionState, cache_key: &str, - ) -> PolarsResult { + ) -> PolarsResult { // idx (new-idx, original-idx) let mut idx_mapping = Vec::with_capacity(out_column.len()); @@ -124,14 +124,14 @@ impl WindowExpr { fn map_by_arg_sort( &self, df: &DataFrame, - out_column: Series, - flattened: Series, + out_column: Column, + flattened: Column, mut ac: AggregationContext, group_by_columns: &[Column], gb: GroupBy, state: &ExecutionState, cache_key: &str, - ) -> 
PolarsResult { + ) -> PolarsResult { // we use an arg_sort to map the values back // This is a bit more complicated because the final group tuples may differ from the original @@ -656,7 +656,7 @@ impl PhysicalExpr for WindowExpr { } } -fn materialize_column(join_opt_ids: &ChunkJoinOptIds, out_column: &Series) -> Series { +fn materialize_column(join_opt_ids: &ChunkJoinOptIds, out_column: &Column) -> Column { { use arrow::Either; use polars_ops::chunked_array::TakeChunked; @@ -680,11 +680,11 @@ fn cache_gb(gb: GroupBy, state: &ExecutionState, cache_key: &str) { /// Simple reducing aggregation can be set by the groups fn set_by_groups( - s: &Series, + s: &Column, groups: &GroupsProxy, len: usize, update_groups: bool, -) -> Option { +) -> Option { if update_groups { return None; } @@ -697,7 +697,9 @@ fn set_by_groups( Some(set_numeric($ca, groups, len)) }}; } - downcast_as_macro_arg_physical!(&s, dispatch).map(|s| s.cast(dtype).unwrap()) + downcast_as_macro_arg_physical!(&s, dispatch) + .map(|s| s.cast(dtype).unwrap()) + .map(Column::from) } else { None } diff --git a/crates/polars-lazy/src/dsl/list.rs b/crates/polars-lazy/src/dsl/list.rs index c706ee9b6957..d23d99b90e5c 100644 --- a/crates/polars-lazy/src/dsl/list.rs +++ b/crates/polars-lazy/src/dsl/list.rs @@ -138,7 +138,7 @@ fn run_on_group_by_engine( let out = match ac.agg_state() { AggState::AggregatedScalar(_) => { let out = ac.aggregated(); - out.as_list().into_series() + out.as_list().into_column() }, _ => ac.aggregated(), }; diff --git a/crates/polars-lazy/src/frame/pivot.rs b/crates/polars-lazy/src/frame/pivot.rs index 4d89eebef010..70eed4d8f58c 100644 --- a/crates/polars-lazy/src/frame/pivot.rs +++ b/crates/polars-lazy/src/frame/pivot.rs @@ -29,7 +29,7 @@ impl PhysicalAggExpr for PivotExpr { )?; phys_expr .evaluate_on_groups(df, groups, &state) - .map(|mut ac| ac.aggregated()) + .map(|mut ac| ac.aggregated().take_materialized_series()) } fn root_name(&self) -> PolarsResult<&PlSmallStr> { diff --git 
a/crates/polars-mem-engine/src/executors/filter.rs b/crates/polars-mem-engine/src/executors/filter.rs index 417a7ecf766e..a47e9b6f5ed9 100644 --- a/crates/polars-mem-engine/src/executors/filter.rs +++ b/crates/polars-mem-engine/src/executors/filter.rs @@ -10,10 +10,10 @@ pub struct FilterExec { streamable: bool, } -fn series_to_mask(s: &Series) -> PolarsResult<&BooleanChunked> { - s.bool().map_err(|_| { +fn column_to_mask(c: &Column) -> PolarsResult<&BooleanChunked> { + c.bool().map_err(|_| { polars_err!( - ComputeError: "filter predicate must be of type `Boolean`, got `{}`", s.dtype() + ComputeError: "filter predicate must be of type `Boolean`, got `{}`", c.dtype() ) }) } @@ -41,11 +41,14 @@ impl FilterExec { if self.has_window { state.insert_has_window_function_flag() } - let s = self.predicate.evaluate(&df, state)?; + let c = self.predicate.evaluate(&df, state)?; if self.has_window { state.clear_window_expr_cache() } - df.filter(series_to_mask(s.as_materialized_series())?) + + // @scalar-opt + // @partition-opt + df.filter(column_to_mask(&c)?) } fn execute_chunks( @@ -54,8 +57,11 @@ impl FilterExec { state: &ExecutionState, ) -> PolarsResult { let iter = chunks.into_par_iter().map(|df| { - let s = self.predicate.evaluate(&df, state)?; - df.filter(series_to_mask(s.as_materialized_series())?) + let c = self.predicate.evaluate(&df, state)?; + + // @scalar-opt + // @partition-opt + df.filter(column_to_mask(&c)?) 
}); let df = POOL.install(|| iter.collect::>>())?; Ok(accumulate_dataframes_vertical_unchecked(df)) diff --git a/crates/polars-mem-engine/src/executors/group_by.rs b/crates/polars-mem-engine/src/executors/group_by.rs index 1ae612f64d67..437b7fb574aa 100644 --- a/crates/polars-mem-engine/src/executors/group_by.rs +++ b/crates/polars-mem-engine/src/executors/group_by.rs @@ -7,7 +7,7 @@ pub(super) fn evaluate_aggs( aggs: &[Arc], groups: &GroupsProxy, state: &ExecutionState, -) -> PolarsResult> { +) -> PolarsResult> { POOL.install(|| { aggs.par_iter() .map(|expr| { diff --git a/crates/polars-mem-engine/src/executors/stack.rs b/crates/polars-mem-engine/src/executors/stack.rs index a93d4fc72d89..0b2dbfd01da3 100644 --- a/crates/polars-mem-engine/src/executors/stack.rs +++ b/crates/polars-mem-engine/src/executors/stack.rs @@ -38,12 +38,7 @@ impl StackExec { self.options.run_parallel, )?; // We don't have to do a broadcast check as cse is not allowed to hit this. - df._add_series( - res.into_iter() - .map(|c| c.take_materialized_series()) - .collect(), - schema, - )?; + df._add_columns(res.into_iter().collect(), schema)?; Ok(df) }); @@ -100,12 +95,7 @@ impl StackExec { } } } - df._add_series( - res.into_iter() - .map(|v| v.take_materialized_series()) - .collect(), - schema, - )?; + df._add_columns(res.into_iter().collect(), schema)?; } df }; diff --git a/crates/polars-ops/src/series/ops/index.rs b/crates/polars-ops/src/series/ops/index.rs index 51811cf0c319..b56f499895ff 100644 --- a/crates/polars-ops/src/series/ops/index.rs +++ b/crates/polars-ops/src/series/ops/index.rs @@ -1,7 +1,9 @@ use num_traits::{Signed, Zero}; use polars_core::error::{polars_ensure, PolarsResult}; use polars_core::prelude::arity::unary_elementwise_values; -use polars_core::prelude::{ChunkedArray, DataType, IdxCa, PolarsIntegerType, Series, IDX_DTYPE}; +use polars_core::prelude::{ + ChunkedArray, Column, DataType, IdxCa, PolarsIntegerType, Series, IDX_DTYPE, +}; use polars_utils::index::ToIdx; fn 
convert(ca: &ChunkedArray, target_len: usize) -> PolarsResult @@ -97,3 +99,10 @@ pub fn is_positive_idx_uncertain(s: &Series) -> bool { _ => unreachable!(), } } + +/// May give false negatives because it ignores the null values. +pub fn is_positive_idx_uncertain_col(c: &Column) -> bool { + // @scalar-opt + // @partition-opt + is_positive_idx_uncertain(c.as_materialized_series()) +} From 85b8de22a1823459a38b794791903724472e4a35 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Thu, 14 Nov 2024 16:59:56 +0400 Subject: [PATCH 16/18] feat: Add an `is_literal` method to expression `meta` namespace (#19773) --- crates/polars-plan/src/dsl/meta.rs | 18 +++++++++-- crates/polars-python/src/expr/meta.rs | 4 +++ .../source/reference/expressions/meta.rst | 1 + py-polars/polars/expr/meta.py | 31 +++++++++++++++++-- .../unit/operations/namespaces/test_meta.py | 31 ++++++++++++++++++- 5 files changed, 79 insertions(+), 6 deletions(-) diff --git a/crates/polars-plan/src/dsl/meta.rs b/crates/polars-plan/src/dsl/meta.rs index b23c5181cc13..76a881f08ed1 100644 --- a/crates/polars-plan/src/dsl/meta.rs +++ b/crates/polars-plan/src/dsl/meta.rs @@ -83,9 +83,21 @@ impl MetaNameSpace { | Expr::IndexColumn(_) | Expr::Selector(_) | Expr::Wildcard => true, - Expr::Alias(_, _) | Expr::KeepName(_) | Expr::RenameAlias { .. } if allow_aliasing => { - true - }, + Expr::Alias(_, _) | Expr::KeepName(_) | Expr::RenameAlias { .. } => allow_aliasing, + _ => false, + }) + } + + /// Indicate if this expression represents a literal value (optionally aliased). 
+ pub fn is_literal(&self, allow_aliasing: bool) -> bool { + self.0.into_iter().all(|e| match e { + Expr::Literal(_) => true, + Expr::Alias(_, _) => allow_aliasing, + Expr::Cast { + expr, + dtype: DataType::Datetime(_, _), + options: CastOptions::Strict, + } if matches!(&**expr, Expr::Literal(LiteralValue::DateTime(_, _, _))) => true, _ => false, }) } diff --git a/crates/polars-python/src/expr/meta.rs b/crates/polars-python/src/expr/meta.rs index d0e3a8b3e1df..891d37d26afa 100644 --- a/crates/polars-python/src/expr/meta.rs +++ b/crates/polars-python/src/expr/meta.rs @@ -58,6 +58,10 @@ impl PyExpr { .is_column_selection(allow_aliasing) } + fn meta_is_literal(&self, allow_aliasing: bool) -> bool { + self.inner.clone().meta().is_literal(allow_aliasing) + } + fn _meta_selector_add(&self, other: PyExpr) -> PyResult { let out = self .inner diff --git a/py-polars/docs/source/reference/expressions/meta.rst b/py-polars/docs/source/reference/expressions/meta.rst index 514067e0166f..6e4428381a34 100644 --- a/py-polars/docs/source/reference/expressions/meta.rst +++ b/py-polars/docs/source/reference/expressions/meta.rst @@ -13,6 +13,7 @@ The following methods are available under the `expr.meta` attribute. Expr.meta.has_multiple_outputs Expr.meta.is_column Expr.meta.is_column_selection + Expr.meta.is_literal Expr.meta.is_regex_projection Expr.meta.ne Expr.meta.output_name diff --git a/py-polars/polars/expr/meta.py b/py-polars/polars/expr/meta.py index e6ebc6f40944..d949f7583d14 100644 --- a/py-polars/polars/expr/meta.py +++ b/py-polars/polars/expr/meta.py @@ -108,7 +108,7 @@ def is_column_selection(self, *, allow_aliasing: bool = False) -> bool: """ Indicate if this expression only selects columns (optionally with aliasing). - This can include bare columns, column matches by regex or dtype, selectors + This can include bare columns, columns matched by regex or dtype, selectors and exclude ops, and (optionally) column/expression aliasing. .. 
versionadded:: 0.20.30 @@ -116,7 +116,7 @@ def is_column_selection(self, *, allow_aliasing: bool = False) -> bool: Parameters ---------- allow_aliasing - If False (default), any aliasing is not considered pure column selection. + If False (default), any aliasing is not considered to be column selection. Set True to allow for column selection that also includes aliasing. Examples @@ -142,6 +142,33 @@ def is_column_selection(self, *, allow_aliasing: bool = False) -> bool: """ return self._pyexpr.meta_is_column_selection(allow_aliasing) + def is_literal(self, *, allow_aliasing: bool = False) -> bool: + """ + Indicate if this expression is a literal value (optionally aliased). + + .. versionadded:: 1.14 + + Parameters + ---------- + allow_aliasing + If False (default), only a bare literal will match. + Set True to also allow for aliased literals. + + Examples + -------- + >>> from datetime import datetime + >>> e = pl.lit(123) + >>> e.meta.is_literal() + True + >>> e = pl.lit(987.654321).alias("foo") + >>> e.meta.is_literal() + False + >>> e = pl.lit(datetime.now()).alias("bar") + >>> e.meta.is_literal(allow_aliasing=True) + True + """ + return self._pyexpr.meta_is_literal(allow_aliasing) + @overload def output_name(self, *, raise_if_undetermined: Literal[True] = True) -> str: ... 
diff --git a/py-polars/tests/unit/operations/namespaces/test_meta.py b/py-polars/tests/unit/operations/namespaces/test_meta.py index 38835244557e..5a0c253fbfed 100644 --- a/py-polars/tests/unit/operations/namespaces/test_meta.py +++ b/py-polars/tests/unit/operations/namespaces/test_meta.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from datetime import date, datetime, time, timedelta +from typing import TYPE_CHECKING, Any import pytest @@ -123,6 +124,34 @@ def test_is_column_selection( assert not expr.meta.is_column_selection() +@pytest.mark.parametrize( + "value", + [ + None, + 1234, + 567.89, + float("inf"), + date.today(), + datetime.now(), + time(10, 30, 45), + timedelta(hours=-24), + ["x", "y", "z"], + pl.Series([None, None]), + [[10, 20], [30, 40]], + "this is the way", + ], +) +def test_is_literal(value: Any) -> None: + e = pl.lit(value) + assert e.meta.is_literal() + + e = pl.lit(value).alias("foo") + assert not e.meta.is_literal() + + e = pl.lit(value).alias("foo") + assert e.meta.is_literal(allow_aliasing=True) + + def test_meta_is_regex_projection() -> None: e = pl.col("^.*$").name.suffix("_foo") assert e.meta.is_regex_projection() From 97c82d0e45488681a784e0742574c903b4fc2b9d Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Fri, 15 Nov 2024 00:07:06 +1100 Subject: [PATCH 17/18] fix: Fix scanning google cloud with service account credentials file (#19782) --- .../polars/io/cloud/credential_provider.py | 28 +++++++++++++++++-- py-polars/polars/meta/versions.py | 2 ++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/py-polars/polars/io/cloud/credential_provider.py b/py-polars/polars/io/cloud/credential_provider.py index 26e8ebf6826e..69f2bdbdf67a 100644 --- a/py-polars/polars/io/cloud/credential_provider.py +++ b/py-polars/polars/io/cloud/credential_provider.py @@ -150,8 +150,21 @@ class CredentialProviderGCP(CredentialProvider): at any point without it being considered a breaking change. 
""" - def __init__(self) -> None: - """Initialize a credential provider for Google Cloud (GCP).""" + def __init__( + self, + *, + scopes: Any | None = None, + request: Any | None = None, + quota_project_id: Any | None = None, + default_scopes: Any | None = None, + ) -> None: + """ + Initialize a credential provider for Google Cloud (GCP). + + Parameters + ---------- + Parameters are passed to `google.auth.default()` + """ msg = "`CredentialProviderAWS` functionality is considered unstable" issue_unstable_warning(msg) @@ -168,7 +181,16 @@ def __init__(self) -> None: # # So we just bypass it with a __dict__[] (because ruff complains about # getattr) :| - creds, _ = google.auth.__dict__["default"]() + creds, _ = google.auth.__dict__["default"]( + scopes=( + scopes + if scopes is not None + else ["https://www.googleapis.com/auth/cloud-platform"] + ), + request=request, + quota_project_id=quota_project_id, + default_scopes=default_scopes, + ) self.creds = creds def __call__(self) -> CredentialProviderFunctionReturn: diff --git a/py-polars/polars/meta/versions.py b/py-polars/polars/meta/versions.py index 425f01d91a85..f9f631dac5fd 100644 --- a/py-polars/polars/meta/versions.py +++ b/py-polars/polars/meta/versions.py @@ -68,12 +68,14 @@ def _get_dependency_list() -> list[str]: return [ "adbc_driver_manager", "altair", + "boto3", "cloudpickle", "connectorx", "deltalake", "fastexcel", "fsspec", "gevent", + "google.auth", "great_tables", "matplotlib", "nest_asyncio", From 5f11dd958947d7880a0c092b3c94a4c630a32f20 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 14 Nov 2024 14:26:16 +0100 Subject: [PATCH 18/18] fix(python): Respect schema_overrides in batched csv reader (#19755) --- crates/polars-python/src/batched_csv.rs | 11 +++++------ py-polars/polars/io/csv/batched_reader.py | 2 +- py-polars/tests/unit/io/test_csv.py | 10 ++++++++++ 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/crates/polars-python/src/batched_csv.rs 
b/crates/polars-python/src/batched_csv.rs index d6a2353c6438..a406d7b6ddf3 100644 --- a/crates/polars-python/src/batched_csv.rs +++ b/crates/polars-python/src/batched_csv.rs @@ -23,7 +23,7 @@ impl PyBatchedCsv { #[staticmethod] #[pyo3(signature = ( infer_schema_length, chunk_size, has_header, ignore_errors, n_rows, skip_rows, - projection, separator, rechunk, columns, encoding, n_threads, path, overwrite_dtype, + projection, separator, rechunk, columns, encoding, n_threads, path, schema_overrides, overwrite_dtype_slice, low_memory, comment_prefix, quote_char, null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, row_index, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma) @@ -42,7 +42,7 @@ impl PyBatchedCsv { encoding: Wrap, n_threads: Option, path: PathBuf, - overwrite_dtype: Option)>>, + schema_overrides: Option)>>, overwrite_dtype_slice: Option>>, low_memory: bool, comment_prefix: Option<&str>, @@ -73,7 +73,7 @@ impl PyBatchedCsv { None }; - let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| { + let schema_overrides = schema_overrides.map(|overwrite_dtype| { overwrite_dtype .iter() .map(|(name, dtype)| { @@ -105,6 +105,7 @@ impl PyBatchedCsv { .with_n_threads(n_threads) .with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new)) .with_low_memory(low_memory) + .with_schema_overwrite(schema_overrides.map(Arc::new)) .with_skip_rows_after_header(skip_rows_after_header) .with_row_index(row_index) .with_raise_if_empty(raise_if_empty) @@ -123,9 +124,7 @@ impl PyBatchedCsv { ) .into_reader_with_file_handle(reader); - let reader = reader - .batched(overwrite_dtype.map(Arc::new)) - .map_err(PyPolarsErr::from)?; + let reader = reader.batched(None).map_err(PyPolarsErr::from)?; Ok(PyBatchedCsv { reader: Mutex::new(reader), diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index e0384f03dde2..4207f476ee5c 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ 
b/py-polars/polars/io/csv/batched_reader.py @@ -89,7 +89,7 @@ def __init__( encoding=encoding, n_threads=n_threads, path=path, - overwrite_dtype=dtype_list, + schema_overrides=dtype_list, overwrite_dtype_slice=dtype_slice, low_memory=low_memory, comment_prefix=comment_prefix, diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 226a4e31e2b9..628dc7587387 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -2346,3 +2346,13 @@ def test_csv_read_time_dtype_overwrite(tmp_path: Path) -> None: ), df, ) + + +def test_batched_csv_schema_overrides(io_files_path: Path) -> None: + foods = io_files_path / "foods1.csv" + batched = pl.read_csv_batched(foods, schema_overrides={"calories": pl.String}) + res = batched.next_batches(1) + assert res is not None + b = res[0] + assert b["calories"].dtype == pl.String + assert b.width == 4