Skip to content

Commit

Permalink
perf: Delay selection expansion (#18011)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Aug 2, 2024
1 parent 6d2af37 commit d5265d3
Show file tree
Hide file tree
Showing 13 changed files with 126 additions and 43 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 4 additions & 9 deletions crates/polars-lazy/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -426,12 +426,9 @@ impl LazyFrame {
fn _drop<I, T>(self, columns: I, strict: bool) -> Self
where
I: IntoIterator<Item = T>,
T: AsRef<str>,
T: Into<Selector>,
{
let to_drop = columns
.into_iter()
.map(|s| s.as_ref().to_string())
.collect::<PlHashSet<_>>();
let to_drop = columns.into_iter().map(|c| c.into()).collect();

let opt_state = self.get_opt_state();
let lp = self.get_plan_builder().drop(to_drop, strict).build();
Expand All @@ -444,11 +441,10 @@ impl LazyFrame {
///
/// Any given columns that are not in the schema will give a [`PolarsError::ColumnNotFound`]
/// error while materializing the [`LazyFrame`].
#[inline]
pub fn drop<I, T>(self, columns: I) -> Self
where
I: IntoIterator<Item = T>,
T: AsRef<str>,
T: Into<Selector>,
{
self._drop(columns, true)
}
Expand All @@ -458,11 +454,10 @@ impl LazyFrame {
/// and let the projection pushdown optimize away the unneeded columns.
///
/// If a column name does not exist in the schema, it will quietly be ignored.
#[inline]
pub fn drop_no_validate<I, T>(self, columns: I) -> Self
where
I: IntoIterator<Item = T>,
T: AsRef<str>,
T: Into<Selector>,
{
self._drop(columns, false)
}
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-plan/src/dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ use polars_core::prelude::*;
use polars_core::series::ops::NullBehavior;
use polars_core::series::IsSorted;
use polars_core::utils::try_get_supertype;
pub(crate) use selector::Selector;
pub use selector::Selector;
#[cfg(feature = "dtype-struct")]
pub use struct_::*;
pub use udf::UserDefinedFunction;
Expand Down
27 changes: 25 additions & 2 deletions crates/polars-plan/src/dsl/selector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ pub enum Selector {
}

impl Selector {
#[cfg(feature = "meta")]
pub(crate) fn new(e: Expr) -> Self {
pub fn new(e: Expr) -> Self {
Self::Root(Box::new(e))
}
}
Expand Down Expand Up @@ -56,3 +55,27 @@ impl Sub for Selector {
Selector::Sub(Box::new(self), Box::new(rhs))
}
}

impl From<&str> for Selector {
fn from(value: &str) -> Self {
Selector::new(col(value))
}
}

impl From<String> for Selector {
fn from(value: String) -> Self {
Selector::new(col(value.as_ref()))
}
}

impl From<ColumnName> for Selector {
fn from(value: ColumnName) -> Self {
Selector::new(Expr::Column(value))
}
}

impl From<Expr> for Selector {
fn from(value: Expr) -> Self {
Selector::new(value)
}
}
2 changes: 1 addition & 1 deletion crates/polars-plan/src/plans/builder_dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ impl DslBuilder {
.into()
}

pub fn drop(self, to_drop: PlHashSet<String>, strict: bool) -> Self {
pub fn drop(self, to_drop: Vec<Selector>, strict: bool) -> Self {
self.map_private(DslFunction::Drop(DropFunction { to_drop, strict }))
}

Expand Down
4 changes: 4 additions & 0 deletions crates/polars-plan/src/plans/conversion/dsl_to_ir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use polars_io::path_utils::{expand_paths_hive, expanded_from_single_directory};

use super::stack_opt::ConversionOptimizer;
use super::*;
use crate::plans::conversion::expr_expansion::expand_selectors;

fn expand_expressions(
input: Node,
Expand Down Expand Up @@ -646,6 +647,9 @@ pub fn to_alp_impl(
return run_conversion(lp, lp_arena, expr_arena, convert, "fill_nan");
},
DslFunction::Drop(DropFunction { to_drop, strict }) => {
let to_drop = expand_selectors(to_drop, &input_schema, &[])?;
let to_drop = to_drop.iter().map(|s| s.as_ref()).collect::<PlHashSet<_>>();

if strict {
for col_name in to_drop.iter() {
polars_ensure!(
Expand Down
88 changes: 63 additions & 25 deletions crates/polars-plan/src/plans/conversion/expr_expansion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -840,32 +840,70 @@ fn replace_selector(expr: Expr, schema: &Schema, keys: &[Expr]) -> PolarsResult<
let mut swapped = Selector::Root(Box::new(Expr::Wildcard));
std::mem::swap(&mut s, &mut swapped);

let mut members = PlIndexSet::new();
replace_selector_inner(swapped, &mut members, &mut vec![], schema, keys)?;

if members.len() <= 1 {
Ok(Expr::Columns(
members
.into_iter()
.map(|e| {
let Expr::Column(name) = e else {
unreachable!()
};
name
})
.collect(),
))
} else {
// Ensure that multiple columns returned from combined/nested selectors remain in schema order
let selected = schema
.iter_fields()
.map(|field| ColumnName::from(field.name().as_ref()))
.filter(|field_name| members.contains(&Expr::Column(field_name.clone())))
.collect();

Ok(Expr::Columns(selected))
}
let cols = expand_selector(swapped, schema, keys)?;
Ok(Expr::Columns(cols))
},
e => Ok(e),
})
}

pub(super) fn expand_selectors(
s: Vec<Selector>,
schema: &Schema,
keys: &[Expr],
) -> PolarsResult<Arc<[ColumnName]>> {
let mut columns = vec![];

for s in s {
match s {
Selector::Root(e) => match *e {
Expr::Column(name) => columns.push(name),
Expr::Columns(names) => columns.extend_from_slice(names.as_ref()),
Expr::Selector(s) => {
let names = expand_selector(s, schema, keys)?;
columns.extend_from_slice(names.as_ref());
},
e => {
let names = expand_selector(Selector::new(e), schema, keys)?;
columns.extend_from_slice(names.as_ref());
},
},
other => {
let names = expand_selector(other, schema, keys)?;
columns.extend_from_slice(names.as_ref());
},
}
}

Ok(Arc::from(columns))
}

pub(super) fn expand_selector(
s: Selector,
schema: &Schema,
keys: &[Expr],
) -> PolarsResult<Arc<[ColumnName]>> {
let mut members = PlIndexSet::new();
replace_selector_inner(s, &mut members, &mut vec![], schema, keys)?;

if members.len() <= 1 {
Ok(members
.into_iter()
.map(|e| {
let Expr::Column(name) = e else {
unreachable!()
};
name
})
.collect())
} else {
// Ensure that multiple columns returned from combined/nested selectors remain in schema order
let selected = schema
.iter_fields()
.map(|field| ColumnName::from(field.name().as_ref()))
.filter(|field_name| members.contains(&Expr::Column(field_name.clone())))
.collect();

Ok(selected)
}
}
2 changes: 1 addition & 1 deletion crates/polars-plan/src/plans/functions/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ pub enum DslFunction {
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct DropFunction {
/// Columns that are going to be dropped
pub(crate) to_drop: PlHashSet<String>,
pub(crate) to_drop: Vec<Selector>,
/// If `true`, performs a check for each item in `to_drop` against the schema. Returns an
/// `ColumnNotFound` error if the column does not exist in the schema.
pub(crate) strict: bool,
Expand Down
1 change: 1 addition & 0 deletions py-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ polars-stream = { workspace = true }

ahash = { workspace = true }
arboard = { workspace = true, optional = true }
bytemuck = { workspace = true }
ciborium = { workspace = true }
either = { workspace = true }
itoa = { workspace = true }
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4719,7 +4719,7 @@ def drop(
│ 8.0 │
└─────┘
"""
drop_cols = _expand_selectors(self, *columns)
drop_cols = parse_into_list_of_expressions(*columns)
return self._from_pyldf(self._ldf.drop(drop_cols, strict=strict))

def rename(self, mapping: dict[str, str] | Callable[[str], str]) -> LazyFrame:
Expand Down
4 changes: 4 additions & 0 deletions py-polars/polars/selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ def is_selector(obj: Any) -> bool:
return isinstance(obj, _selector_proxy_) and hasattr(obj, "_attrs")


# TODO: Don't use this. It collects a schema.
# This should all go to IR conversion.
def expand_selector(
target: DataFrame | LazyFrame | Mapping[str, PolarsDataType],
selector: SelectorType | Expr,
Expand Down Expand Up @@ -188,6 +190,8 @@ def expand_selector(
return tuple(target.select(selector).collect_schema())


# TODO: Don't use this. It collects a schema.
# This should all go to IR conversion.
def _expand_selectors(frame: DataFrame | LazyFrame, *items: Any) -> list[Any]:
"""
Internal function that expands any selectors to column names in the given input.
Expand Down
20 changes: 18 additions & 2 deletions py-polars/src/expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ mod serde;
mod string;
mod r#struct;

use std::mem::ManuallyDrop;

use polars::lazy::dsl::Expr;
use pyo3::prelude::*;

Expand All @@ -35,7 +37,14 @@ pub(crate) trait ToExprs {
impl ToExprs for Vec<PyExpr> {
fn to_exprs(self) -> Vec<Expr> {
// SAFETY: repr is transparent.
unsafe { std::mem::transmute(self) }
unsafe {
let length = self.len();
let capacity = self.capacity();
let mut manual_drop_vec = ManuallyDrop::new(self);
let vec_ptr: *mut PyExpr = manual_drop_vec.as_mut_ptr();
let ptr: *mut Expr = vec_ptr as *mut Expr;
Vec::from_raw_parts(ptr, length, capacity)
}
}
}

Expand All @@ -46,6 +55,13 @@ pub(crate) trait ToPyExprs {
impl ToPyExprs for Vec<Expr> {
fn to_pyexprs(self) -> Vec<PyExpr> {
// SAFETY: repr is transparent.
unsafe { std::mem::transmute(self) }
unsafe {
let length = self.len();
let capacity = self.capacity();
let mut manual_drop_vec = ManuallyDrop::new(self);
let vec_ptr: *mut Expr = manual_drop_vec.as_mut_ptr();
let ptr: *mut PyExpr = vec_ptr as *mut PyExpr;
Vec::from_raw_parts(ptr, length, capacity)
}
}
}
3 changes: 2 additions & 1 deletion py-polars/src/lazyframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1162,8 +1162,9 @@ impl PyLazyFrame {
.into()
}

fn drop(&self, columns: Vec<String>, strict: bool) -> Self {
fn drop(&self, columns: Vec<PyExpr>, strict: bool) -> Self {
let ldf = self.ldf.clone();
let columns = columns.to_exprs();
if strict {
ldf.drop(columns)
} else {
Expand Down

0 comments on commit d5265d3

Please sign in to comment.