Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into faster-ci-parallelism
Browse files Browse the repository at this point in the history
  • Loading branch information
pythonspeed committed Feb 18, 2025
2 parents a98838f + de1d9d5 commit ab0d7a5
Show file tree
Hide file tree
Showing 64 changed files with 1,571 additions and 758 deletions.
2 changes: 2 additions & 0 deletions crates/polars-arrow/src/array/growable/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ impl<'a, O: Offset> GrowableUtf8<'a, O> {
/// # Panics
/// If `arrays` is empty.
pub fn new(arrays: Vec<&'a Utf8Array<O>>, mut use_validity: bool, capacity: usize) -> Self {
assert!(!arrays.is_empty());

// If any of the arrays has nulls, insertions from every array require setting
// validity bits.
if arrays.iter().any(|array| array.null_count() > 0) {
Expand Down
8 changes: 4 additions & 4 deletions crates/polars-arrow/src/array/list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ impl<O: Offset> ListArray<O> {
///
/// # Errors
/// This function returns an error iff:
/// * The last offset is not equal to the values' length.
/// * the validity's length is not equal to `offsets.len()`.
/// * `offsets.last()` is greater than `values.len()`.
/// * the validity's length is not equal to `offsets.len_proxy()`.
/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`].
/// * The `dtype`'s inner field's data type is not equal to `values.dtype`.
/// # Implementation
Expand Down Expand Up @@ -66,8 +66,8 @@ impl<O: Offset> ListArray<O> {
///
/// # Panics
/// This function panics iff:
/// * The last offset is not equal to the values' length.
/// * the validity's length is not equal to `offsets.len()`.
/// * `offsets.last()` is greater than `values.len()`.
/// * the validity's length is not equal to `offsets.len_proxy()`.
/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`].
/// * The `dtype`'s inner field's data type is not equal to `values.dtype`.
/// # Implementation
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-arrow/src/array/map/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ impl MapArray {
/// Returns a new [`MapArray`].
/// # Errors
/// This function errors iff:
/// * The last offset is not equal to the field' length
/// * `offsets.last()` is greater than `field.len()`
/// * The `dtype`'s physical type is not [`crate::datatypes::PhysicalType::Map`]
/// * The fields' `dtype` is not equal to the inner field of `dtype`
/// * The validity is not `None` and its length is different from `offsets.len() - 1`.
Expand Down Expand Up @@ -66,7 +66,7 @@ impl MapArray {

/// Creates a new [`MapArray`].
/// # Panics
/// * The last offset is not equal to the field' length.
/// * `offsets.last()` is greater than `field.len()`.
/// * The `dtype`'s physical type is not [`crate::datatypes::PhysicalType::Map`],
/// * The validity is not `None` and its length is different from `offsets.len() - 1`.
pub fn new(
Expand Down
5 changes: 4 additions & 1 deletion crates/polars-arrow/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -444,8 +444,11 @@ macro_rules! impl_sliced {
#[inline]
#[must_use]
pub fn sliced(self, offset: usize, length: usize) -> Self {
let total = offset
.checked_add(length)
.expect("offset + length overflowed");
assert!(
offset + length <= self.len(),
total <= self.len(),
"the offset of the new Buffer cannot exceed the existing length"
);
unsafe { Self::sliced_unchecked(self, offset, length) }
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/array/struct_/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ impl StructArray {

/// Slices this [`StructArray`].
/// # Panics
/// * `offset + length` must be smaller than `self.len()`.
/// Panics iff `offset + length > self.len()`.
/// # Implementation
/// This operation is `O(F)` where `F` is the number of fields.
pub fn slice(&mut self, offset: usize, length: usize) {
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/array/utf8/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ impl<O: Offset> Utf8Array<O> {
/// Creates a new [`Utf8Array`].
/// # Panics
/// This function panics iff:
/// * The last offset is greater than the values' length.
/// * `offsets.last()` is greater than `values.len()`.
/// * the validity's length is not equal to `offsets.len_proxy()`.
/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
/// * The `values` between two consecutive `offsets` are not valid utf8
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-arrow/src/array/utf8/mutable_values.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ impl<O: Offset> MutableUtf8ValuesArray<O> {
///
/// # Errors
/// This function returns an error iff:
/// * The last offset is not equal to the values' length.
/// * `offsets.last()` is greater than `values.len()`.
/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
/// * The `values` between two consecutive `offsets` are not valid utf8
/// # Implementation
Expand All @@ -92,7 +92,7 @@ impl<O: Offset> MutableUtf8ValuesArray<O> {
///
/// # Panic
/// This function does not panic iff:
/// * The last offset is equal to the values' length.
/// * `offsets.last()` is less than or equal to `values.len()`.
/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is equal to either `Utf8` or `LargeUtf8`.
///
/// # Safety
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/bitmap/immutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ impl Bitmap {

/// Creates a new [`Bitmap`] from a slice and length.
/// # Panic
/// Panics iff `length <= bytes.len() * 8`
/// Panics iff `length > slice.as_ref().len() * 8`
#[inline]
pub fn from_u8_slice<T: AsRef<[u8]>>(slice: T, length: usize) -> Self {
Bitmap::try_new(slice.as_ref().to_vec(), length).unwrap()
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/offset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ impl<O: Offset> Offsets<O> {

/// Returns a `length` corresponding to the position `index`
/// # Panic
/// This function panics iff `index >= self.len()`
/// This function panics iff `index >= self.len_proxy()`
#[inline]
pub fn length_at(&self, index: usize) -> usize {
let (start, end) = self.start_end(index);
Expand Down
25 changes: 19 additions & 6 deletions crates/polars-core/src/chunked_array/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ mod iterator;

use std::borrow::Cow;

use either::Either;

use crate::prelude::*;

impl ArrayChunked {
Expand Down Expand Up @@ -37,13 +39,24 @@ impl ArrayChunked {
return Cow::Borrowed(self);
};

assert_eq!(self.chunks().len(), physical_repr.chunks().len());
let chunk_len_validity_iter =
if physical_repr.chunks().len() == 1 && self.chunks().len() > 1 {
// Physical repr got rechunked, rechunk our validity as well.
Either::Left(std::iter::once((self.len(), self.rechunk_validity())))
} else {
// No rechunking, expect the same number of chunks.
assert_eq!(self.chunks().len(), physical_repr.chunks().len());
Either::Right(
self.chunks()
.iter()
.map(|c| (c.len(), c.validity().cloned())),
)
};

let width = self.width();
let chunks: Vec<_> = self
.downcast_iter()
let chunks: Vec<_> = chunk_len_validity_iter
.zip(physical_repr.into_chunks())
.map(|(chunk, values)| {
.map(|((len, validity), values)| {
FixedSizeListArray::new(
ArrowDataType::FixedSizeList(
Box::new(ArrowField::new(
Expand All @@ -53,9 +66,9 @@ impl ArrayChunked {
)),
width,
),
chunk.len(),
len,
values,
chunk.validity().cloned(),
validity,
)
.to_boxed()
})
Expand Down
11 changes: 9 additions & 2 deletions crates/polars-core/src/chunked_array/list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,16 @@ impl ListChunked {
return Cow::Borrowed(self);
};

assert_eq!(self.chunks().len(), physical_repr.chunks().len());
let ca = if physical_repr.chunks().len() == 1 && self.chunks().len() > 1 {
// Physical repr got rechunked, rechunk self as well.
self.rechunk()
} else {
Cow::Borrowed(self)
};

let chunks: Vec<_> = self
assert_eq!(ca.chunks().len(), physical_repr.chunks().len());

let chunks: Vec<_> = ca
.downcast_iter()
.zip(physical_repr.into_chunks())
.map(|(chunk, values)| {
Expand Down
72 changes: 48 additions & 24 deletions crates/polars-core/src/chunked_array/ops/aggregate/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -453,16 +453,16 @@ impl ChunkAggSeries for StringChunked {

#[cfg(feature = "dtype-categorical")]
impl CategoricalChunked {
fn min_categorical(&self) -> Option<&str> {
fn min_categorical(&self) -> Option<u32> {
if self.is_empty() || self.null_count() == self.len() {
return None;
}
if self.uses_lexical_ordering() {
let rev_map = self.get_rev_map();
// Fast path where all categories are used
if self._can_fast_unique() {
let c = if self._can_fast_unique() {
self.get_rev_map().get_categories().min_ignore_nan_kernel()
} else {
let rev_map = self.get_rev_map();
// SAFETY:
// Indices are in bounds
self.physical()
Expand All @@ -471,26 +471,26 @@ impl CategoricalChunked {
opt_el.map(|el| unsafe { rev_map.get_unchecked(el) })
})
.min()
}
};
rev_map.find(c.unwrap())
} else {
// SAFETY:
// Indices are in bounds
self.physical()
.min()
.map(|el| unsafe { self.get_rev_map().get_unchecked(el) })
match self._can_fast_unique() {
true => Some(0),
false => self.physical().min(),
}
}
}

fn max_categorical(&self) -> Option<&str> {
fn max_categorical(&self) -> Option<u32> {
if self.is_empty() || self.null_count() == self.len() {
return None;
}
if self.uses_lexical_ordering() {
let rev_map = self.get_rev_map();
// Fast path where all categories are used
if self._can_fast_unique() {
let c = if self._can_fast_unique() {
self.get_rev_map().get_categories().max_ignore_nan_kernel()
} else {
let rev_map = self.get_rev_map();
// SAFETY:
// Indices are in bounds
self.physical()
Expand All @@ -499,13 +499,13 @@ impl CategoricalChunked {
opt_el.map(|el| unsafe { rev_map.get_unchecked(el) })
})
.max()
}
};
rev_map.find(c.unwrap())
} else {
// SAFETY:
// Indices are in bounds
self.physical()
.max()
.map(|el| unsafe { self.get_rev_map().get_unchecked(el) })
match self._can_fast_unique() {
true => Some((self.get_rev_map().len() - 1) as u32),
false => self.physical().max(),
}
}
}
}
Expand All @@ -530,9 +530,21 @@ impl ChunkAggSeries for CategoricalChunked {
)
},
},
DataType::Categorical(_, _) => {
let av: AnyValue = self.min_categorical().into();
Scalar::new(DataType::String, av.into_static())
DataType::Categorical(r, _) => match self.min_categorical() {
None => Scalar::new(self.dtype().clone(), AnyValue::Null),
Some(v) => {
let RevMapping::Local(arr, _) = &**r.as_ref().unwrap() else {
unreachable!()
};
Scalar::new(
self.dtype().clone(),
AnyValue::CategoricalOwned(
v,
r.as_ref().unwrap().clone(),
SyncPtr::from_const(arr as *const _),
),
)
},
},
_ => unreachable!(),
}
Expand All @@ -555,9 +567,21 @@ impl ChunkAggSeries for CategoricalChunked {
)
},
},
DataType::Categorical(_, _) => {
let av: AnyValue = self.max_categorical().into();
Scalar::new(DataType::String, av.into_static())
DataType::Categorical(r, _) => match self.max_categorical() {
None => Scalar::new(self.dtype().clone(), AnyValue::Null),
Some(v) => {
let RevMapping::Local(arr, _) = &**r.as_ref().unwrap() else {
unreachable!()
};
Scalar::new(
self.dtype().clone(),
AnyValue::CategoricalOwned(
v,
r.as_ref().unwrap().clone(),
SyncPtr::from_const(arr as *const _),
),
)
},
},
_ => unreachable!(),
}
Expand Down
3 changes: 3 additions & 0 deletions crates/polars-core/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,7 @@ impl Series {
match self.dtype() {
DataType::Float32 => Ok(self.f32().unwrap().is_nan()),
DataType::Float64 => Ok(self.f64().unwrap().is_nan()),
DataType::Null => Ok(BooleanChunked::full_null(self.name().clone(), self.len())),
dt if dt.is_primitive_numeric() => {
let arr = BooleanArray::full(self.len(), false, ArrowDataType::Boolean)
.with_validity(self.rechunk_validity());
Expand Down Expand Up @@ -663,6 +664,7 @@ impl Series {
match self.dtype() {
DataType::Float32 => Ok(self.f32().unwrap().is_finite()),
DataType::Float64 => Ok(self.f64().unwrap().is_finite()),
DataType::Null => Ok(BooleanChunked::full_null(self.name().clone(), self.len())),
dt if dt.is_primitive_numeric() => {
let arr = BooleanArray::full(self.len(), true, ArrowDataType::Boolean)
.with_validity(self.rechunk_validity());
Expand All @@ -677,6 +679,7 @@ impl Series {
match self.dtype() {
DataType::Float32 => Ok(self.f32().unwrap().is_infinite()),
DataType::Float64 => Ok(self.f64().unwrap().is_infinite()),
DataType::Null => Ok(BooleanChunked::full_null(self.name().clone(), self.len())),
dt if dt.is_primitive_numeric() => {
let arr = BooleanArray::full(self.len(), false, ArrowDataType::Boolean)
.with_validity(self.rechunk_validity());
Expand Down
26 changes: 24 additions & 2 deletions crates/polars-lazy/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -928,9 +928,10 @@ impl LazyFrame {
Ok(())
}

/// Filter by some predicate expression.
/// Filter frame rows that match a predicate expression.
///
/// The expression must yield boolean values.
/// The expression must yield boolean values (note that rows where the
/// predicate resolves to `null` are *not* included in the resulting frame).
///
/// # Example
///
Expand All @@ -950,6 +951,27 @@ impl LazyFrame {
Self::from_logical_plan(lp, opt_state)
}

/// Drop every row for which the predicate expression evaluates to `true`.
///
/// The expression must yield boolean values. Rows where the predicate
/// resolves to `null` are kept: only an explicit `true` removes a row.
///
/// # Example
///
/// ```rust
/// use polars_core::prelude::*;
/// use polars_lazy::prelude::*;
///
/// fn example(df: DataFrame) -> LazyFrame {
///     df.lazy()
///         .remove(col("sepal_width").is_null())
///         .select([col("sepal_width"), col("sepal_length")])
/// }
/// ```
pub fn remove(self, predicate: Expr) -> Self {
    // Removal is expressed as a filter on the complement: keep each row whose
    // predicate is anything other than `true`.
    let keep_mask = predicate.neq_missing(lit(true));
    self.filter(keep_mask)
}

/// Select (and optionally rename, with [`alias`](crate::dsl::Expr::alias)) columns from the query.
///
/// Columns can be selected with [`col`];
Expand Down
5 changes: 2 additions & 3 deletions crates/polars-ops/src/frame/join/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ pub type ChunkJoinOptIds = Vec<NullableIdxSize>;
#[cfg(not(feature = "chunked_ids"))]
pub type ChunkJoinIds = Vec<IdxSize>;

use once_cell::sync::Lazy;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use strum_macros::IntoStaticStr;
Expand Down Expand Up @@ -138,8 +137,8 @@ impl JoinArgs {
}

pub fn suffix(&self) -> &PlSmallStr {
static DEFAULT: Lazy<PlSmallStr> = Lazy::new(|| PlSmallStr::from_static("_right"));
self.suffix.as_ref().unwrap_or(&*DEFAULT)
const DEFAULT: &PlSmallStr = &PlSmallStr::from_static("_right");
self.suffix.as_ref().unwrap_or(DEFAULT)
}
}

Expand Down
Loading

0 comments on commit ab0d7a5

Please sign in to comment.