Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into clean-to-string
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli committed Jul 24, 2024
2 parents a4304fa + 26a16e3 commit 6042da8
Show file tree
Hide file tree
Showing 91 changed files with 2,404 additions and 2,138 deletions.
51 changes: 25 additions & 26 deletions _typos.toml
Original file line number Diff line number Diff line change
@@ -1,34 +1,33 @@
[default]
extend-ignore-identifiers-re = [
"splitted.*",
[files]
extend-exclude = [
".git/",
"*.csv",
"*.gz",
"dists.dss",
]
ignore-hidden = false

[default.extend-identifiers]
bck = "bck"
Fo = "Fo"
ND = "ND"
ba = "ba"
nd = "nd"
opt_nd = "opt_nd"
ANDed = "ANDed"
ody = "ody"
[default]
extend-ignore-re = [
'"Theatre": \[.+\],',
]

[default.extend-words]
arange = "arange"
strat = "strat"
'"r0ot"' = "r0ot"
wee = "wee"
ser = "ser"
ND = "ND"
ody = "ody"

[type.csv]
extend-glob = ["*.csv"]
check-file = false
splitted = "splitted"
strat = "strat"

[type.gz]
extend-glob = ["*.gz"]
check-file = false
[type.py.extend-identifiers]
ba = "ba"
ody = "ody"

[files]
extend-exclude = ["_typos.toml", "dists.dss"]
[type.rust.extend-identifiers]
ANDed = "ANDed"
bck = "bck"
Fo = "Fo"
ND = "ND"
nd = "nd"
NDJson = "NDJson"
NDJsonReadOptions = "NDJsonReadOptions"
opt_nd = "opt_nd"
19 changes: 19 additions & 0 deletions crates/polars-arrow/src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use crate::array::dictionary::typed_iterator::{
pub unsafe trait DictionaryKey: NativeType + TryInto<usize> + TryFrom<usize> + Hash {
/// The corresponding [`IntegerType`] of this key
const KEY_TYPE: IntegerType;
const MAX_USIZE_VALUE: usize;

/// Represents this key as a `usize`.
///
Expand All @@ -50,6 +51,16 @@ pub unsafe trait DictionaryKey: NativeType + TryInto<usize> + TryFrom<usize> + H
}
}

/// Create a key from a `usize` without checking bounds.
///
/// # Safety
/// The caller _must_ have checked that the value can be created from a `usize`.
#[inline]
unsafe fn from_usize_unchecked(value: usize) -> Self {
    // In debug builds, still verify the caller's claim that the conversion succeeds.
    debug_assert!(Self::try_from(value).is_ok());
    // SAFETY: the caller guarantees `value` is representable as `Self`,
    // so `try_from` cannot return `Err` here.
    unsafe { Self::try_from(value).unwrap_unchecked() }
}

/// If the key type always can be converted to `usize`.
fn always_fits_usize() -> bool {
false
Expand All @@ -58,39 +69,47 @@ pub unsafe trait DictionaryKey: NativeType + TryInto<usize> + TryFrom<usize> + H

// `i8` dictionary keys: the largest key value representable as `usize` is `i8::MAX`.
unsafe impl DictionaryKey for i8 {
const KEY_TYPE: IntegerType = IntegerType::Int8;
const MAX_USIZE_VALUE: usize = i8::MAX as usize;
}
// `i16` dictionary keys: the largest key value representable as `usize` is `i16::MAX`.
unsafe impl DictionaryKey for i16 {
const KEY_TYPE: IntegerType = IntegerType::Int16;
const MAX_USIZE_VALUE: usize = i16::MAX as usize;
}
// `i32` dictionary keys: the largest key value representable as `usize` is `i32::MAX`.
unsafe impl DictionaryKey for i32 {
const KEY_TYPE: IntegerType = IntegerType::Int32;
const MAX_USIZE_VALUE: usize = i32::MAX as usize;
}
// `i64` dictionary keys: the largest key value representable as `usize` is `i64::MAX`.
// NOTE(review): `i64::MAX as usize` truncates on 32-bit targets — presumably intentional
// (a usize can never exceed it there anyway); confirm against 32-bit support policy.
unsafe impl DictionaryKey for i64 {
const KEY_TYPE: IntegerType = IntegerType::Int64;
const MAX_USIZE_VALUE: usize = i64::MAX as usize;
}
// `u8` dictionary keys: every value fits in `usize`, so `always_fits_usize`
// reports `true` and the conversion never needs a bounds check.
unsafe impl DictionaryKey for u8 {
const KEY_TYPE: IntegerType = IntegerType::UInt8;
const MAX_USIZE_VALUE: usize = u8::MAX as usize;

fn always_fits_usize() -> bool {
true
}
}
// `u16` dictionary keys: every value fits in `usize`, so `always_fits_usize`
// reports `true` and the conversion never needs a bounds check.
unsafe impl DictionaryKey for u16 {
const KEY_TYPE: IntegerType = IntegerType::UInt16;
const MAX_USIZE_VALUE: usize = u16::MAX as usize;

fn always_fits_usize() -> bool {
true
}
}
// `u32` dictionary keys: every value fits in `usize` (usize is at least 32 bits
// on supported targets — TODO confirm 16-bit targets are unsupported), so
// `always_fits_usize` reports `true`.
unsafe impl DictionaryKey for u32 {
const KEY_TYPE: IntegerType = IntegerType::UInt32;
const MAX_USIZE_VALUE: usize = u32::MAX as usize;

fn always_fits_usize() -> bool {
true
}
}
unsafe impl DictionaryKey for u64 {
const KEY_TYPE: IntegerType = IntegerType::UInt64;
const MAX_USIZE_VALUE: usize = u64::MAX as usize;

#[cfg(target_pointer_width = "64")]
fn always_fits_usize() -> bool {
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/logical/decimal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ impl DecimalChunked {
}
}

pub(crate) fn to_scale(&self, scale: usize) -> PolarsResult<Cow<'_, Self>> {
pub fn to_scale(&self, scale: usize) -> PolarsResult<Cow<'_, Self>> {
if self.scale() == scale {
return Ok(Cow::Borrowed(self));
}
Expand Down
3 changes: 3 additions & 0 deletions crates/polars-core/src/chunked_array/logical/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ impl<K: PolarsDataType, T: PolarsDataType> Logical<K, T>
where
Self: LogicalType,
{
/// Get a reference to the underlying `ChunkedArray` backing this logical type.
pub fn physical(&self) -> &ChunkedArray<T> {
&self.0
}
pub fn field(&self) -> Field {
let name = self.0.ref_field().name();
Field::new(name, LogicalType::dtype(self).clone())
Expand Down
17 changes: 17 additions & 0 deletions crates/polars-core/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,13 +195,30 @@ impl Series {
// TODO! this probably can now be removed, now we don't have special case for structs.
/// Return a new `Series` containing only chunk `i` of `self`.
///
/// The sortedness (`SORTED_ASC`/`SORTED_DSC`) and `FAST_EXPLODE_LIST` flags
/// of `self` are carried over to the result; any other metadata flags are
/// dropped (the new flag set starts from `MetadataFlags::empty()`).
///
/// # Panics
/// Panics if `i` is out of bounds for `self.chunks()` (slice indexing).
pub fn select_chunk(&self, i: usize) -> Self {
let mut new = self.clear();
let flags = self.get_flags();

// Copy over only the flags that remain valid when restricting to a single
// chunk: sortedness in either direction and fast-explode-list.
let mut new_flags = MetadataFlags::empty();
new_flags.set(
MetadataFlags::SORTED_ASC,
flags.contains(MetadataFlags::SORTED_ASC),
);
new_flags.set(
MetadataFlags::SORTED_DSC,
flags.contains(MetadataFlags::SORTED_DSC),
);
new_flags.set(
MetadataFlags::FAST_EXPLODE_LIST,
flags.contains(MetadataFlags::FAST_EXPLODE_LIST),
);

// Assign mut so we go through arc only once.
let mut_new = new._get_inner_mut();
// Replace the (cleared) chunk list with just the selected chunk, then
// recompute the cached length before reapplying the flags.
let chunks = unsafe { mut_new.chunks_mut() };
let chunk = self.chunks()[i].clone();
chunks.clear();
chunks.push(chunk);
mut_new.compute_len();
mut_new._set_flags(new_flags);
new
}

Expand Down
5 changes: 1 addition & 4 deletions crates/polars-io/src/parquet/read/mmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,7 @@ pub(super) fn to_deserializer<'a>(
columns: Vec<(&ColumnChunkMetaData, MemSlice)>,
field: Field,
num_rows: usize,
chunk_size: Option<usize>,
) -> PolarsResult<ArrayIter<'a>> {
let chunk_size = chunk_size.unwrap_or(usize::MAX).min(num_rows);

let (columns, types): (Vec<_>, Vec<_>) = columns
.into_iter()
.map(|(column_meta, chunk)| {
Expand All @@ -90,5 +87,5 @@ pub(super) fn to_deserializer<'a>(
})
.unzip();

column_iter_to_arrays(columns, types, field, Some(chunk_size), num_rows)
column_iter_to_arrays(columns, types, field, num_rows)
}
32 changes: 4 additions & 28 deletions crates/polars-io/src/parquet/read/read_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ fn column_idx_to_series(
remaining_rows: usize,
file_schema: &ArrowSchema,
store: &mmap::ColumnStore,
chunk_size: usize,
) -> PolarsResult<Series> {
let field = &file_schema.fields[column_i];

Expand All @@ -67,7 +66,7 @@ fn column_idx_to_series(
}

let columns = mmap_columns(store, md.columns(), &field.name);
let iter = mmap::to_deserializer(columns, field.clone(), remaining_rows, Some(chunk_size))?;
let iter = mmap::to_deserializer(columns, field.clone(), remaining_rows)?;

let mut series = if remaining_rows < md.num_rows() {
array_iter_to_series(iter, field, Some(remaining_rows))
Expand Down Expand Up @@ -237,35 +236,20 @@ fn rg_to_dfs_optionally_par_over_columns(
}

let projection_height = (*remaining_rows).min(md.num_rows());
let chunk_size = md.num_rows();
let columns = if let ParallelStrategy::Columns = parallel {
POOL.install(|| {
projection
.par_iter()
.map(|column_i| {
column_idx_to_series(
*column_i,
md,
projection_height,
schema,
store,
chunk_size,
)
column_idx_to_series(*column_i, md, projection_height, schema, store)
})
.collect::<PolarsResult<Vec<_>>>()
})?
} else {
projection
.iter()
.map(|column_i| {
column_idx_to_series(
*column_i,
md,
projection_height,
schema,
store,
chunk_size,
)
column_idx_to_series(*column_i, md, projection_height, schema, store)
})
.collect::<PolarsResult<Vec<_>>>()?
};
Expand Down Expand Up @@ -349,18 +333,10 @@ fn rg_to_dfs_par_over_rg(
assert!(std::env::var("POLARS_PANIC_IF_PARQUET_PARSED").is_err())
}

let chunk_size = md.num_rows();
let columns = projection
.iter()
.map(|column_i| {
column_idx_to_series(
*column_i,
md,
projection_height,
schema,
store,
chunk_size,
)
column_idx_to_series(*column_i, md, projection_height, schema, store)
})
.collect::<PolarsResult<Vec<_>>>()?;

Expand Down
12 changes: 4 additions & 8 deletions crates/polars-io/src/path_utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ pub fn expand_paths_hive(
if is_cloud || { cfg!(not(target_family = "windows")) && config::force_async() } {
#[cfg(feature = "cloud")]
{
use polars_utils::_limit_path_len_io_err;

use crate::cloud::object_path_from_string;

if first_path.starts_with("hf://") {
Expand Down Expand Up @@ -199,14 +201,8 @@ pub fn expand_paths_hive(
// indistinguishable from an empty directory.
let path = PathBuf::from(path);
if !path.is_dir() {
path.metadata().map_err(|err| {
let msg =
Some(format!("{}: {}", err, path.to_str().unwrap()).into());
PolarsError::IO {
error: err.into(),
msg,
}
})?;
path.metadata()
.map_err(|err| _limit_path_len_io_err(&path, err))?;
}
}

Expand Down
5 changes: 1 addition & 4 deletions crates/polars-io/src/utils/other.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,7 @@ pub(crate) fn update_row_counts3(dfs: &mut [DataFrame], heights: &[IdxSize], off
}

#[cfg(feature = "json")]
pub(crate) fn overwrite_schema(
schema: &mut Schema,
overwriting_schema: &Schema,
) -> PolarsResult<()> {
pub fn overwrite_schema(schema: &mut Schema, overwriting_schema: &Schema) -> PolarsResult<()> {
for (k, value) in overwriting_schema.iter() {
*schema.try_get_mut(k)? = value.clone();
}
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-lazy/src/prelude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ pub use polars_ops::prelude::{JoinArgs, JoinType, JoinValidation};
#[cfg(feature = "rank")]
pub use polars_ops::prelude::{RankMethod, RankOptions};
#[cfg(feature = "polars_cloud")]
pub use polars_plan::client::assert_cloud_eligible;
pub use polars_plan::client::prepare_cloud_plan;
pub use polars_plan::plans::{
AnonymousScan, AnonymousScanArgs, AnonymousScanOptions, DslPlan, Literal, LiteralValue, Null,
NULL,
Expand Down
10 changes: 10 additions & 0 deletions crates/polars-lazy/src/scan/ndjson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pub struct LazyJsonLineReader {
pub(crate) low_memory: bool,
pub(crate) rechunk: bool,
pub(crate) schema: Option<SchemaRef>,
pub(crate) schema_overwrite: Option<SchemaRef>,
pub(crate) row_index: Option<RowIndex>,
pub(crate) infer_schema_length: Option<NonZeroUsize>,
pub(crate) n_rows: Option<usize>,
Expand All @@ -38,6 +39,7 @@ impl LazyJsonLineReader {
low_memory: false,
rechunk: false,
schema: None,
schema_overwrite: None,
row_index: None,
infer_schema_length: NonZeroUsize::new(100),
ignore_errors: false,
Expand Down Expand Up @@ -82,6 +84,13 @@ impl LazyJsonLineReader {
self
}

/// Set a schema to overwrite the inferred schema's fields
/// (stored as `schema_overwrite`; pass `None` to clear it).
#[must_use]
pub fn with_schema_overwrite(mut self, schema_overwrite: Option<SchemaRef>) -> Self {
self.schema_overwrite = schema_overwrite;
self
}

/// Reduce memory usage at the expense of performance
#[must_use]
pub fn low_memory(mut self, toggle: bool) -> Self {
Expand Down Expand Up @@ -129,6 +138,7 @@ impl LazyFileListReader for LazyJsonLineReader {
low_memory: self.low_memory,
ignore_errors: self.ignore_errors,
schema: self.schema,
schema_overwrite: self.schema_overwrite,
};

let scan_type = FileScan::NDJson {
Expand Down
20 changes: 15 additions & 5 deletions crates/polars-ops/src/series/ops/is_in.rs
Original file line number Diff line number Diff line change
Expand Up @@ -716,18 +716,28 @@ pub fn is_in(s: &Series, other: &Series) -> PolarsResult<BooleanChunked> {
let ca = s.bool().unwrap();
is_in_boolean(ca, other)
},
DataType::Null => {
let series_bool = s.cast(&DataType::Boolean)?;
let ca = series_bool.bool().unwrap();
Ok(ca.clone())
},
#[cfg(feature = "dtype-decimal")]
DataType::Decimal(_, _) => {
let s = s.decimal()?;
let other = other.decimal()?;
let scale = s.scale().max(other.scale());
let s = s.to_scale(scale)?;
let other = other.to_scale(scale)?.into_owned().into_series();

is_in_numeric(s.physical(), &other)
},
dt if dt.to_physical().is_numeric() => {
let s = s.to_physical_repr();
with_match_physical_numeric_polars_type!(s.dtype(), |$T| {
let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
is_in_numeric(ca, other)
})
},
DataType::Null => {
let series_bool = s.cast(&DataType::Boolean)?;
let ca = series_bool.bool().unwrap();
Ok(ca.clone())
},
dt => polars_bail!(opq = is_in, dt),
}
}
Loading

0 comments on commit 6042da8

Please sign in to comment.