From 7031faab540417ab39701091d16aa37ff1d58610 Mon Sep 17 00:00:00 2001
From: Andrew Duffy
Date: Wed, 15 Jan 2025 17:18:28 -0500
Subject: [PATCH] dear god

---
 bench-vortex/benches/bytes_at.rs | 4 +-
 bench-vortex/src/bin/notimplemented.rs | 3 +-
 bench-vortex/src/clickbench.rs | 4 +-
 bench-vortex/src/data_downloads.rs | 3 +-
 bench-vortex/src/lib.rs | 2 +-
 bench-vortex/src/reader.rs | 2 +-
 bench-vortex/src/tpch/mod.rs | 2 +-
 bench-vortex/src/vortex_utils.rs | 5 +-
 encodings/alp/src/alp/array.rs | 21 +-
 encodings/alp/src/alp_rd/array.rs | 34 +--
 encodings/alp/src/alp_rd/mod.rs | 6 +-
 encodings/bytebool/src/array.rs | 10 +-
 encodings/datetime-parts/src/array.rs | 24 +-
 .../datetime-parts/src/compute/filter.rs | 3 +-
 encodings/datetime-parts/src/compute/mod.rs | 13 +-
 encodings/datetime-parts/src/compute/take.rs | 3 +-
 encodings/dict/src/array.rs | 14 +-
 encodings/dict/src/compress.rs | 17 +-
 encodings/fastlanes/src/bitpacking/mod.rs | 6 +-
 encodings/fastlanes/src/delta/mod.rs | 3 +-
 encodings/fastlanes/src/for/compress.rs | 12 +-
 encodings/fastlanes/src/for/compute/mod.rs | 3 +-
 encodings/fastlanes/src/for/mod.rs | 10 +-
 encodings/fsst/src/array.rs | 35 ++-
 encodings/fsst/src/canonical.rs | 3 +-
 encodings/fsst/src/compress.rs | 3 +-
 encodings/fsst/src/compute/compare.rs | 2 +-
 encodings/fsst/src/compute/mod.rs | 9 +-
 encodings/roaring/src/boolean/mod.rs | 7 +-
 encodings/roaring/src/integer/mod.rs | 8 +-
 encodings/runend-bool/src/array.rs | 21 +-
 encodings/runend/src/array.rs | 17 +-
 encodings/runend/src/compress.rs | 3 +-
 encodings/runend/src/compute/mod.rs | 4 +-
 encodings/zigzag/src/array.rs | 17 +-
 fuzz/src/filter.rs | 5 +-
 fuzz/src/search_sorted.rs | 4 +-
 fuzz/src/slice.rs | 10 +-
 fuzz/src/sort.rs | 5 +-
 fuzz/src/take.rs | 5 +-
 pyvortex/demo/repro_compress.py | 20 ++
 pyvortex/demo/repro_decompress.py | 25 ++
 pyvortex/demo/users.py | 62 +++++
 pyvortex/src/array.rs | 3 +-
 pyvortex/src/encode.rs | 8 +-
 .../src/array/bool/compute/fill_null.rs | 2 +-
 vortex-array/src/array/bool/mod.rs | 7 +-
 vortex-array/src/array/chunked/canonical.rs | 19 +-
 .../src/array/chunked/compute/boolean.rs | 17 +-
 .../src/array/chunked/compute/compare.rs | 12 +-
 .../src/array/chunked/compute/fill_null.rs | 12 +-
 .../src/array/chunked/compute/filter.rs | 6 +-
 vortex-array/src/array/chunked/compute/mod.rs | 15 +-
 .../src/array/chunked/compute/scalar_at.rs | 10 +-
 .../src/array/chunked/compute/slice.rs | 9 +-
 vortex-array/src/array/chunked/mod.rs | 31 ++-
 vortex-array/src/array/chunked/variants.rs | 8 +-
 vortex-array/src/array/constant/canonical.rs | 2 +-
 vortex-array/src/array/constant/mod.rs | 10 +-
 vortex-array/src/array/datetime/mod.rs | 8 +-
 vortex-array/src/array/extension/mod.rs | 11 +-
 vortex-array/src/array/list/compute/mod.rs | 2 +-
 vortex-array/src/array/list/mod.rs | 16 +-
 vortex-array/src/array/null/mod.rs | 5 +-
 vortex-array/src/array/primitive/mod.rs | 7 +-
 vortex-array/src/array/sparse/canonical.rs | 13 +-
 vortex-array/src/array/sparse/mod.rs | 8 +-
 vortex-array/src/array/struct_/compute.rs | 3 +-
 vortex-array/src/array/struct_/mod.rs | 23 +-
 vortex-array/src/array/varbin/arrow.rs | 2 +-
 vortex-array/src/array/varbin/builder.rs | 12 +-
 vortex-array/src/array/varbin/canonical.rs | 4 +-
 .../src/array/varbin/compute/filter.rs | 10 +-
 vortex-array/src/array/varbin/compute/take.rs | 3 +-
 vortex-array/src/array/varbin/mod.rs | 15 +-
 .../src/array/varbinview/compute/mod.rs | 9 +-
 vortex-array/src/array/varbinview/mod.rs | 8 +-
 vortex-array/src/arrow/array.rs | 5 +-
 vortex-array/src/builders/struct_.rs | 2 +-
 vortex-array/src/compute/binary_numeric.rs | 9 +-
 vortex-array/src/compute/boolean.rs | 4 +-
 vortex-array/src/compute/cast.rs | 5 +-
 vortex-array/src/compute/compare.rs | 2 +-
 vortex-array/src/compute/fill_null.rs | 2 +-
 vortex-array/src/compute/filter.rs | 2 +-
 vortex-array/src/compute/invert.rs | 2 +-
 vortex-array/src/compute/like.rs | 6 +-
 vortex-array/src/compute/scalar_at.rs | 5 +-
 vortex-array/src/data/mod.rs | 18 +-
 vortex-array/src/data/owned.rs | 4 +-
 vortex-array/src/data/statistics.rs | 6 +-
 vortex-array/src/data/viewed.rs | 6 +-
 vortex-array/src/dtypes.rs | 236 ++++++++++++++++++
 vortex-array/src/iter/ext.rs | 5 +-
 vortex-array/src/lib.rs | 5 +-
 vortex-array/src/macros.rs | 2 +-
 vortex-array/src/parts.rs | 4 +-
 vortex-array/src/patches.rs | 10 +-
 vortex-array/src/stream/ext.rs | 4 +-
 vortex-array/src/validity.rs | 7 +-
 vortex-array/src/variants.rs | 27 +-
 vortex-dtype/src/extension.rs | 1 +
 vortex-dtype/src/ptype.rs | 9 +
 vortex-expr/src/get_item.rs | 2 +-
 vortex-expr/src/lib.rs | 5 +-
 vortex-file/src/read/stream.rs | 3 +-
 vortex-file/src/tests.rs | 28 ++-
 vortex-ipc/src/iterator.rs | 4 +-
 vortex-ipc/src/messages/decoder.rs | 2 +-
 vortex-ipc/src/stream.rs | 4 +-
 .../src/layouts/chunked/stats_table.rs | 2 +-
 vortex-layout/src/layouts/chunked/writer.rs | 6 +-
 vortex-layout/src/layouts/flat/eval_expr.rs | 9 +-
 .../src/compressors/date_time_parts.rs | 3 +-
 .../src/compressors/fsst.rs | 5 +-
 .../src/compressors/roaring_bool.rs | 2 +-
 .../src/compressors/struct_.rs | 4 +-
 vortex-scan/src/lib.rs | 3 +-
 118 files changed, 920 insertions(+), 349 deletions(-)
 create mode 100644 pyvortex/demo/repro_compress.py
 create mode 100644 pyvortex/demo/repro_decompress.py
 create mode 100644 pyvortex/demo/users.py
 create mode 100644 vortex-array/src/dtypes.rs

diff --git a/bench-vortex/benches/bytes_at.rs b/bench-vortex/benches/bytes_at.rs
index 86ee8d1fcc..28d8868911 100644
--- a/bench-vortex/benches/bytes_at.rs
+++ b/bench-vortex/benches/bytes_at.rs
@@ -7,7 +7,7 @@ use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
 use parquet::data_type::AsBytes;
 use vortex::array::{VarBinArray, VarBinViewArray};
 use vortex::buffer::{buffer, ByteBuffer};
-use vortex::dtype::{DType, Nullability};
+use vortex::dtypes::DTYPE_STRING_NONNULL;
 use vortex::ipc::iterator::{ArrayIteratorIPC, SyncIPCReader};
 use vortex::iter::ArrayIteratorExt;
 use vortex::validity::Validity;
@@ -17,7 +17,7 @@ fn array_data_fixture() -> VarBinArray {
     VarBinArray::try_new(
         buffer![0i32, 5i32, 10i32, 15i32, 20i32].into_array(),
         ByteBuffer::copy_from(b"helloworldhelloworld".as_bytes()),
-        DType::Utf8(Nullability::NonNullable),
+        DTYPE_STRING_NONNULL.clone(),
         Validity::NonNullable,
     )
     .unwrap()
diff --git a/bench-vortex/src/bin/notimplemented.rs b/bench-vortex/src/bin/notimplemented.rs
index f4bde65faa..ec1d1dc6bb 100644
--- a/bench-vortex/src/bin/notimplemented.rs
+++ b/bench-vortex/src/bin/notimplemented.rs
@@ -14,6 +14,7 @@ use vortex::array::{
 use vortex::buffer::buffer;
 use vortex::datetime_dtype::{TemporalMetadata, TimeUnit, TIME_ID};
 use vortex::dtype::{DType, ExtDType, Nullability, PType};
+use vortex::dtypes::DTYPE_BOOL_NONNULL;
 use vortex::encodings::alp::{ALPArray, Exponents, RDEncoder};
 use vortex::encodings::bytebool::ByteBoolArray;
 use vortex::encodings::datetime_parts::DateTimePartsArray;
@@ -79,7 +80,7 @@ fn enc_impls() -> Vec {
             BoolArray::from_iter([false]).into_array(),
             BoolArray::from_iter([true]).into_array(),
         ],
-        DType::Bool(Nullability::NonNullable),
+        DTYPE_BOOL_NONNULL.clone(),
     )
     .unwrap()
     .into_array(),
diff --git a/bench-vortex/src/clickbench.rs b/bench-vortex/src/clickbench.rs
index fa5ab090a6..1f95bd5d29 100644
--- a/bench-vortex/src/clickbench.rs
+++ b/bench-vortex/src/clickbench.rs
@@ -202,7 +202,9 @@ pub async fn register_vortex_files(
                 let name: Arc = field.name().as_str().into();
                 let dtype = types_map[&name].clone();
                 let chunks = arrays_map.remove(&name).unwrap();
-                let chunked_child = ChunkedArray::try_new(chunks, dtype).unwrap();
+                // TODO(aduffy): fix extra clone
+                let chunked_child =
+                    ChunkedArray::try_new(chunks, Arc::new(dtype)).unwrap();
 
                 (name, chunked_child.into_array())
             })
diff --git a/bench-vortex/src/data_downloads.rs b/bench-vortex/src/data_downloads.rs
index a040652b1d..f473b7dfd1 100644
--- a/bench-vortex/src/data_downloads.rs
+++ b/bench-vortex/src/data_downloads.rs
@@ -3,6 +3,7 @@ use std::fs::File;
 use std::future::Future;
 use std::io::{Read, Write};
 use std::path::PathBuf;
+use std::sync::Arc;
 
 use arrow_array::RecordBatchReader;
 use bzip2::read::BzDecoder;
@@ -49,7 +50,7 @@ pub fn data_vortex_uncompressed(fname_out: &str, downloaded_data: PathBuf) -> Pa
             .into_iter()
             .map(|batch_result| ArrayData::try_from(batch_result.unwrap()).unwrap())
             .collect(),
-        dtype,
+        Arc::new(dtype),
     )
     .unwrap()
     .into_array();
diff --git a/bench-vortex/src/lib.rs b/bench-vortex/src/lib.rs
index e8d057f60e..3341ab7dcb 100644
--- a/bench-vortex/src/lib.rs
+++ b/bench-vortex/src/lib.rs
@@ -198,7 +198,7 @@ pub fn fetch_taxi_data() -> ArrayData {
             .map(ArrayData::try_from)
             .map(Result::unwrap)
             .collect_vec(),
-        DType::from_arrow(schema),
+        Arc::new(DType::from_arrow(schema)),
     )
     .unwrap()
     .into_array()
diff --git a/bench-vortex/src/reader.rs b/bench-vortex/src/reader.rs
index 43e5132d16..145265c47a 100644
--- a/bench-vortex/src/reader.rs
+++ b/bench-vortex/src/reader.rs
@@ -77,7 +77,7 @@ pub fn read_parquet_to_vortex>(parquet_path: P) -> VortexResult
         .collect::>>()?;
-    ChunkedArray::try_new(chunks, dtype)
+    ChunkedArray::try_new(chunks, Arc::new(dtype))
 }
 
 pub fn compress_parquet_to_vortex(parquet_path: &Path) -> VortexResult {
diff --git a/bench-vortex/src/tpch/mod.rs b/bench-vortex/src/tpch/mod.rs
index 07d6ec3893..225b97d79e 100644
--- a/bench-vortex/src/tpch/mod.rs
+++ b/bench-vortex/src/tpch/mod.rs
@@ -249,7 +249,7 @@ async fn register_vortex_file(
                 let name: Arc = field.name().as_str().into();
                 let dtype = types_map[&name].clone();
                 let chunks = arrays_map.remove(&name).unwrap();
-                let mut chunked_child = ChunkedArray::try_new(chunks, dtype).unwrap();
+                let mut chunked_child = ChunkedArray::try_new(chunks, Arc::new(dtype)).unwrap();
                 if !enable_compression {
                     chunked_child = chunked_child
                         .rechunk(TARGET_BLOCK_BYTESIZE, TARGET_BLOCK_SIZE)
diff --git a/bench-vortex/src/vortex_utils.rs b/bench-vortex/src/vortex_utils.rs
index 1421175628..3f49a786f3 100644
--- a/bench-vortex/src/vortex_utils.rs
+++ b/bench-vortex/src/vortex_utils.rs
@@ -16,7 +16,7 @@ pub async fn vortex_chunk_sizes(path: PathBuf) -> VortexResult VortexResult,
 ) -> VortexResult {
-    let dtype = match encoded.dtype() {
-        DType::Primitive(PType::I32, nullability) => DType::Primitive(PType::F32, *nullability),
-        DType::Primitive(PType::I64, nullability) => DType::Primitive(PType::F64, *nullability),
+    let dtype = match encoded.dtype().as_ref() {
+        DType::Primitive(PType::I32, nullability) => primitive_dtype!(PType::F32, *nullability),
+        DType::Primitive(PType::I64, nullability) => primitive_dtype!(PType::F64, *nullability),
         d => vortex_bail!(MismatchedTypes: "int32 or int64", d),
     };
 
@@ -76,7 +77,7 @@ impl ALPArray {
     pub fn encoded(&self) -> ArrayData {
         self.as_ref()
-            .child(0, &self.encoded_dtype(), self.len())
+            .child(0, self.encoded_dtype(), self.len())
             .vortex_expect("Missing encoded child in ALPArray")
     }
 
@@ -100,13 +101,13 @@ impl ALPArray {
     }
 
     #[inline]
-    fn encoded_dtype(&self) -> DType {
-        match self.dtype() {
+    fn encoded_dtype(&self) -> &Arc {
+        match self.dtype().as_ref() {
             DType::Primitive(PType::F32, _) => {
-                DType::Primitive(PType::I32, self.dtype().nullability())
+                primitive_dtype_ref!(PType::I32, self.dtype().nullability())
             }
             DType::Primitive(PType::F64, _) => {
-                DType::Primitive(PType::I64, self.dtype().nullability())
+                primitive_dtype_ref!(PType::I64, self.dtype().nullability())
             }
             d => vortex_panic!(MismatchedTypes: "f32 or f64", d),
         }
diff --git a/encodings/alp/src/alp_rd/array.rs b/encodings/alp/src/alp_rd/array.rs
index 620574c951..5ded21e4ed 100644
--- a/encodings/alp/src/alp_rd/array.rs
+++ b/encodings/alp/src/alp_rd/array.rs
@@ -1,4 +1,5 @@
 use std::fmt::{Debug, Display};
+use std::sync::Arc;
 
 use serde::{Deserialize, Serialize};
 use vortex_array::array::PrimitiveArray;
@@ -8,7 +9,8 @@ use vortex_array::stats::{StatisticsVTable, StatsSet};
 use vortex_array::validity::{ArrayValidity, LogicalValidity, ValidityVTable};
 use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
 use vortex_array::{
-    impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoCanonical,
+    impl_encoding, primitive_dtype_ref, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical,
+    IntoCanonical,
 };
 use vortex_dtype::{DType, Nullability, PType};
 use vortex_error::{vortex_bail, VortexExpect, VortexResult};
@@ -34,7 +36,7 @@ impl Display for ALPRDMetadata {
 impl ALPRDArray {
     pub fn try_new(
-        dtype: DType,
+        dtype: Arc,
         left_parts: ArrayData,
         left_parts_dict: impl AsRef<[u16]>,
         right_parts: ArrayData,
@@ -123,27 +125,25 @@ impl ALPRDArray {
     /// The dtype of the left parts of the array.
     #[inline]
-    fn left_parts_dtype(&self) -> DType {
-        DType::Primitive(self.metadata().left_parts_ptype, self.dtype().nullability())
+    fn left_parts_dtype(&self) -> &Arc {
+        primitive_dtype_ref!(self.metadata().left_parts_ptype, self.dtype().nullability())
     }
 
     /// The dtype of the right parts of the array.
     #[inline]
-    fn right_parts_dtype(&self) -> DType {
-        DType::Primitive(
-            if self.is_f32() {
-                PType::U32
-            } else {
-                PType::U64
-            },
-            Nullability::NonNullable,
-        )
+    fn right_parts_dtype(&self) -> &Arc {
+        let ptype = if self.is_f32() {
+            PType::U32
+        } else {
+            PType::U64
+        };
+        primitive_dtype_ref!(ptype, Nullability::NonNullable)
     }
 
     /// The dtype of the patches of the left parts of the array.
     #[inline]
-    fn left_parts_patches_dtype(&self) -> DType {
-        DType::Primitive(self.metadata().left_parts_ptype, Nullability::NonNullable)
+    fn left_parts_patches_dtype(&self) -> &Arc {
+        primitive_dtype_ref!(self.metadata().left_parts_ptype, Nullability::NonNullable)
     }
 
     /// The leftmost (most significant) bits of the floating point values stored in the array.
@@ -169,10 +169,10 @@ impl ALPRDArray {
             Patches::new(
                 self.len(),
                 self.as_ref()
-                    .child(2, &metadata.indices_dtype(), metadata.len())
+                    .child(2, metadata.indices_dtype(), metadata.len())
                     .vortex_expect("ALPRDArray: patch indices"),
                 self.as_ref()
-                    .child(3, &self.left_parts_patches_dtype(), metadata.len())
+                    .child(3, self.left_parts_patches_dtype(), metadata.len())
                     .vortex_expect("ALPRDArray: patch values"),
             )
         })
diff --git a/encodings/alp/src/alp_rd/mod.rs b/encodings/alp/src/alp_rd/mod.rs
index e03276dc35..774562d9f4 100644
--- a/encodings/alp/src/alp_rd/mod.rs
+++ b/encodings/alp/src/alp_rd/mod.rs
@@ -15,9 +15,9 @@ use itertools::Itertools;
 use num_traits::{Float, One, PrimInt};
 use vortex_array::aliases::hash_map::HashMap;
 use vortex_array::array::PrimitiveArray;
-use vortex_array::{ArrayDType, IntoArrayData, IntoArrayVariant};
+use vortex_array::{primitive_dtype, ArrayDType, IntoArrayData, IntoArrayVariant};
 use vortex_buffer::{Buffer, BufferMut};
-use vortex_dtype::{match_each_integer_ptype, DType, NativePType};
+use vortex_dtype::{match_each_integer_ptype, NativePType};
 use vortex_error::{vortex_bail, VortexExpect, VortexResult, VortexUnwrap};
 use vortex_fastlanes::bitpack_encode_unchecked;
@@ -240,7 +240,7 @@ impl RDEncoder {
         });
 
         ALPRDArray::try_new(
-            DType::Primitive(T::PTYPE, packed_left.dtype().nullability()),
+            primitive_dtype!(T::PTYPE, packed_left.dtype().nullability()),
             packed_left,
             &self.codes,
             packed_right,
diff --git a/encodings/bytebool/src/array.rs b/encodings/bytebool/src/array.rs
index 12a9af7750..a86caedc21 100644
--- a/encodings/bytebool/src/array.rs
+++ b/encodings/bytebool/src/array.rs
@@ -4,14 +4,16 @@ use std::sync::Arc;
 use arrow_buffer::BooleanBuffer;
 use serde::{Deserialize, Serialize};
 use vortex_array::array::BoolArray;
+use vortex_array::dtypes::DTYPE_BOOL_NONNULL;
 use vortex_array::encoding::ids;
 use vortex_array::stats::StatsSet;
 use vortex_array::validity::{LogicalValidity, Validity, ValidityMetadata, ValidityVTable};
 use vortex_array::variants::{BoolArrayTrait, VariantsVTable};
 use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
-use vortex_array::{impl_encoding, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoCanonical};
+use vortex_array::{
+    bool_dtype, impl_encoding, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoCanonical,
+};
 use vortex_buffer::ByteBuffer;
-use vortex_dtype::DType;
 use vortex_error::{VortexExpect as _, VortexResult};
 
 impl_encoding!("vortex.bytebool", ids::BYTE_BOOL, ByteBool);
@@ -31,7 +33,7 @@ impl ByteBoolArray {
     pub fn validity(&self) -> Validity {
         self.metadata().validity.to_validity(|| {
             self.as_ref()
-                .child(0, &Validity::DTYPE, self.len())
+                .child(0, &DTYPE_BOOL_NONNULL, self.len())
                 .vortex_expect("ByteBoolArray: accessing validity child")
         })
     }
@@ -41,7 +43,7 @@ impl ByteBoolArray {
 
         ArrayData::try_new_owned(
             &ByteBoolEncoding,
-            DType::Bool(validity.nullability()),
+            bool_dtype!(validity.nullability()),
             length,
             Arc::new(ByteBoolMetadata {
                 validity: validity.to_metadata(length)?,
diff --git a/encodings/datetime-parts/src/array.rs b/encodings/datetime-parts/src/array.rs
index 9ee15a1caa..ad92dbb655 100644
--- a/encodings/datetime-parts/src/array.rs
+++ b/encodings/datetime-parts/src/array.rs
@@ -1,4 +1,5 @@
 use std::fmt::{Debug, Display};
+use std::sync::Arc;
 
 use serde::{Deserialize, Serialize};
 use vortex_array::array::StructArray;
@@ -9,10 +10,10 @@ use vortex_array::validity::{ArrayValidity, LogicalValidity, Validity, ValidityV
 use vortex_array::variants::{ExtensionArrayTrait, VariantsVTable};
 use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
 use vortex_array::{
-    impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData,
-    IntoCanonical,
+    impl_encoding, primitive_dtype_ref, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical,
+    IntoArrayData, IntoCanonical,
 };
-use vortex_dtype::{DType, PType};
+use vortex_dtype::{DType, Nullability, PType};
 use vortex_error::{vortex_bail, VortexExpect as _, VortexResult, VortexUnwrap};
 
 use crate::compute::decode_to_temporal;
@@ -72,7 +73,8 @@ impl DateTimePartsArray {
         };
 
         Self::try_from_parts(
-            dtype,
+            // TODO(aduffy): fix cloning
+            Arc::new(dtype),
             length,
             metadata,
             [days, seconds, subsecond].into(),
@@ -84,7 +86,7 @@ impl DateTimePartsArray {
         self.as_ref()
             .child(
                 0,
-                &DType::Primitive(self.metadata().days_ptype, self.dtype().nullability()),
+                primitive_dtype_ref!(self.metadata().days_ptype, self.dtype().nullability()),
                 self.len(),
             )
             .vortex_expect("DatetimePartsArray missing days array")
     }
 
     pub fn seconds(&self) -> ArrayData {
         self.as_ref()
-            .child(1, &self.metadata().seconds_ptype.into(), self.len())
+            .child(
+                1,
+                primitive_dtype_ref!(self.metadata().seconds_ptype, Nullability::NonNullable),
+                self.len(),
+            )
             .vortex_expect("DatetimePartsArray missing seconds array")
     }
 
     pub fn subsecond(&self) -> ArrayData {
         self.as_ref()
-            .child(2, &self.metadata().subseconds_ptype.into(), self.len())
+            .child(
+                2,
+                primitive_dtype_ref!(self.metadata().subseconds_ptype, Nullability::NonNullable),
+                self.len(),
+            )
             .vortex_expect("DatetimePartsArray missing subsecond array")
     }
diff --git a/encodings/datetime-parts/src/compute/filter.rs b/encodings/datetime-parts/src/compute/filter.rs
index da768fc36a..a36e214928 100644
--- a/encodings/datetime-parts/src/compute/filter.rs
+++ b/encodings/datetime-parts/src/compute/filter.rs
@@ -7,7 +7,8 @@ use crate::{DateTimePartsArray, DateTimePartsEncoding};
 impl FilterFn for DateTimePartsEncoding {
     fn filter(&self, array: &DateTimePartsArray, mask: &FilterMask) -> VortexResult {
         Ok(DateTimePartsArray::try_new(
-            array.dtype().clone(),
+            // TODO(aduffy): fix cloning
+            array.dtype().as_ref().clone(),
             filter(array.days().as_ref(), mask)?,
             filter(array.seconds().as_ref(), mask)?,
             filter(array.subsecond().as_ref(), mask)?,
diff --git a/encodings/datetime-parts/src/compute/mod.rs b/encodings/datetime-parts/src/compute/mod.rs
index de3937c1e8..6b6398c178 100644
--- a/encodings/datetime-parts/src/compute/mod.rs
+++ b/encodings/datetime-parts/src/compute/mod.rs
@@ -1,6 +1,8 @@
 mod filter;
 mod take;
 
+use std::sync::Arc;
+
 use vortex_array::array::{PrimitiveArray, TemporalArray};
 use vortex_array::compute::{
     scalar_at, slice, try_cast, ComputeVTable, FilterFn, ScalarAtFn, SliceFn, TakeFn,
@@ -42,7 +44,8 @@ impl SliceFn for DateTimePartsEncoding {
         stop: usize,
     ) -> VortexResult {
         Ok(DateTimePartsArray::try_new(
-            array.dtype().clone(),
+            // TODO(aduffy): fix cloning
+            array.dtype().as_ref().clone(),
             slice(array.days(), start, stop)?,
             slice(array.seconds(), start, stop)?,
             slice(array.subsecond(), start, stop)?,
@@ -53,7 +56,7 @@ impl SliceFn for DateTimePartsEncoding {
 
 impl ScalarAtFn for DateTimePartsEncoding {
     fn scalar_at(&self, array: &DateTimePartsArray, index: usize) -> VortexResult {
-        let DType::Extension(ext) = array.dtype().clone() else {
+        let DType::Extension(ext) = array.dtype().as_ref() else {
             vortex_bail!(
                 "DateTimePartsArray must have extension dtype, found {}",
                 array.dtype()
             )
         };
@@ -66,7 +69,7 @@ impl ScalarAtFn for DateTimePartsEncoding {
         };
 
         if !array.is_valid(index) {
-            return Ok(Scalar::null(DType::Extension(ext)));
+            return Ok(Scalar::null(DType::Extension(Arc::clone(&ext))));
         }
 
         let divisor = match time_unit {
@@ -89,7 +92,7 @@ impl ScalarAtFn for DateTimePartsEncoding {
 
         let scalar = days * 86_400 * divisor + seconds * divisor + subseconds;
 
-        Ok(Scalar::extension(ext, Scalar::from(scalar)))
+        Ok(Scalar::extension(Arc::clone(ext), Scalar::from(scalar)))
     }
 }
 
@@ -97,7 +100,7 @@ impl ScalarAtFn for DateTimePartsEncoding {
 ///
 /// Enforces that the passed array is actually a [DateTimePartsArray] with proper metadata.
 pub fn decode_to_temporal(array: &DateTimePartsArray) -> VortexResult {
-    let DType::Extension(ext) = array.dtype().clone() else {
+    let DType::Extension(ext) = array.dtype().as_ref() else {
         vortex_bail!(ComputeError: "expected dtype to be DType::Extension variant")
     };
diff --git a/encodings/datetime-parts/src/compute/take.rs b/encodings/datetime-parts/src/compute/take.rs
index c9d0c68366..55beab9f05 100644
--- a/encodings/datetime-parts/src/compute/take.rs
+++ b/encodings/datetime-parts/src/compute/take.rs
@@ -7,7 +7,8 @@ use crate::{DateTimePartsArray, DateTimePartsEncoding};
 impl TakeFn for DateTimePartsEncoding {
     fn take(&self, array: &DateTimePartsArray, indices: &ArrayData) -> VortexResult {
         Ok(DateTimePartsArray::try_new(
-            array.dtype().clone(),
+            // TODO(aduffy): fix cloning
+            array.dtype().as_ref().clone(),
             take(array.days(), indices)?,
             take(array.seconds(), indices)?,
             take(array.subsecond(), indices)?,
diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs
index 1858409952..ee9abfeade 100644
--- a/encodings/dict/src/array.rs
+++ b/encodings/dict/src/array.rs
@@ -10,10 +10,10 @@ use vortex_array::validity::{ArrayValidity, LogicalValidity, ValidityVTable};
 use vortex_array::variants::PrimitiveArrayTrait;
 use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
 use vortex_array::{
-    impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData,
-    IntoArrayVariant, IntoCanonical,
+    impl_encoding, primitive_dtype_ref, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical,
+    IntoArrayData, IntoArrayVariant, IntoCanonical,
 };
-use vortex_dtype::{match_each_integer_ptype, DType, PType};
+use vortex_dtype::{match_each_integer_ptype, DType, Nullability, PType};
 use vortex_error::{vortex_bail, vortex_panic, VortexExpect as _, VortexResult};
 
 impl_encoding!("vortex.dict", ids::DICT, Dict);
@@ -51,7 +51,11 @@ impl DictArray {
     #[inline]
     pub fn codes(&self) -> ArrayData {
         self.as_ref()
-            .child(0, &DType::from(self.metadata().codes_ptype), self.len())
+            .child(
+                0,
+                primitive_dtype_ref!(self.metadata().codes_ptype, Nullability::NonNullable),
+                self.len(),
+            )
             .vortex_expect("DictArray is missing its codes child array")
     }
 
@@ -67,7 +71,7 @@ impl ArrayTrait for DictArray {}
 
 impl IntoCanonical for DictArray {
     fn into_canonical(self) -> VortexResult {
-        match self.dtype() {
+        match self.dtype().as_ref() {
             // NOTE: Utf8 and Binary will decompress into VarBinViewArray, which requires a full
             // decompression to construct the views child array.
             // For this case, it is *always* faster to decompress the values first and then create
diff --git a/encodings/dict/src/compress.rs b/encodings/dict/src/compress.rs
index 5b9029ff8a..a81416c038 100644
--- a/encodings/dict/src/compress.rs
+++ b/encodings/dict/src/compress.rs
@@ -1,4 +1,5 @@
 use std::hash::{BuildHasher, Hash, Hasher};
+use std::sync::Arc;
 
 use hashbrown::hash_map::Entry;
 use hashbrown::HashTable;
@@ -87,14 +88,16 @@ pub fn dict_encode_typed_primitive(
 /// Dictionary encode varbin array. Specializes for primitive byte arrays to avoid double copying
 pub fn dict_encode_varbin(array: &VarBinArray) -> (PrimitiveArray, VarBinArray) {
     array
-        .with_iterator(|iter| dict_encode_varbin_bytes(array.dtype().clone(), iter))
+        // TODO(aduffy): fix cloning
+        .with_iterator(|iter| dict_encode_varbin_bytes(array.dtype().as_ref().clone(), iter))
         .vortex_unwrap()
 }
 
 /// Dictionary encode a VarbinViewArray.
 pub fn dict_encode_varbinview(array: &VarBinViewArray) -> (PrimitiveArray, VarBinViewArray) {
     let (codes, values) = array
-        .with_iterator(|iter| dict_encode_varbin_bytes(array.dtype().clone(), iter))
+        // TODO(aduffy): fix cloning
+        .with_iterator(|iter| dict_encode_varbin_bytes(array.dtype().as_ref().clone(), iter))
         .vortex_unwrap();
     (
         codes,
@@ -153,8 +156,14 @@ fn dict_encode_varbin_bytes<'a, I: Iterator>>(
     let values_validity = dict_values_validity(dtype.is_nullable(), offsets.len() - 1);
     (
         PrimitiveArray::new(codes, Validity::NonNullable),
-        VarBinArray::try_new(offsets.into_array(), bytes.freeze(), dtype, values_validity)
-            .vortex_expect("Failed to create VarBinArray dictionary during encoding"),
+        // TODO(aduffy): fix cloning
+        VarBinArray::try_new(
+            offsets.into_array(),
+            bytes.freeze(),
+            Arc::new(dtype),
+            values_validity,
+        )
+        .vortex_expect("Failed to create VarBinArray dictionary during encoding"),
     )
 }
diff --git a/encodings/fastlanes/src/bitpacking/mod.rs b/encodings/fastlanes/src/bitpacking/mod.rs
index 4be720ad0d..50c0573758 100644
--- a/encodings/fastlanes/src/bitpacking/mod.rs
+++ b/encodings/fastlanes/src/bitpacking/mod.rs
@@ -5,6 +5,7 @@ use ::serde::{Deserialize, Serialize};
 pub use compress::*;
 use fastlanes::BitPacking;
 use vortex_array::array::PrimitiveArray;
+use vortex_array::dtypes::DTYPE_BOOL_NONNULL;
 use vortex_array::encoding::ids;
 use vortex_array::patches::{Patches, PatchesMetadata};
 use vortex_array::stats::{StatisticsVTable, StatsSet};
@@ -147,7 +148,8 @@ impl BitPackedArray {
 
         ArrayData::try_new_owned(
             &BitPackedEncoding,
-            dtype,
+            // TODO(aduffy): fix cloning
+            Arc::new(dtype),
            length,
             Arc::new(metadata),
             [packed].into(),
@@ -215,7 +217,7 @@ impl BitPackedArray {
         };
         self.metadata().validity.to_validity(|| {
             self.as_ref()
-                .child(validity_child_idx, &Validity::DTYPE, self.len())
+                .child(validity_child_idx, &DTYPE_BOOL_NONNULL, self.len())
                 .vortex_expect("BitPackedArray: validity child")
         })
     }
diff --git a/encodings/fastlanes/src/delta/mod.rs b/encodings/fastlanes/src/delta/mod.rs
index 1f6a871753..8025f6e9b5 100644
--- a/encodings/fastlanes/src/delta/mod.rs
+++ b/encodings/fastlanes/src/delta/mod.rs
@@ -3,6 +3,7 @@ use std::fmt::{Debug, Display};
 pub use compress::*;
 use serde::{Deserialize, Serialize};
 use vortex_array::array::PrimitiveArray;
+use vortex_array::dtypes::DTYPE_BOOL_NONNULL;
 use vortex_array::encoding::ids;
 use vortex_array::stats::{StatisticsVTable, StatsSet};
 use vortex_array::validity::{LogicalValidity, Validity, ValidityMetadata, ValidityVTable};
@@ -205,7 +206,7 @@ impl DeltaArray {
     pub fn validity(&self) -> Validity {
         self.metadata().validity.to_validity(|| {
             self.as_ref()
-                .child(2, &Validity::DTYPE, self.len())
+                .child(2, &DTYPE_BOOL_NONNULL, self.len())
                 .vortex_expect("DeltaArray: validity child")
         })
     }
diff --git a/encodings/fastlanes/src/for/compress.rs b/encodings/fastlanes/src/for/compress.rs
index 6c9d6a87d6..b6f62b6bd0 100644
--- a/encodings/fastlanes/src/for/compress.rs
+++ b/encodings/fastlanes/src/for/compress.rs
@@ -192,20 +192,26 @@ mod test {
         );
         assert_eq!(
             scalar_at(&compressed, 1).unwrap(),
-            Scalar::null(array.dtype().clone())
+            // TODO(aduffy): fix cloning
+            Scalar::null(array.dtype().as_ref().clone())
         );
 
         let sparse = SparseArray::try_from(compressed.encoded()).unwrap();
         assert!(sparse.dtype().is_unsigned_int());
         assert!(sparse.statistics().to_set().into_iter().next().is_none());
-        assert_eq!(sparse.fill_scalar(), Scalar::null(sparse.dtype().clone()));
+        // TODO(aduffy): fix cloning
+        assert_eq!(
+            sparse.fill_scalar(),
+            Scalar::null(sparse.dtype().as_ref().clone())
+        );
         assert_eq!(
             scalar_at(&sparse, 0).unwrap(),
             Scalar::primitive(0u32, Nullability::Nullable)
         );
         assert_eq!(
             scalar_at(&sparse, 1).unwrap(),
-            Scalar::null(sparse.dtype().clone())
+            // TODO(aduffy): fix cloning
+            Scalar::null(sparse.dtype().as_ref().clone())
         );
     }
diff --git a/encodings/fastlanes/src/for/compute/mod.rs b/encodings/fastlanes/src/for/compute/mod.rs
index 1294069d47..3166cd59ad 100644
--- a/encodings/fastlanes/src/for/compute/mod.rs
+++ b/encodings/fastlanes/src/for/compute/mod.rs
@@ -81,7 +81,8 @@ impl ScalarAtFn for FoREncoding {
                 .typed_value::<$P>()
                 .vortex_expect("FoRArray Reference value cannot be null")))
             .map(|v| Scalar::primitive::<$P>(v, array.dtype().nullability()))
-            .unwrap_or_else(|| Scalar::null(array.dtype().clone()))
+            // TODO(aduffy): fix cloning
+            .unwrap_or_else(|| Scalar::null(array.dtype().as_ref().clone()))
         }))
     }
 }
diff --git a/encodings/fastlanes/src/for/mod.rs b/encodings/fastlanes/src/for/mod.rs
index c3d750c85a..c9127f7e2d 100644
--- a/encodings/fastlanes/src/for/mod.rs
+++ b/encodings/fastlanes/src/for/mod.rs
@@ -1,4 +1,5 @@
 use std::fmt::{Debug, Display};
+use std::sync::Arc;
 
 pub use compress::*;
 use serde::{Deserialize, Serialize};
@@ -8,9 +9,9 @@ use vortex_array::validity::{ArrayValidity, LogicalValidity, ValidityVTable};
 use vortex_array::variants::{PrimitiveArrayTrait, VariantsVTable};
 use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
 use vortex_array::{
-    impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoCanonical,
+    impl_encoding, primitive_dtype_ref, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical,
+    IntoCanonical,
 };
-use vortex_dtype::DType;
 use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};
 use vortex_scalar::{PValue, Scalar};
@@ -52,7 +53,8 @@ impl FoRArray {
             .vortex_expect("Reference value is non-null");
 
         Self::try_from_parts(
-            dtype,
+            // TODO(aduffy): fix cloning
+            Arc::new(dtype),
             child.len(),
             FoRMetadata { reference, shift },
             [child].into(),
@@ -63,7 +65,7 @@ impl FoRArray {
     #[inline]
     pub fn encoded(&self) -> ArrayData {
         let dtype = if self.ptype().is_signed_int() {
-            &DType::Primitive(self.ptype().to_unsigned(), self.dtype().nullability())
+            primitive_dtype_ref!(self.ptype().to_unsigned(), self.dtype().nullability())
         } else {
             self.dtype()
         };
diff --git a/encodings/fsst/src/array.rs b/encodings/fsst/src/array.rs
index 9382a2a7f0..260e27d6ea 100644
--- a/encodings/fsst/src/array.rs
+++ b/encodings/fsst/src/array.rs
@@ -1,4 +1,5 @@
 use std::fmt::{Debug, Display};
+use std::sync::Arc;
 
 use fsst::{Decompressor, Symbol};
 use serde::{Deserialize, Serialize};
@@ -8,7 +9,9 @@ use vortex_array::stats::{StatisticsVTable, StatsSet};
 use vortex_array::validity::{ArrayValidity, LogicalValidity, Validity, ValidityVTable};
 use vortex_array::variants::{BinaryArrayTrait, Utf8ArrayTrait, VariantsVTable};
 use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
-use vortex_array::{impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, IntoCanonical};
+use vortex_array::{
+    impl_encoding, primitive_dtype_ref, ArrayDType, ArrayData, ArrayLen, ArrayTrait, IntoCanonical,
+};
 use vortex_dtype::{DType, Nullability, PType};
 use vortex_error::{vortex_bail, VortexExpect, VortexResult};
@@ -47,11 +50,11 @@ impl FSSTArray {
         uncompressed_lengths: ArrayData,
     ) -> VortexResult {
         // Check: symbols must be a u64 array
-        if symbols.dtype() != &SYMBOLS_DTYPE {
+        if symbols.dtype().as_ref() != &SYMBOLS_DTYPE {
             vortex_bail!(InvalidArgument: "symbols array must be of type u64")
         }
 
-        if symbol_lengths.dtype() != &SYMBOL_LENS_DTYPE {
+        if symbol_lengths.dtype().as_ref() != &SYMBOL_LENS_DTYPE {
             vortex_bail!(InvalidArgument: "symbol_lengths array must be of type u8")
         }
@@ -80,7 +83,7 @@ impl FSSTArray {
         }
 
         // Check: strings must be a Binary array.
-        if !matches!(codes.dtype(), DType::Binary(_)) {
+        if !matches!(codes.dtype().as_ref(), DType::Binary(_)) {
             vortex_bail!(InvalidArgument: "codes array must be DType::Binary type");
         }
@@ -91,7 +94,8 @@ impl FSSTArray {
         let children = [symbols, symbol_lengths, codes, uncompressed_lengths].into();
 
         Self::try_from_parts(
-            dtype,
+            // TODO(aduffy): fix cloning
+            Arc::new(dtype),
             len,
             FSSTMetadata {
                 symbols_len,
@@ -106,21 +110,30 @@ impl FSSTArray {
     /// Access the symbol table array
     pub fn symbols(&self) -> ArrayData {
         self.as_ref()
-            .child(0, &SYMBOLS_DTYPE, self.metadata().symbols_len)
+            .child(
+                0,
+                primitive_dtype_ref!(PType::U64, Nullability::NonNullable),
+                self.metadata().symbols_len,
+            )
             .vortex_expect("FSSTArray symbols child")
     }
 
     /// Access the symbol table array
     pub fn symbol_lengths(&self) -> ArrayData {
         self.as_ref()
-            .child(1, &SYMBOL_LENS_DTYPE, self.metadata().symbols_len)
+            .child(
+                1,
+                primitive_dtype_ref!(PType::U8, Nullability::NonNullable),
+                self.metadata().symbols_len,
+            )
             .vortex_expect("FSSTArray symbol_lengths child")
     }
 
     /// Access the codes array
     pub fn codes(&self) -> ArrayData {
         self.as_ref()
-            .child(2, &self.codes_dtype(), self.len())
+            // TODO(aduffy): fix cloning
+            .child(2, &Arc::new(self.codes_dtype()), self.len())
             .vortex_expect("FSSTArray codes child")
     }
@@ -139,10 +152,10 @@ impl FSSTArray {
     /// Get the DType of the uncompressed lengths array
     #[inline]
-    pub fn uncompressed_lengths_dtype(&self) -> DType {
-        DType::Primitive(
+    pub fn uncompressed_lengths_dtype(&self) -> &Arc {
+        primitive_dtype_ref!(
             self.metadata().uncompressed_lengths_ptype,
-            Nullability::NonNullable,
+            Nullability::NonNullable
         )
     }
diff --git a/encodings/fsst/src/canonical.rs b/encodings/fsst/src/canonical.rs
index b7570687c0..fc337af157 100644
--- a/encodings/fsst/src/canonical.rs
+++ b/encodings/fsst/src/canonical.rs
@@ -55,7 +55,8 @@ impl IntoCanonical for FSSTArray {
             VarBinViewArray::try_new(
                 views,
                 vec![uncompressed_bytes_array],
-                self.dtype().clone(),
+                // TODO(aduffy): fix cloning
+                self.dtype().as_ref().clone(),
                 self.validity(),
             )
             .map(Canonical::VarBinView)
diff --git a/encodings/fsst/src/compress.rs b/encodings/fsst/src/compress.rs
index 6b65ad39db..0243c6d24d 100644
--- a/encodings/fsst/src/compress.rs
+++ b/encodings/fsst/src/compress.rs
@@ -18,7 +18,8 @@ use crate::FSSTArray;
 /// If the `strings` array is not encoded as either [`VarBinArray`] or [`VarBinViewArray`].
 pub fn fsst_compress(strings: &ArrayData, compressor: &Compressor) -> VortexResult {
     let len = strings.len();
-    let dtype = strings.dtype().clone();
+    // TODO(aduffy): fix cloning
+    let dtype = strings.dtype().as_ref().clone();
 
     // Compress VarBinArray
     if let Ok(varbin) = VarBinArray::try_from(strings.clone()) {
diff --git a/encodings/fsst/src/compute/compare.rs b/encodings/fsst/src/compute/compare.rs
index 234174137c..a03cf23934 100644
--- a/encodings/fsst/src/compute/compare.rs
+++ b/encodings/fsst/src/compute/compare.rs
@@ -55,7 +55,7 @@ fn compare_fsst_constant(
     }
     let compressor = compressor.build();
 
-    let encoded_scalar = match left.dtype() {
+    let encoded_scalar = match left.dtype().as_ref() {
         DType::Utf8(_) => {
             let value = right
                 .scalar()
diff --git a/encodings/fsst/src/compute/mod.rs b/encodings/fsst/src/compute/mod.rs
index 0ce03f932e..2a22b1a458 100644
--- a/encodings/fsst/src/compute/mod.rs
+++ b/encodings/fsst/src/compute/mod.rs
@@ -39,7 +39,8 @@ impl SliceFn for FSSTEncoding {
         // Slicing an FSST array leaves the symbol table unmodified,
         // only slicing the `codes` array.
         Ok(FSSTArray::try_new(
-            array.dtype().clone(),
+            // TODO(aduffy): fix cloning
+            array.dtype().as_ref().clone(),
             array.symbols(),
             array.symbol_lengths(),
             slice(array.codes(), start, stop)?,
@@ -53,7 +54,8 @@ impl TakeFn for FSSTEncoding {
     // Take on an FSSTArray is a simple take on the codes array.
     fn take(&self, array: &FSSTArray, indices: &ArrayData) -> VortexResult {
         Ok(FSSTArray::try_new(
-            array.dtype().clone(),
+            // TODO(aduffy): fix cloning
+            array.dtype().as_ref().clone(),
             array.symbols(),
             array.symbol_lengths(),
             take(array.codes(), indices)?,
@@ -82,7 +84,8 @@ impl FilterFn for FSSTEncoding {
     // Filtering an FSSTArray filters the codes array, leaving the symbols array untouched
     fn filter(&self, array: &FSSTArray, mask: &FilterMask) -> VortexResult {
         Ok(FSSTArray::try_new(
-            array.dtype().clone(),
+            // TODO(aduffy): fix cloning
+            array.dtype().as_ref().clone(),
             array.symbols(),
             array.symbol_lengths(),
             filter(&array.codes(), mask)?,
diff --git a/encodings/roaring/src/boolean/mod.rs b/encodings/roaring/src/boolean/mod.rs
index e3c114bece..36e6e9abd6 100644
--- a/encodings/roaring/src/boolean/mod.rs
+++ b/encodings/roaring/src/boolean/mod.rs
@@ -13,10 +13,11 @@ use vortex_array::validity::{LogicalValidity, ValidityVTable};
 use vortex_array::variants::{BoolArrayTrait, VariantsVTable};
 use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
 use vortex_array::{
-    impl_encoding, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData, IntoCanonical,
+    bool_dtype, impl_encoding, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData,
+    IntoCanonical,
 };
 use vortex_buffer::ByteBuffer;
-use vortex_dtype::{DType, Nullability};
+use vortex_dtype::Nullability;
 use vortex_error::{vortex_bail, vortex_err, VortexExpect as _, VortexResult};
 
 mod compress;
@@ -52,7 +53,7 @@ impl RoaringBoolArray {
 
         ArrayData::try_new_owned(
             &RoaringBoolEncoding,
-            DType::Bool(Nullability::NonNullable),
+            bool_dtype!(Nullability::NonNullable),
             length,
             Arc::new(RoaringBoolMetadata),
             [ByteBuffer::from(bitmap.serialize::())].into(),
diff --git a/encodings/roaring/src/integer/mod.rs b/encodings/roaring/src/integer/mod.rs
index de8489db2d..c213a62b04 100644
--- a/encodings/roaring/src/integer/mod.rs
+++ b/encodings/roaring/src/integer/mod.rs
@@ -12,12 +12,12 @@ use vortex_array::validity::{LogicalValidity, Validity, ValidityVTable};
 use vortex_array::variants::{PrimitiveArrayTrait, VariantsVTable};
 use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
 use vortex_array::{
-    impl_encoding, ArrayDType as _, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData,
-    IntoCanonical,
+    impl_encoding, primitive_dtype, ArrayDType as _, ArrayData, ArrayLen, ArrayTrait, Canonical,
+    IntoArrayData, IntoCanonical,
 };
 use vortex_buffer::{Buffer, ByteBuffer};
 use vortex_dtype::Nullability::NonNullable;
-use vortex_dtype::{DType, PType};
+use vortex_dtype::PType;
 use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};
 
 mod compress;
@@ -65,7 +65,7 @@ impl RoaringIntArray {
 
         ArrayData::try_new_owned(
             &RoaringIntEncoding,
-            DType::Primitive(ptype, NonNullable),
+            primitive_dtype!(ptype, NonNullable),
             length,
             Arc::new(RoaringIntMetadata { ptype }),
             [ByteBuffer::from(bitmap.serialize::())].into(),
diff --git a/encodings/runend-bool/src/array.rs b/encodings/runend-bool/src/array.rs
index d79e635681..96483ab6c7 100644
--- a/encodings/runend-bool/src/array.rs
+++ b/encodings/runend-bool/src/array.rs
@@ -3,16 +3,19 @@ use std::fmt::{Debug, Display};
 use serde::{Deserialize, Serialize};
 use vortex_array::array::{BoolArray, PrimitiveArray};
 use vortex_array::compute::{scalar_at, search_sorted_usize, SearchSortedSide};
+use vortex_array::dtypes::DTYPE_BOOL_NONNULL;
 use vortex_array::encoding::ids;
 use vortex_array::stats::{ArrayStatistics, Stat, StatisticsVTable, StatsSet};
 use vortex_array::validity::{LogicalValidity, Validity, ValidityMetadata, ValidityVTable};
 use vortex_array::variants::{BoolArrayTrait, PrimitiveArrayTrait, VariantsVTable};
 use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
 use vortex_array::{
-    impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData,
-    IntoArrayVariant, IntoCanonical,
+    bool_dtype, impl_encoding, primitive_dtype_ref, ArrayDType, ArrayData, ArrayLen, ArrayTrait,
+    Canonical, IntoArrayData, IntoArrayVariant, IntoCanonical,
+};
+use vortex_dtype::{
+    match_each_integer_ptype, match_each_unsigned_integer_ptype, Nullability, PType,
 };
-use vortex_dtype::{match_each_integer_ptype, match_each_unsigned_integer_ptype, DType, PType};
 use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};
 use vortex_scalar::Scalar;
@@ -71,7 +74,6 @@ impl RunEndBoolArray {
             }
         }
 
-        let dtype = DType::Bool(validity.nullability());
         let ends_ptype = ends.dtype().try_into()?;
         let metadata = RunEndBoolMetadata {
             start,
@@ -102,6 +104,7 @@ impl RunEndBoolArray {
             StatsSet::default()
         };
 
+        let dtype = bool_dtype!(validity.nullability());
         let mut children = Vec::with_capacity(2);
         children.push(ends);
         if let Some(a) = validity.into_array() {
@@ -131,7 +134,7 @@ impl RunEndBoolArray {
         self.as_ref()
             .child(
                 0,
-                &self.metadata().ends_ptype.into(),
+                primitive_dtype_ref!(self.metadata().ends_ptype, Nullability::NonNullable),
                 self.metadata().num_runs,
             )
             .vortex_expect("RunEndBoolArray is missing its run ends")
     }
 
     pub fn validity(&self) -> Validity {
         self.metadata().validity.to_validity(|| {
             self.as_ref()
-                .child(1, &Validity::DTYPE, self.len())
+                .child(1, &DTYPE_BOOL_NONNULL, self.len())
                 .vortex_expect("RunEndBoolArray: validity child")
         })
     }
@@ -286,7 +289,7 @@ mod test {
         )
         .unwrap();
         assert_eq!(arr.len(), 5);
-        assert_eq!(arr.dtype(), &DType::Bool(Nullability::NonNullable));
+        assert_eq!(arr.dtype().as_ref(), &DType::Bool(Nullability::NonNullable));
 
         assert_eq!(scalar_at(arr.as_ref(), 0).unwrap(), false.into());
         assert_eq!(scalar_at(arr.as_ref(), 2).unwrap(), true.into());
@@ -308,7 +311,7 @@ mod test {
             8,
         )
         .unwrap();
-        assert_eq!(arr.dtype(), &DType::Bool(Nullability::NonNullable));
+        assert_eq!(arr.dtype().as_ref(), &DType::Bool(Nullability::NonNullable));
 
         assert_eq!(
             to_bool_vec(&arr),
@@ -323,7 +326,7 @@ mod test {
         ])
         .to_array();
         let arr = slice(&raw, 2, 8).unwrap();
-        assert_eq!(arr.dtype(), &DType::Bool(Nullability::NonNullable));
+        assert_eq!(arr.dtype().as_ref(), &DType::Bool(Nullability::NonNullable));
 
         assert_eq!(
             to_bool_vec(&arr),
diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs
index d714c9dbc0..45bc6834ae 100644
--- a/encodings/runend/src/array.rs
+++ b/encodings/runend/src/array.rs
@@ -11,11 +11,11 @@ use vortex_array::validity::{ArrayValidity, LogicalValidity, ValidityVTable};
 use vortex_array::variants::{BoolArrayTrait, PrimitiveArrayTrait, VariantsVTable};
 use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
 use vortex_array::{
-    impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData,
-    IntoArrayVariant, IntoCanonical,
+    impl_encoding, primitive_dtype_ref, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical,
+    IntoArrayData, IntoArrayVariant, IntoCanonical,
 };
 use vortex_buffer::Buffer;
-use vortex_dtype::{DType, PType};
+use vortex_dtype::{DType, Nullability, PType};
 use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};
 use vortex_scalar::Scalar;
@@ -52,7 +52,10 @@ impl RunEndArray {
         offset: usize,
         length: usize,
     ) -> VortexResult {
-        if !matches!(values.dtype(), &DType::Bool(_) | &DType::Primitive(_, _)) {
+        if !matches!(
+            values.dtype().as_ref(),
+            DType::Bool(_) | DType::Primitive(_, _)
+        ) {
             vortex_bail!(
                 "RunEnd array can only have Bool or Primitive values, {} given",
                 values.dtype()
             )
         }
@@ -135,7 +138,7 @@ impl RunEndArray {
         self.as_ref()
             .child(
                 0,
-                &DType::from(self.metadata().ends_ptype),
+                primitive_dtype_ref!(self.metadata().ends_ptype, Nullability::NonNullable),
                 self.metadata().num_runs,
             )
             .vortex_expect("RunEndArray is missing its run ends")
@@ -201,7 +204,7 @@ impl ValidityVTable for RunEndEncoding {
 impl IntoCanonical for RunEndArray {
     fn into_canonical(self) -> VortexResult {
         let pends = self.ends().into_primitive()?;
-        match self.dtype() {
+        match self.dtype().as_ref() {
             DType::Bool(_) => {
                 let bools = self.values().into_bool()?;
                 runend_decode_bools(pends, bools, self.offset(), self.len()).map(Canonical::Bool)
             }
@@ -278,7 +281,7 @@ mod tests {
         .unwrap();
         assert_eq!(arr.len(), 10);
         assert_eq!(
-            arr.dtype(),
+            arr.dtype().as_ref(),
             &DType::Primitive(PType::I32, Nullability::NonNullable)
         );
diff --git a/encodings/runend/src/compress.rs b/encodings/runend/src/compress.rs
index 12eec0f0e5..673ed7bff7 100644
--- a/encodings/runend/src/compress.rs
+++ b/encodings/runend/src/compress.rs
@@ -20,7 +20,8 @@ pub fn runend_encode(array: &PrimitiveArray) -> VortexResult<(PrimitiveArray, Ar
             // We can trivially return an all-null REE array
             return Ok((
                 PrimitiveArray::new(buffer![array.len() as u64], Validity::NonNullable),
-                ConstantArray::new(Scalar::null(array.dtype().clone()), 1).into_array(),
+                // TODO(aduffy): fix cloning
+                ConstantArray::new(Scalar::null(array.dtype().as_ref().clone()), 1).into_array(),
             ));
         }
         Validity::Array(a) => Some(a.into_bool()?.boolean_buffer()),
diff --git a/encodings/runend/src/compute/mod.rs b/encodings/runend/src/compute/mod.rs
index 72f3fd12ef..afac0d3525 100644
--- a/encodings/runend/src/compute/mod.rs
+++ b/encodings/runend/src/compute/mod.rs
@@ -182,7 +182,7 @@ mod test {
         )
         .unwrap();
         assert_eq!(
-            arr.dtype(),
+            arr.dtype().as_ref(),
             &DType::Primitive(PType::I32, Nullability::NonNullable)
         );
         assert_eq!(arr.len(), 5);
@@ -230,7 +230,7 @@ mod test {
         )
         .unwrap();
         assert_eq!(
-            arr.dtype(),
+            arr.dtype().as_ref(),
             &DType::Primitive(PType::I32, Nullability::NonNullable)
         );
         assert_eq!(arr.len(), 6);
diff --git a/encodings/zigzag/src/array.rs b/encodings/zigzag/src/array.rs
index 12c28f6f26..fcfa226f65 100644
--- a/encodings/zigzag/src/array.rs
+++ b/encodings/zigzag/src/array.rs
@@ -8,10 +8,10 @@ use vortex_array::validity::{ArrayValidity, LogicalValidity, ValidityVTable};
 use vortex_array::variants::{PrimitiveArrayTrait, VariantsVTable};
 use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
 use vortex_array::{
-    impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayVariant,
-    IntoCanonical,
+    impl_encoding, primitive_dtype, primitive_dtype_ref, ArrayDType, ArrayData, ArrayLen,
+    ArrayTrait, Canonical, IntoArrayVariant, IntoCanonical,
 };
-use vortex_dtype::{DType, PType};
+use vortex_dtype::PType;
 use vortex_error::{vortex_bail, vortex_err, vortex_panic, VortexExpect as _, VortexResult};
 use vortex_scalar::Scalar;
 use zigzag::ZigZag as ExternalZigZag;
@@ -37,8 +37,8 @@ impl ZigZagArray {
             vortex_bail!(MismatchedTypes: "unsigned int", encoded_dtype);
         }
 
-        let dtype = DType::from(PType::try_from(&encoded_dtype)?.to_signed())
-            .with_nullability(encoded_dtype.nullability());
+        let ptype = PType::try_from(&encoded_dtype)?.to_signed();
+        let dtype = primitive_dtype!(ptype, encoded_dtype.nullability());
         let len = encoded.len();
         let children = [encoded];
@@ -62,9 +62,12 @@ impl ZigZagArray {
         let ptype = PType::try_from(self.dtype()).unwrap_or_else(|err| {
             vortex_panic!(err, "Failed to convert DType {} to PType", self.dtype())
         });
-        let encoded = DType::from(ptype.to_unsigned()).with_nullability(self.dtype().nullability());
         self.as_ref()
-            .child(0, &encoded, self.len())
+            .child(
+                0,
+                primitive_dtype_ref!(ptype.to_unsigned(), self.dtype().nullability()),
+                self.len(),
+            )
             .vortex_expect("ZigZagArray is missing its encoded child array")
     }
 }
diff --git a/fuzz/src/filter.rs b/fuzz/src/filter.rs
index 9c2bb87ba8..2ff6ae37e3 100644
--- a/fuzz/src/filter.rs
+++ b/fuzz/src/filter.rs
@@ -28,7 +28,7 @@ pub fn filter_canonical_array(array: &ArrayData, filter: &[bool]) -> ArrayData {
         Validity::NonNullable
     };
 
-    match array.dtype() {
+    match array.dtype().as_ref() {
         DType::Bool(_) => {
             let bool_array = array.clone().into_bool().unwrap();
             BoolArray::try_new(
@@ -67,7 +67,8 @@ pub fn filter_canonical_array(array: &ArrayData, filter: &[bool]) -> ArrayData {
                         .collect::>()
                 })
                 .unwrap();
-            VarBinViewArray::from_iter(values, array.dtype().clone()).into_array()
+            // TODO(aduffy): fix extra clone
+            VarBinViewArray::from_iter(values, array.dtype().as_ref().clone()).into_array()
         }
         DType::Struct(..) => {
             let struct_array = array.clone().into_struct().unwrap();
diff --git a/fuzz/src/search_sorted.rs b/fuzz/src/search_sorted.rs
index 24fdd7154e..540689a38f 100644
--- a/fuzz/src/search_sorted.rs
+++ b/fuzz/src/search_sorted.rs
@@ -61,7 +61,7 @@ pub fn search_sorted_canonical_array(
     scalar: &Scalar,
     side: SearchSortedSide,
 ) -> SearchResult {
-    match array.dtype() {
+    match array.dtype().as_ref() {
         DType::Bool(_) => {
             let bool_array = array.clone().into_bool().unwrap();
             let validity = bool_array
@@ -104,7 +104,7 @@ pub fn search_sorted_canonical_array(
             let opt_values = utf8
                 .with_iterator(|iter| iter.map(|v| v.map(|u| u.to_vec())).collect::>())
                 .unwrap();
-            let to_find = if matches!(array.dtype(), DType::Utf8(_)) {
+            let to_find = if matches!(array.dtype().as_ref(), DType::Utf8(_)) {
                 BufferString::try_from(scalar)
                     .unwrap()
                     .as_str()
diff --git a/fuzz/src/slice.rs b/fuzz/src/slice.rs
index 1b46952bfc..2f9969b207 100644
--- a/fuzz/src/slice.rs
+++ b/fuzz/src/slice.rs
@@ -21,7 +21,7 @@ pub fn slice_canonical_array(array: &ArrayData, start: usize, stop: usize) -> Ar
         Validity::NonNullable
     };
 
-    match array.dtype() {
+    match array.dtype().as_ref() {
         DType::Bool(_) => {
             let bool_array = array.clone().into_bool().unwrap();
             let sliced_bools = bool_array.boolean_buffer().slice(start, stop - start);
@@ -40,8 +40,12 @@ pub fn slice_canonical_array(array: &ArrayData, start: usize, stop: usize) -> Ar
             let values = utf8
                 .with_iterator(|iter| iter.map(|v| v.map(|u| u.to_vec())).collect::>())
                 .unwrap();
-            VarBinViewArray::from_iter(values[start..stop].iter().cloned(), array.dtype().clone())
-                .into_array()
+            // TODO(aduffy): fix extra clone
+            VarBinViewArray::from_iter(
+                values[start..stop].iter().cloned(),
+                array.dtype().as_ref().clone(),
+            )
+            .into_array()
         }
         DType::Struct(..) => {
             let struct_array = array.clone().into_struct().unwrap();
diff --git a/fuzz/src/sort.rs b/fuzz/src/sort.rs
index 50c0195216..037aac7cfa 100644
--- a/fuzz/src/sort.rs
+++ b/fuzz/src/sort.rs
@@ -10,7 +10,7 @@ use vortex_dtype::{match_each_native_ptype, DType, NativePType};
 use crate::take::take_canonical_array;
 
 pub fn sort_canonical_array(array: &ArrayData) -> ArrayData {
-    match array.dtype() {
+    match array.dtype().as_ref() {
         DType::Bool(_) => {
             let bool_array = array.clone().into_bool().unwrap();
             let mut opt_values = bool_array
@@ -58,7 +58,8 @@ pub fn sort_canonical_array(array: &ArrayData) -> ArrayData {
                 .with_iterator(|iter| iter.map(|v| v.map(|u| u.to_vec())).collect::>())
                 .unwrap();
             sort_opt_slice(&mut opt_values);
-            VarBinViewArray::from_iter(opt_values, array.dtype().clone()).into_array()
+            // TODO(aduffy): fix extra clone
+            VarBinViewArray::from_iter(opt_values, array.dtype().as_ref().clone()).into_array()
         }
         DType::Struct(..) => {
             let mut sort_indices = (0..array.len()).collect::>();
diff --git a/fuzz/src/take.rs b/fuzz/src/take.rs
index 080a989796..98c017c6ca 100644
--- a/fuzz/src/take.rs
+++ b/fuzz/src/take.rs
@@ -26,7 +26,7 @@ pub fn take_canonical_array(array: &ArrayData, indices: &[usize]) -> ArrayData {
         Validity::NonNullable
     };
 
-    match array.dtype() {
+    match array.dtype().as_ref() {
         DType::Bool(_) => {
             let bool_array = array.clone().into_bool().unwrap();
             let vec_values = bool_array.boolean_buffer().iter().collect::>();
@@ -47,7 +47,8 @@ pub fn take_canonical_array(array: &ArrayData, indices: &[usize]) -> ArrayData {
                 .unwrap();
             VarBinViewArray::from_iter(
                 indices.iter().map(|i| values[*i].clone()),
-                array.dtype().clone(),
+                // TODO(aduffy): fix extra clone
+                array.dtype().as_ref().clone(),
             )
             .into_array()
         }
diff --git a/pyvortex/demo/repro_compress.py b/pyvortex/demo/repro_compress.py
new file mode 100644
index 0000000000..145f692a4a
--- /dev/null
+++ b/pyvortex/demo/repro_compress.py
@@ -0,0 +1,20 @@
+import vortex as vx
+import pyarrow.parquet as pq
+from time import time
+
+
+print("reading parquet file to arrow...", end="", flush=True)
+start = time()
+table = pq.read_table("/Users/aduffy/Downloads/share_vortex/A0.small.50.parquet")
+print("completed in ", time() - start)
+
+print("reading arrow to Vortex...", end="", flush=True)
+start = time()
+vtable = vx.array(table)
+print("completed in ", time() - start)
+
+print("compressing vortex...", end="", flush=True)
+start = time()
+vtable = vx.compress(vtable)
+print("completed in ", time() - start)
+
diff --git a/pyvortex/demo/repro_decompress.py b/pyvortex/demo/repro_decompress.py
new file mode 100644
index 0000000000..3a073f82c4
--- /dev/null
+++ b/pyvortex/demo/repro_decompress.py
@@ -0,0 +1,25 @@
+import vortex as vx
+import pyarrow.parquet as pq
+from time import time
+
+
+print("reading parquet file to arrow...", end="", flush=True)
+start = time()
+table = pq.read_table("/Users/aduffy/Downloads/share_vortex/A0.small.50.parquet")
+print("completed in ", time() - start)
+
+print("reading arrow to Vortex...", end="", flush=True)
+start = time()
+vtable = vx.array(table)
+print("completed in ", time() - start)
+
+print("compressing vortex...", end="", flush=True)
+start = time()
+vtable = vx.compress(vtable)
+print("completed in ", time() - start)
+
+print("decompressing vortex to arrow...", end="", flush=True)
+start = time()
+arr = vtable.to_arrow_table()
+duration = time() - start
+print("completed in ", duration)
\ No newline at end of file
diff --git a/pyvortex/demo/users.py b/pyvortex/demo/users.py
new file mode 100644
index 0000000000..85a9784703
--- /dev/null
+++ b/pyvortex/demo/users.py
@@ -0,0 +1,62 @@
+import pyarrow.parquet as pq
+import vortex as vx
+import numpy as np
+from time import time
+
+
+# taken from OpenAI text-3-small
+EMBED_DIM = 1536
+N_EMBEDS = 1
+
+def generate_users(row_count, col_count, wrapped=False):
+    users = []
+    for i in range(row_count):
+        #username = f"user_{i}"
+        # truncate the precision to 4 decimal places to trigger ALP
+        if wrapped:
+            user_dict = {f"embed_{i}": {"vals": np.round(np.random.rand(EMBED_DIM), 7) } for i in range(col_count) }
+        else:
+            user_dict = {f"embed_{i}": np.round(np.random.rand(EMBED_DIM), 7) for i in range(col_count) }
+        #embedding = np.round(np.random.rand(EMBED_DIM), 4).tolist()
+        #users.append({"username": username, "embedding": embedding})
+        users.append(user_dict)
+    return users
+
+#
+#print("generating test users...")
+#start = time()
+#test_users = generate_users(10_000)
+#print("completed in ", time() - start)
+#
+## emit the users to Vortex
+#print("moving array into vortex...")
+#start = time()
+#arr = vx.array(test_users)
+#print("completed in ", time() - start)
+##print(arr.tree_display())
+#
+#print("compressing vortex array")
+#start = time()
+#comp = vx.compress(arr)
+#print("completed in ", time() - start)
+#print(comp.tree_display())
+
+def generate_list_cols(row_count, col_count, wrapped):
+    users = generate_users(row_count, col_count, wrapped)
+    arr = vx.array(users)
+    # compression speed
+    start = time()
+    comp = vx.compress(arr)
+    duration = time() - start
+    print(f"wrapped={wrapped},rows={row_count},cols={col_count},time={duration}")
+    #print(comp.tree_display())
+    start = time()
+    comp.to_arrow_table()
+    print(" decompress time: ", time() - start)
+
+
+for row_count in [1_000]:
+    for col_count in [14000]:
+        for wrapped in [False, True]:
+            #for col_count in [1, 100, 200, 300, 400, 500, 700, 800, 900, 1_000]:
+            generate_list_cols(row_count, col_count, wrapped)
diff --git a/pyvortex/src/array.rs b/pyvortex/src/array.rs
index 43f88bb610..3294878969 100644
--- a/pyvortex/src/array.rs
+++ b/pyvortex/src/array.rs
@@ -190,7 +190,8 @@ impl PyArray {
     ///     utf8(False)
     #[getter]
     fn dtype(self_: PyRef) -> PyResult> {
-        PyDType::wrap(self_.py(), self_.inner.dtype().clone())
+        // TODO(aduffy): fix extra clone
+        PyDType::wrap(self_.py(), self_.inner.dtype().as_ref().clone())
     }
 
     // Rust docs are *not* copied into Python for __lt__: https://github.com/PyO3/pyo3/issues/4326
diff --git a/pyvortex/src/encode.rs b/pyvortex/src/encode.rs
index e533a5f794..92b7b14688 100644
--- a/pyvortex/src/encode.rs
+++ b/pyvortex/src/encode.rs
@@ -1,3 +1,5 @@
+use std::sync::Arc;
+
 use arrow::array::{make_array, ArrayData as ArrowArrayData};
 use arrow::datatypes::{DataType, Field};
 use arrow::ffi_stream::ArrowArrayStreamReader;
@@ -42,7 +44,8 @@ pub fn _encode<'py>(obj: &Bound<'py, PyAny>) -> PyResult> {
             .map(|dt| DType::from_arrow(&Field::new("_", dt, false)))?;
         Bound::new(
             obj.py(),
-            PyArray::new(ChunkedArray::try_new(encoded_chunks, dtype)?.into_array()),
+            // TODO(aduffy): fix extra clone
+            PyArray::new(ChunkedArray::try_new(encoded_chunks, Arc::new(dtype))?.into_array()),
         )
     } else if obj.is_instance(&table)? {
         let array_stream = ArrowArrayStreamReader::from_pyarrow_bound(obj)?;
@@ -54,7 +57,8 @@ pub fn _encode<'py>(obj: &Bound<'py, PyAny>) -> PyResult> {
             .collect::>>()?;
         Bound::new(
            obj.py(),
-            PyArray::new(ChunkedArray::try_new(chunks, dtype)?.into_array()),
+            // TODO(aduffy): fix extra clone
+            PyArray::new(ChunkedArray::try_new(chunks, Arc::new(dtype))?.into_array()),
         )
     } else {
         Err(PyValueError::new_err(
diff --git a/vortex-array/src/array/bool/compute/fill_null.rs b/vortex-array/src/array/bool/compute/fill_null.rs
index 933a93678f..454441865a 100644
--- a/vortex-array/src/array/bool/compute/fill_null.rs
+++ b/vortex-array/src/array/bool/compute/fill_null.rs
@@ -58,7 +58,7 @@ mod tests {
             expected
         );
         assert_eq!(
-            non_null_array.dtype(),
+            non_null_array.dtype().as_ref(),
             &DType::Bool(Nullability::NonNullable)
         );
     }
diff --git a/vortex-array/src/array/bool/mod.rs b/vortex-array/src/array/bool/mod.rs
index 4d9ec80fb5..4d323d83a0 100644
--- a/vortex-array/src/array/bool/mod.rs
+++ b/vortex-array/src/array/bool/mod.rs
@@ -5,9 +5,10 @@ use arrow_array::BooleanArray;
 use arrow_buffer::{BooleanBufferBuilder, MutableBuffer};
 use serde::{Deserialize, Serialize};
 use vortex_buffer::{Alignment, ByteBuffer};
-use vortex_dtype::{DType, Nullability};
+use vortex_dtype::Nullability;
 use vortex_error::{VortexExpect as _, VortexResult};
 
+use crate::dtypes::{bool_dtype, DTYPE_BOOL_NONNULL};
 use crate::encoding::ids;
 use crate::stats::StatsSet;
 use crate::validity::{LogicalValidity, Validity, ValidityMetadata, ValidityVTable};
@@ -92,7 +93,7 @@ impl BoolArray {
     pub fn validity(&self) -> Validity {
         self.metadata().validity.to_validity(|| {
             self.as_ref()
-                .child(0, &Validity::DTYPE, self.len())
+                .child(0, &DTYPE_BOOL_NONNULL, self.len())
                 .vortex_expect("BoolArray: validity child")
         })
     }
@@ -121,7 +122,7 @@ impl BoolArray {
 
         ArrayData::try_new_owned(
             &BoolEncoding,
-            DType::Bool(validity.nullability()),
+            bool_dtype!(validity.nullability()),
             buffer_len,
             Arc::new(BoolMetadata {
                 validity: validity.to_metadata(buffer_len)?,
diff --git a/vortex-array/src/array/chunked/canonical.rs b/vortex-array/src/array/chunked/canonical.rs
index 846726d539..2923402468 100644
--- a/vortex-array/src/array/chunked/canonical.rs
+++ b/vortex-array/src/array/chunked/canonical.rs
@@ -1,3 +1,5 @@
+use std::sync::Arc;
+
 use arrow_buffer::BooleanBufferBuilder;
 use vortex_buffer::BufferMut;
 use vortex_dtype::{match_each_native_ptype, DType, NativePType, Nullability, PType, StructDType};
@@ -28,7 +30,7 @@ impl IntoCanonical for ChunkedArray {
 pub(crate) fn try_canonicalize_chunks(
     chunks: Vec,
     validity: Validity,
-    dtype: &DType,
+    dtype: &Arc,
 ) -> VortexResult {
     let mismatched = chunks
         .iter()
@@ -38,7 +40,7 @@ pub(crate) fn try_canonicalize_chunks(
         vortex_bail!(MismatchedTypes: dtype.clone(), ErrString::from(format!("{:?}", mismatched)))
     }
 
-    match dtype {
+    match dtype.as_ref() {
         // Structs can have their internal field pointers swizzled to push the chunking down
         // one level internally without copying or decompressing any data.
         DType::Struct(struct_dtype, _) => {
@@ -80,8 +82,9 @@ pub(crate) fn try_canonicalize_chunks(
                 .map(|chunk| chunk.clone().into_extension().map(|ext| ext.storage()))
                 .collect::>>()?;
             let storage_dtype = ext_dtype.storage_dtype().clone();
+            // TODO(aduffy): fix cloning.
let chunked_storage = - ChunkedArray::try_new(storage_chunks, storage_dtype)?.into_array(); + ChunkedArray::try_new(storage_chunks, Arc::new(storage_dtype))?.into_array(); Ok(Canonical::Extension(ExtensionArray::new( ext_dtype.clone(), @@ -160,7 +163,9 @@ fn pack_lists(chunks: &[ArrayData], validity: Validity, dtype: &DType) -> Vortex .map(|off| off + adjustment_from_previous - first_offset_value as i64), ); } - let chunked_elements = ChunkedArray::try_new(elements, elem_dtype.clone())?.into_array(); + // TODO(aduffy): fix cloning. + let chunked_elements = + ChunkedArray::try_new(elements, Arc::new(elem_dtype.clone()))?.into_array(); let offsets = PrimitiveArray::new(offsets.freeze(), Validity::NonNullable); ListArray::try_new(chunked_elements, offsets.into_array(), validity) @@ -185,7 +190,8 @@ fn swizzle_struct_chunks( .maybe_null_field_by_idx(field_idx) .ok_or_else(|| vortex_err!("All chunks must have same dtype; missing field at index {}, current chunk dtype: {}", field_idx, c.dtype())) ).collect::>>()?; - let field_array = ChunkedArray::try_new(field_chunks, field_dtype.clone())?; + // TODO(aduffy): fix cloning. + let field_array = ChunkedArray::try_new(field_chunks, Arc::new(field_dtype.clone()))?; field_arrays.push(field_array.into_array()); } @@ -366,7 +372,8 @@ mod tests { let chunked_list = ChunkedArray::try_new( vec![l1.clone().into_array(), l2.clone().into_array()], - List(Arc::new(Primitive(I32, NonNullable)), NonNullable), + // TODO(aduffy): find a way not to clone this again. + Arc::new(List(Arc::new(Primitive(I32, NonNullable)), NonNullable)), ); let canon_values = chunked_list.unwrap().into_list().unwrap(); diff --git a/vortex-array/src/array/chunked/compute/boolean.rs b/vortex-array/src/array/chunked/compute/boolean.rs index 4a109cc29a..3254865e3f 100644 --- a/vortex-array/src/array/chunked/compute/boolean.rs +++ b/vortex-array/src/array/chunked/compute/boolean.rs @@ -1,8 +1,9 @@ -use vortex_dtype::DType; +use vortex_dtype::Nullability; use vortex_error::VortexResult; use crate::array::{ChunkedArray, ChunkedEncoding}; use crate::compute::{binary_boolean, slice, BinaryBooleanFn, BinaryOperator}; +use crate::dtypes::bool_dtype; use crate::{ArrayDType, ArrayData, IntoArrayData}; impl BinaryBooleanFn for ChunkedEncoding { @@ -23,30 +24,30 @@ impl BinaryBooleanFn for ChunkedEncoding { } let nullable = lhs.dtype().is_nullable() || rhs.dtype().is_nullable(); - let dtype = DType::Bool(nullable.into()); - Ok(Some(ChunkedArray::try_new(chunks, dtype)?.into_array())) + let nullability: Nullability = nullable.into(); + Ok(Some( + ChunkedArray::try_new(chunks, bool_dtype!(nullability))?.into_array(), + )) } } #[cfg(test)] mod tests { - use vortex_dtype::{DType, Nullability}; use crate::array::{BoolArray, ChunkedArray}; use crate::compute::{binary_boolean, BinaryOperator}; + use crate::dtypes::{DTYPE_BOOL_NONNULL, DTYPE_BOOL_NULL}; use crate::{IntoArrayData, IntoArrayVariant}; #[test] fn test_bin_bool_chunked() { let arr0 = BoolArray::from_iter(vec![true, false]).into_array(); let arr1 = BoolArray::from_iter(vec![false, false, true]).into_array(); - let chunked1 = - ChunkedArray::try_new(vec![arr0, arr1], DType::Bool(Nullability::NonNullable)).unwrap(); + let chunked1 = ChunkedArray::try_new(vec![arr0, arr1], DTYPE_BOOL_NONNULL.clone()).unwrap(); let arr2 = BoolArray::from_iter(vec![Some(false), Some(true)]).into_array(); let arr3 = BoolArray::from_iter(vec![Some(false), None, Some(false)]).into_array(); - let chunked2 = - ChunkedArray::try_new(vec![arr2, arr3], 
DType::Bool(Nullability::Nullable)).unwrap(); + let chunked2 = ChunkedArray::try_new(vec![arr2, arr3], DTYPE_BOOL_NULL.clone()).unwrap(); assert_eq!( binary_boolean( diff --git a/vortex-array/src/array/chunked/compute/compare.rs b/vortex-array/src/array/chunked/compute/compare.rs index d6418b2093..72843dba85 100644 --- a/vortex-array/src/array/chunked/compute/compare.rs +++ b/vortex-array/src/array/chunked/compute/compare.rs @@ -1,8 +1,9 @@ -use vortex_dtype::DType; +use vortex_dtype::Nullability; use vortex_error::VortexResult; use crate::array::{ChunkedArray, ChunkedEncoding}; use crate::compute::{compare, slice, CompareFn, Operator}; +use crate::dtypes::bool_dtype; use crate::{ArrayDType, ArrayData, IntoArrayData}; impl CompareFn for ChunkedEncoding { @@ -23,12 +24,11 @@ impl CompareFn for ChunkedEncoding { idx += chunk.len(); } + let nullability: Nullability = + (lhs.dtype().is_nullable() || rhs.dtype().is_nullable()).into(); + Ok(Some( - ChunkedArray::try_new( - compare_chunks, - DType::Bool((lhs.dtype().is_nullable() || rhs.dtype().is_nullable()).into()), - )? - .into_array(), + ChunkedArray::try_new(compare_chunks, bool_dtype!(nullability))?.into_array(), )) } } diff --git a/vortex-array/src/array/chunked/compute/fill_null.rs b/vortex-array/src/array/chunked/compute/fill_null.rs index 14c5ff3bfa..ae4e903607 100644 --- a/vortex-array/src/array/chunked/compute/fill_null.rs +++ b/vortex-array/src/array/chunked/compute/fill_null.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use vortex_error::VortexResult; use vortex_scalar::Scalar; @@ -12,7 +14,8 @@ impl FillNullFn for ChunkedEncoding { .chunks() .map(|c| fill_null(c, fill_value.clone())) .collect::>>()?, - array.dtype().as_nonnullable(), + // TODO(aduffy): fix cloning. + Arc::new(array.dtype().as_nonnullable()), ) .map(|a| a.into_array()) } @@ -21,10 +24,11 @@ impl FillNullFn for ChunkedEncoding { #[cfg(test)] mod tests { use arrow_buffer::BooleanBuffer; - use vortex_dtype::{DType, Nullability}; + use vortex_dtype::Nullability; use crate::array::{BoolArray, ChunkedArray}; use crate::compute::fill_null; + use crate::dtypes::{DTYPE_BOOL_NONNULL, DTYPE_BOOL_NULL}; use crate::validity::Validity; use crate::{ArrayDType, IntoArrayData}; @@ -37,11 +41,11 @@ mod tests { .into_array(), BoolArray::new(BooleanBuffer::new_set(5), Nullability::Nullable).into_array(), ], - DType::Bool(Nullability::Nullable), + DTYPE_BOOL_NULL.clone(), ) .unwrap(); let filled = fill_null(chunked, false.into()).unwrap(); - assert_eq!(*filled.dtype(), DType::Bool(Nullability::NonNullable)); + assert_eq!(*filled.dtype(), DTYPE_BOOL_NONNULL.clone()); } } diff --git a/vortex-array/src/array/chunked/compute/filter.rs b/vortex-array/src/array/chunked/compute/filter.rs index fc29caf825..50878a2a98 100644 --- a/vortex-array/src/array/chunked/compute/filter.rs +++ b/vortex-array/src/array/chunked/compute/filter.rs @@ -181,11 +181,11 @@ fn find_chunk_idx(idx: usize, chunk_ends: &[u64]) -> (usize, usize) { #[cfg(test)] mod test { use vortex_dtype::half::f16; - use vortex_dtype::{DType, Nullability, PType}; + use vortex_dtype::{Nullability, PType}; use crate::array::{ChunkedArray, PrimitiveArray}; use crate::compute::{filter, FilterMask}; - use crate::IntoArrayData; + use crate::{primitive_dtype, IntoArrayData}; #[test] fn filter_chunked_floats() { @@ -209,7 +209,7 @@ mod test { ]) .into_array(), ], - DType::Primitive(PType::F16, Nullability::NonNullable), + primitive_dtype!(PType::F16, Nullability::NonNullable), ) .unwrap() .into_array(); diff --git 
a/vortex-array/src/array/chunked/compute/mod.rs b/vortex-array/src/array/chunked/compute/mod.rs index 7cb867ffb7..374535dd26 100644 --- a/vortex-array/src/array/chunked/compute/mod.rs +++ b/vortex-array/src/array/chunked/compute/mod.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use vortex_dtype::DType; use vortex_error::VortexResult; @@ -68,18 +70,19 @@ impl CastFn for ChunkedEncoding { cast_chunks.push(try_cast(&chunk, dtype)?); } - Ok(ChunkedArray::try_new(cast_chunks, dtype.clone())?.into_array()) + // TODO(aduffy): fix cloning. + Ok(ChunkedArray::try_new(cast_chunks, Arc::new(dtype.clone()))?.into_array()) } } #[cfg(test)] mod test { use vortex_buffer::buffer; - use vortex_dtype::{DType, Nullability, PType}; + use vortex_dtype::{Nullability, PType}; use crate::array::chunked::ChunkedArray; use crate::compute::try_cast; - use crate::{IntoArrayData, IntoArrayVariant}; + use crate::{primitive_dtype, IntoArrayData, IntoArrayVariant}; #[test] fn test_cast_chunked() { @@ -88,7 +91,7 @@ mod test { let chunked = ChunkedArray::try_new( vec![arr0, arr1], - DType::Primitive(PType::U32, Nullability::NonNullable), + primitive_dtype!(PType::U32, Nullability::NonNullable), ) .unwrap() .into_array(); @@ -96,7 +99,7 @@ mod test { // Two levels of chunking, just to be fancy. let root = ChunkedArray::try_new( vec![chunked], - DType::Primitive(PType::U32, Nullability::NonNullable), + primitive_dtype!(PType::U32, Nullability::NonNullable), ) .unwrap() .into_array(); @@ -104,7 +107,7 @@ mod test { assert_eq!( try_cast( &root, - &DType::Primitive(PType::U64, Nullability::NonNullable) + primitive_dtype!(PType::U64, Nullability::NonNullable).as_ref() ) .unwrap() .into_primitive() diff --git a/vortex-array/src/array/chunked/compute/scalar_at.rs b/vortex-array/src/array/chunked/compute/scalar_at.rs index 8ab111ef8d..8126303ad5 100644 --- a/vortex-array/src/array/chunked/compute/scalar_at.rs +++ b/vortex-array/src/array/chunked/compute/scalar_at.rs @@ -14,11 +14,11 @@ impl ScalarAtFn for ChunkedEncoding { #[cfg(test)] mod tests { use vortex_buffer::Buffer; - use vortex_dtype::{DType, Nullability, PType}; + use vortex_dtype::{Nullability, PType}; use crate::array::{ChunkedArray, PrimitiveArray}; use crate::compute::scalar_at; - use crate::IntoArrayData; + use crate::{primitive_dtype, IntoArrayData}; #[test] fn empty_children_both_sides() { @@ -30,7 +30,7 @@ mod tests { Buffer::::empty().into_array(), Buffer::::empty().into_array(), ], - DType::Primitive(PType::U64, Nullability::NonNullable), + primitive_dtype!(PType::U64, Nullability::NonNullable), ) .unwrap(); assert_eq!(scalar_at(array.as_ref(), 0).unwrap(), 1u64.into()); @@ -46,7 +46,7 @@ mod tests { Buffer::::empty().into_array(), PrimitiveArray::from_iter([3u64, 4]).into_array(), ], - DType::Primitive(PType::U64, Nullability::NonNullable), + primitive_dtype!(PType::U64, Nullability::NonNullable), ) .unwrap(); assert_eq!(scalar_at(array.as_ref(), 0).unwrap(), 1u64.into()); @@ -64,7 +64,7 @@ mod tests { PrimitiveArray::from_iter([1u64, 2]).into_array(), PrimitiveArray::from_iter([3u64, 4]).into_array(), ], - DType::Primitive(PType::U64, Nullability::NonNullable), + primitive_dtype!(PType::U64, Nullability::NonNullable), ) .unwrap(); assert_eq!(scalar_at(array.as_ref(), 0).unwrap(), 1u64.into()); diff --git a/vortex-array/src/array/chunked/compute/slice.rs b/vortex-array/src/array/chunked/compute/slice.rs index 9e90f820c6..a4f8057fd8 100644 --- a/vortex-array/src/array/chunked/compute/slice.rs +++ b/vortex-array/src/array/chunked/compute/slice.rs @@ -44,11 +44,12 @@ 
impl SliceFn for ChunkedEncoding { #[cfg(test)] mod tests { - use vortex_dtype::{DType, NativePType, Nullability, PType}; + use vortex_dtype::{NativePType, Nullability, PType}; use crate::array::{ChunkedArray, PrimitiveArray}; use crate::compute::slice; - use crate::{ArrayData, IntoArrayData, IntoArrayVariant}; + use crate::dtypes::DTYPE_U32_NONNULL; + use crate::{primitive_dtype, ArrayData, IntoArrayData, IntoArrayVariant}; fn chunked_array() -> ChunkedArray { ChunkedArray::try_new( @@ -57,7 +58,7 @@ mod tests { PrimitiveArray::from_iter([4u64, 5, 6]).into_array(), PrimitiveArray::from_iter([7u64, 8, 9]).into_array(), ], - DType::Primitive(PType::U64, Nullability::NonNullable), + primitive_dtype!(PType::U64, Nullability::NonNullable), ) .unwrap() } @@ -116,7 +117,7 @@ mod tests { #[test] fn slice_empty() { - let chunked = ChunkedArray::try_new(vec![], PType::U32.into()).unwrap(); + let chunked = ChunkedArray::try_new(vec![], DTYPE_U32_NONNULL.clone()).unwrap(); let sliced = slice(chunked, 0, 0).unwrap(); assert!(sliced.is_empty()); diff --git a/vortex-array/src/array/chunked/mod.rs b/vortex-array/src/array/chunked/mod.rs index 3a27e475e7..6093d9f401 100644 --- a/vortex-array/src/array/chunked/mod.rs +++ b/vortex-array/src/array/chunked/mod.rs @@ -3,15 +3,17 @@ //! Vortex is a chunked array library that's able to use std::fmt::{Debug, Display}; +use std::sync::Arc; use futures_util::stream; use serde::{Deserialize, Serialize}; use vortex_buffer::BufferMut; -use vortex_dtype::{DType, Nullability, PType}; +use vortex_dtype::DType; use vortex_error::{vortex_bail, vortex_panic, VortexExpect as _, VortexResult, VortexUnwrap}; use crate::array::primitive::PrimitiveArray; use crate::compute::{scalar_at, search_sorted_usize, SearchSortedSide}; +use crate::dtypes::DTYPE_U64_NONNULL; use crate::encoding::ids; use crate::iter::{ArrayIterator, ArrayIteratorAdapter}; use crate::stats::StatsSet; @@ -42,9 +44,7 @@ impl Display for ChunkedMetadata { } impl ChunkedArray { - const ENDS_DTYPE: DType = DType::Primitive(PType::U64, Nullability::NonNullable); - - pub fn try_new(chunks: Vec, dtype: DType) -> VortexResult { + pub fn try_new(chunks: Vec, dtype: Arc) -> VortexResult { for chunk in &chunks { if chunk.dtype() != &dtype { vortex_bail!(MismatchedTypes: dtype, chunk.dtype()); @@ -102,7 +102,7 @@ impl ChunkedArray { #[inline] pub fn chunk_offsets(&self) -> ArrayData { self.as_ref() - .child(0, &Self::ENDS_DTYPE, self.nchunks() + 1) + .child(0, &DTYPE_U64_NONNULL, self.nchunks() + 1) .vortex_expect("Missing chunk ends in ChunkedArray") } @@ -137,12 +137,17 @@ impl ChunkedArray { }) } + // TODO(aduffy): fix cloning. pub fn array_iterator(&self) -> impl ArrayIterator + '_ { - ArrayIteratorAdapter::new(self.dtype().clone(), self.chunks().map(Ok)) + ArrayIteratorAdapter::new(self.dtype().as_ref().clone(), self.chunks().map(Ok)) } + // TODO(aduffy): fix cloning. 
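+ // Note (editorial, not from the original patch): like `array_iterator` above,
+ // `ArrayStreamAdapter::new` still takes an owned `DType`, so the shared `Arc<DType>` is
+ // unwrapped with `as_ref().clone()` in `array_stream` below.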
pub fn array_stream(&self) -> impl ArrayStream + '_ { - ArrayStreamAdapter::new(self.dtype().clone(), stream::iter(self.chunks().map(Ok))) + ArrayStreamAdapter::new( + self.dtype().as_ref().clone(), + stream::iter(self.chunks().map(Ok)), + ) } pub fn rechunk(&self, target_bytesize: usize, target_rowsize: usize) -> VortexResult { @@ -242,7 +247,7 @@ mod test { use crate::array::chunked::ChunkedArray; use crate::compute::test_harness::test_binary_numeric; use crate::compute::{scalar_at, sub_scalar, try_cast}; - use crate::{assert_arrays_eq, ArrayDType, IntoArrayData, IntoArrayVariant}; + use crate::{assert_arrays_eq, primitive_dtype, ArrayDType, IntoArrayData, IntoArrayVariant}; fn chunked_array() -> ChunkedArray { ChunkedArray::try_new( @@ -251,7 +256,7 @@ mod test { buffer![4u64, 5, 6].into_array(), buffer![7u64, 8, 9].into_array(), ], - DType::Primitive(PType::U64, Nullability::NonNullable), + primitive_dtype!(PType::U64, Nullability::NonNullable), ) .unwrap() } @@ -295,7 +300,7 @@ mod test { fn test_rechunk_one_chunk() { let chunked = ChunkedArray::try_new( vec![buffer![0u64].into_array()], - DType::Primitive(PType::U64, Nullability::NonNullable), + primitive_dtype!(PType::U64, Nullability::NonNullable), ) .unwrap(); @@ -308,7 +313,7 @@ mod test { fn test_rechunk_two_chunks() { let chunked = ChunkedArray::try_new( vec![buffer![0u64].into_array(), buffer![5u64].into_array()], - DType::Primitive(PType::U64, Nullability::NonNullable), + primitive_dtype!(PType::U64, Nullability::NonNullable), ) .unwrap(); @@ -325,7 +330,7 @@ mod test { buffer![0u64, 1, 2, 3].into_array(), buffer![4u64, 5].into_array(), ], - DType::Primitive(PType::U64, Nullability::NonNullable), + primitive_dtype!(PType::U64, Nullability::NonNullable), ) .unwrap(); @@ -346,7 +351,7 @@ mod test { buffer![6u64, 7].into_array(), buffer![8u64, 9].into_array(), ], - DType::Primitive(PType::U64, Nullability::NonNullable), + primitive_dtype!(PType::U64, Nullability::NonNullable), ) .unwrap(); diff --git a/vortex-array/src/array/chunked/variants.rs b/vortex-array/src/array/chunked/variants.rs index 44f0dd60aa..2dc898cd1a 100644 --- a/vortex-array/src/array/chunked/variants.rs +++ b/vortex-array/src/array/chunked/variants.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use itertools::Itertools; use vortex_dtype::{DType, Field, FieldName}; use vortex_error::{vortex_err, vortex_panic, VortexExpect, VortexResult}; @@ -73,7 +75,8 @@ impl StructArrayTrait for ChunkedArray { } let projected_dtype = self.dtype().as_struct().map(|s| s.field_dtype(idx))?.ok()?; - let chunked = ChunkedArray::try_new(chunks, projected_dtype.clone()) + // TODO(aduffy): fix cloning. + let chunked = ChunkedArray::try_new(chunks, Arc::new(projected_dtype.clone())) .unwrap_or_else(|err| { vortex_panic!( err, @@ -109,7 +112,8 @@ impl StructArrayTrait for ChunkedArray { )?; ChunkedArray::try_new( chunks, - DType::Struct(projected_dtype, self.dtype().nullability()), + // TODO(aduffy): fix cloning. + Arc::new(DType::Struct(projected_dtype, self.dtype().nullability())), ) .map(|a| a.into_array()) } diff --git a/vortex-array/src/array/constant/canonical.rs b/vortex-array/src/array/constant/canonical.rs index a83951f784..c79271894e 100644 --- a/vortex-array/src/array/constant/canonical.rs +++ b/vortex-array/src/array/constant/canonical.rs @@ -23,7 +23,7 @@ impl IntoCanonical for ConstantArray { }, }; - Ok(match self.dtype() { + Ok(match self.dtype().as_ref() { DType::Null => Canonical::Null(NullArray::new(self.len())), DType::Bool(..) 
=> Canonical::Bool(BoolArray::try_new( if BoolScalar::try_from(scalar)?.value().unwrap_or_default() { diff --git a/vortex-array/src/array/constant/mod.rs b/vortex-array/src/array/constant/mod.rs index 1581971b6d..622e255977 100644 --- a/vortex-array/src/array/constant/mod.rs +++ b/vortex-array/src/array/constant/mod.rs @@ -1,4 +1,5 @@ use std::fmt::Display; +use std::sync::Arc; use serde::{Deserialize, Serialize}; use vortex_error::{VortexExpect, VortexResult}; @@ -40,7 +41,8 @@ impl ConstantArray { let stats = StatsSet::constant(&scalar, length); let (dtype, scalar_value) = scalar.into_parts(); Self::try_from_parts( - dtype, + // TODO(aduffy): fix cloning. + Arc::new(dtype), length, ConstantMetadata { scalar_value }, [].into(), @@ -52,7 +54,11 @@ impl ConstantArray { /// Returns the [`Scalar`] value of this constant array. pub fn scalar(&self) -> Scalar { // NOTE(ngates): these clones are pretty cheap. - Scalar::new(self.dtype().clone(), self.metadata().scalar_value.clone()) + // TODO(aduffy): no they're not. fix cloning. + Scalar::new( + self.dtype().as_ref().clone(), + self.metadata().scalar_value.clone(), + ) } } diff --git a/vortex-array/src/array/datetime/mod.rs b/vortex-array/src/array/datetime/mod.rs index 6b114874f2..19bef68cee 100644 --- a/vortex-array/src/array/datetime/mod.rs +++ b/vortex-array/src/array/datetime/mod.rs @@ -41,7 +41,7 @@ pub struct TemporalArray { macro_rules! assert_width { ($width:ty, $array:expr) => {{ - let DType::Primitive(ptype, _) = $array.dtype() else { + let DType::Primitive(ptype, _) = $array.dtype().as_ref() else { panic!("array must have primitive type"); }; @@ -82,7 +82,7 @@ impl TemporalArray { let ext_dtype = ExtDType::new( DATE_ID.clone(), - Arc::new(array.dtype().clone()), + array.dtype().clone(), Some(TemporalMetadata::Date(time_unit).into()), ); @@ -123,7 +123,7 @@ impl TemporalArray { ext: ExtensionArray::new( Arc::new(ExtDType::new( TIME_ID.clone(), - Arc::new(array.dtype().clone()), + array.dtype().clone(), Some(temporal_metadata.clone().into()), )), array, @@ -149,7 +149,7 @@ impl TemporalArray { ext: ExtensionArray::new( Arc::new(ExtDType::new( TIMESTAMP_ID.clone(), - Arc::new(array.dtype().clone()), + array.dtype().clone(), Some(temporal_metadata.clone().into()), )), array, diff --git a/vortex-array/src/array/extension/mod.rs b/vortex-array/src/array/extension/mod.rs index cf23e62dff..2103b0b79a 100644 --- a/vortex-array/src/array/extension/mod.rs +++ b/vortex-array/src/array/extension/mod.rs @@ -30,12 +30,12 @@ impl ExtensionArray { pub fn new(ext_dtype: Arc, storage: ArrayData) -> Self { assert_eq!( ext_dtype.storage_dtype(), - storage.dtype(), + storage.dtype().as_ref(), "ExtensionArray: storage_dtype must match storage array DType", ); Self::try_from_parts( - DType::Extension(ext_dtype), + Arc::new(DType::Extension(ext_dtype)), storage.len(), ExtensionMetadata, [storage].into(), @@ -45,8 +45,13 @@ impl ExtensionArray { } pub fn storage(&self) -> ArrayData { + // TODO(aduffy): fix cloning. 
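+ // Note (editorial, not from the original patch): `ArrayData::child` now expects an
+ // `&Arc<DType>`, while `ExtDType::storage_dtype` still returns a plain `&DType`,
+ // hence the temporary `Arc::new(... .clone())` below.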
self.as_ref() - .child(0, self.ext_dtype().storage_dtype(), self.len()) + .child( + 0, + &Arc::new(self.ext_dtype().storage_dtype().clone()), + self.len(), + ) .vortex_expect("Missing storage array for ExtensionArray") } diff --git a/vortex-array/src/array/list/compute/mod.rs b/vortex-array/src/array/list/compute/mod.rs index 317f574c75..f6a2fa1934 100644 --- a/vortex-array/src/array/list/compute/mod.rs +++ b/vortex-array/src/array/list/compute/mod.rs @@ -24,7 +24,7 @@ impl ScalarAtFn for ListEncoding { let scalars: Vec = (0..elem.len()).map(|i| scalar_at(&elem, i)).try_collect()?; Ok(Scalar::list( - Arc::new(elem.dtype().clone()), + Arc::clone(elem.dtype()), scalars, array.dtype().nullability(), )) } diff --git a/vortex-array/src/array/list/mod.rs b/vortex-array/src/array/list/mod.rs index eb9bdab32d..2bd8443512 100644 --- a/vortex-array/src/array/list/mod.rs +++ b/vortex-array/src/array/list/mod.rs @@ -7,7 +7,6 @@ use std::sync::Arc; use itertools::Itertools; use num_traits::AsPrimitive; use serde::{Deserialize, Serialize}; -#[cfg(feature = "test-harness")] use vortex_dtype::Nullability; use vortex_dtype::{match_each_native_ptype, DType, PType}; use vortex_error::{vortex_bail, vortex_panic, VortexExpect, VortexResult}; @@ -18,6 +17,7 @@ use crate::array::PrimitiveArray; #[cfg(feature = "test-harness")] use crate::builders::{ArrayBuilder, ListBuilder}; use crate::compute::{scalar_at, slice}; +use crate::dtypes::{primitive_dtype_ref, DTYPE_BOOL_NONNULL}; use crate::encoding::ids; use crate::stats::{StatisticsVTable, StatsSet}; use crate::validity::{LogicalValidity, Validity, ValidityMetadata, ValidityVTable}; @@ -73,7 +73,8 @@ impl ListArray { let offset_ptype = PType::try_from(offsets.dtype())?; - let list_dtype = DType::List(Arc::new(elements.dtype().clone()), nullability); + // TODO(aduffy): figure out if there's a way to intern the schema. + let list_dtype = Arc::new(DType::List(Arc::clone(elements.dtype()), nullability)); let mut children = vec![elements, offsets]; if let Some(val) = validity.into_array() { @@ -96,7 +97,7 @@ impl ListArray { pub fn validity(&self) -> Validity { self.metadata().validity.to_validity(|| { self.as_ref() - .child(2, &Validity::DTYPE, self.len()) + .child(2, &DTYPE_BOOL_NONNULL, self.len()) .vortex_expect("ListArray: validity child") }) } @@ -136,7 +137,11 @@ impl ListArray { pub fn offsets(&self) -> ArrayData { // TODO: find cheap transform self.as_ref() - .child(1, &self.metadata().offset_ptype.into(), self.len() + 1) + .child( + 1, + primitive_dtype_ref!(self.metadata().offset_ptype, Nullability::NonNullable), + self.len() + 1, + ) .vortex_expect("array contains offsets") } @@ -146,8 +151,9 @@ impl ListArray { pub fn elements(&self) -> ArrayData { let dtype = self .dtype() .as_list_element() .vortex_expect("must be list dtype"); + // TODO(aduffy): fix dtype clone. 
self.as_ref() - .child(0, dtype, self.metadata().elements_len) + .child(0, &Arc::new(dtype.clone()), self.metadata().elements_len) .vortex_expect("array contains elements") } } diff --git a/vortex-array/src/array/null/mod.rs b/vortex-array/src/array/null/mod.rs index a4087d45a2..3d6563c87c 100644 --- a/vortex-array/src/array/null/mod.rs +++ b/vortex-array/src/array/null/mod.rs @@ -1,4 +1,5 @@ use std::fmt::Display; +use std::sync::{Arc, LazyLock}; use serde::{Deserialize, Serialize}; use vortex_dtype::DType; @@ -25,10 +26,12 @@ impl Display for NullMetadata { } } +static DTYPE_NULL: LazyLock> = LazyLock::new(|| Arc::new(DType::Null)); + impl NullArray { pub fn new(len: usize) -> Self { Self::try_from_parts( - DType::Null, + DTYPE_NULL.clone(), len, NullMetadata, [].into(), diff --git a/vortex-array/src/array/primitive/mod.rs b/vortex-array/src/array/primitive/mod.rs index fa212cbda0..12420a5477 100644 --- a/vortex-array/src/array/primitive/mod.rs +++ b/vortex-array/src/array/primitive/mod.rs @@ -6,9 +6,10 @@ mod accessor; use arrow_buffer::BooleanBufferBuilder; use serde::{Deserialize, Serialize}; use vortex_buffer::{Buffer, BufferMut, ByteBuffer}; -use vortex_dtype::{match_each_native_ptype, DType, NativePType, Nullability, PType}; +use vortex_dtype::{match_each_native_ptype, NativePType, Nullability, PType}; use vortex_error::{vortex_panic, VortexExpect as _, VortexResult}; +use crate::dtypes::{primitive_dtype, DTYPE_BOOL_NONNULL}; use crate::encoding::ids; use crate::iter::Accessor; use crate::stats::StatsSet; @@ -42,7 +43,7 @@ impl PrimitiveArray { let len = buffer.len(); ArrayData::try_new_owned( &PrimitiveEncoding, - DType::from(T::PTYPE).with_nullability(validity.nullability()), + primitive_dtype!(T::PTYPE, validity.nullability()), len, Arc::new(PrimitiveMetadata { validity: validity.to_metadata(len).vortex_expect("Invalid validity"), @@ -96,7 +97,7 @@ impl PrimitiveArray { pub fn validity(&self) -> Validity { self.metadata().validity.to_validity(|| { self.as_ref() - .child(0, &Validity::DTYPE, self.len()) + .child(0, &DTYPE_BOOL_NONNULL, self.len()) .vortex_expect("PrimitiveArray: validity child") }) } diff --git a/vortex-array/src/array/sparse/canonical.rs b/vortex-array/src/array/sparse/canonical.rs index 7495c515b7..71e09da496 100644 --- a/vortex-array/src/array/sparse/canonical.rs +++ b/vortex-array/src/array/sparse/canonical.rs @@ -1,6 +1,6 @@ use arrow_buffer::{ArrowNativeType, BooleanBuffer}; use vortex_buffer::buffer; -use vortex_dtype::{match_each_native_ptype, DType, NativePType, Nullability, PType}; +use vortex_dtype::{match_each_native_ptype, NativePType, Nullability, PType}; use vortex_error::{VortexError, VortexResult}; use vortex_scalar::Scalar; @@ -18,7 +18,7 @@ impl IntoCanonical for SparseArray { return ConstantArray::new(self.fill_scalar(), self.len()).into_canonical(); } - if matches!(self.dtype(), DType::Bool(_)) { + if self.dtype().is_boolean() { canonicalize_sparse_bools(resolved_patches, &self.fill_scalar()) } else { let ptype = PType::try_from(resolved_patches.values().dtype())?; @@ -94,7 +94,7 @@ mod test { use crate::array::sparse::SparseArray; use crate::array::{BoolArray, PrimitiveArray}; use crate::validity::Validity; - use crate::{ArrayDType, IntoArrayData, IntoCanonical}; + use crate::{primitive_dtype, ArrayDType, IntoArrayData, IntoCanonical}; #[rstest] #[case(Some(true))] @@ -106,7 +106,10 @@ mod test { .into_array(); let sparse_bools = SparseArray::try_new(indices, values, 10, Scalar::from(fill_value)).unwrap(); - assert_eq!(*sparse_bools.dtype(), 
DType::Bool(Nullability::Nullable)); + assert_eq!( + sparse_bools.dtype().as_ref(), + &DType::Bool(Nullability::Nullable) + ); let flat_bools = sparse_bools.into_canonical().unwrap().into_bool().unwrap(); let expected = bool_array_from_nullable_vec( @@ -167,7 +170,7 @@ mod test { SparseArray::try_new(indices, values, 10, Scalar::from(fill_value)).unwrap(); assert_eq!( *sparse_ints.dtype(), - DType::Primitive(PType::I32, Nullability::Nullable) + primitive_dtype!(PType::I32, Nullability::Nullable) ); let flat_ints = sparse_ints diff --git a/vortex-array/src/array/sparse/mod.rs b/vortex-array/src/array/sparse/mod.rs index dfcdcc3824..5c20ecb730 100644 --- a/vortex-array/src/array/sparse/mod.rs +++ b/vortex-array/src/array/sparse/mod.rs @@ -77,7 +77,7 @@ impl SparseArray { indices_offset: usize, fill_value: Scalar, ) -> VortexResult { - if fill_value.dtype() != patches.values().dtype() { + if fill_value.dtype() != patches.values().dtype().as_ref() { vortex_bail!( "fill value, {:?}, should be instance of values dtype, {}", fill_value, @@ -132,7 +132,11 @@ impl SparseArray { #[inline] pub fn fill_scalar(&self) -> Scalar { - Scalar::new(self.dtype().clone(), self.metadata().fill_value.clone()) + // TODO(aduffy): fix clone + Scalar::new( + self.dtype().as_ref().clone(), + self.metadata().fill_value.clone(), + ) } } diff --git a/vortex-array/src/array/struct_/compute.rs b/vortex-array/src/array/struct_/compute.rs index d95788d60e..fcb21957a5 100644 --- a/vortex-array/src/array/struct_/compute.rs +++ b/vortex-array/src/array/struct_/compute.rs @@ -32,7 +32,8 @@ impl ComputeVTable for StructEncoding { impl ScalarAtFn for StructEncoding { fn scalar_at(&self, array: &StructArray, index: usize) -> VortexResult { Ok(Scalar::struct_( - array.dtype().clone(), + // TODO(aduffy): fix clone + array.dtype().as_ref().clone(), array .children() .map(|field| scalar_at(&field, index)) diff --git a/vortex-array/src/array/struct_/mod.rs b/vortex-array/src/array/struct_/mod.rs index 9459db42c7..ef8c83a4f9 100644 --- a/vortex-array/src/array/struct_/mod.rs +++ b/vortex-array/src/array/struct_/mod.rs @@ -1,9 +1,11 @@ use std::fmt::{Debug, Display}; +use std::sync::Arc; use serde::{Deserialize, Serialize}; use vortex_dtype::{DType, Field, FieldName, FieldNames, StructDType}; use vortex_error::{vortex_bail, vortex_err, vortex_panic, VortexExpect as _, VortexResult}; +use crate::dtypes::DTYPE_BOOL_NONNULL; use crate::encoding::ids; use crate::stats::{ArrayStatistics, Stat, StatisticsVTable, StatsSet}; use crate::validity::{LogicalValidity, Validity, ValidityMetadata, ValidityVTable}; @@ -33,7 +35,7 @@ impl StructArray { pub fn validity(&self) -> Validity { self.metadata().validity.to_validity(|| { self.as_ref() - .child(self.nfields(), &Validity::DTYPE, self.len()) + .child(self.nfields(), &DTYPE_BOOL_NONNULL, self.len()) .vortex_expect("StructArray: validity child") }) } @@ -67,7 +69,8 @@ impl StructArray { } } - let field_dtypes: Vec<_> = fields.iter().map(|d| d.dtype()).cloned().collect(); + // TODO(aduffy): fix clones. 
+ let field_dtypes: Vec = fields.iter().map(|d| d.dtype().as_ref()).cloned().collect(); let validity_metadata = validity.to_metadata(length)?; @@ -78,7 +81,10 @@ impl StructArray { } Self::try_from_parts( - DType::Struct(StructDType::new(names, field_dtypes), nullability), + Arc::new(DType::Struct( + StructDType::new(names, field_dtypes), + nullability, + )), length, StructMetadata { validity: validity_metadata, @@ -153,10 +159,13 @@ impl StructArrayTrait for StructArray { self.as_ref() .child( idx, - &field_info - .dtype - .value() - .vortex_expect("FieldInfo could not access dtype"), + // TODO(aduffy): fix clones. + &Arc::new( + field_info + .dtype + .value() + .vortex_expect("FieldInfo could not access dtype"), + ), self.len(), ) .unwrap_or_else(|e| { diff --git a/vortex-array/src/array/varbin/arrow.rs b/vortex-array/src/array/varbin/arrow.rs index dded05b226..70d0fd45de 100644 --- a/vortex-array/src/array/varbin/arrow.rs +++ b/vortex-array/src/array/varbin/arrow.rs @@ -34,7 +34,7 @@ pub(crate) fn varbin_to_arrow(varbin_array: &VarBinArray) -> VortexResult match offsets.ptype() { PType::I32 => Arc::new(unsafe { BinaryArray::new_unchecked( diff --git a/vortex-array/src/array/varbin/builder.rs b/vortex-array/src/array/varbin/builder.rs index d1e49bfced..c257f7a8e2 100644 --- a/vortex-array/src/array/varbin/builder.rs +++ b/vortex-array/src/array/varbin/builder.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow_buffer::NullBufferBuilder; use num_traits::{AsPrimitive, PrimInt}; use vortex_buffer::BufferMut; @@ -89,8 +91,14 @@ impl VarBinBuilder { Validity::NonNullable }; - VarBinArray::try_new(offsets.into_array(), self.data.freeze(), dtype, validity) - .vortex_expect("Unexpected error while building VarBinArray") + // TODO(aduffy): remove extra Arc::new() wrapper, instead receive an Arc in parameters. + VarBinArray::try_new( + offsets.into_array(), + self.data.freeze(), + Arc::new(dtype), + validity, + ) + .vortex_expect("Unexpected error while building VarBinArray") } } diff --git a/vortex-array/src/array/varbin/canonical.rs b/vortex-array/src/array/varbin/canonical.rs index 3879b1931b..175a23f5e5 100644 --- a/vortex-array/src/array/varbin/canonical.rs +++ b/vortex-array/src/array/varbin/canonical.rs @@ -13,7 +13,7 @@ impl IntoCanonical for VarBinArray { fn into_canonical(self) -> VortexResult { let nullable = self.dtype().is_nullable(); let array_ref = varbin_to_arrow(&self)?; - let array = match self.dtype() { + let array = match self.dtype().as_ref() { DType::Utf8(_) => arrow_cast::cast(array_ref.as_ref(), &DataType::Utf8View)?, DType::Binary(_) => arrow_cast::cast(array_ref.as_ref(), &DataType::BinaryView)?, @@ -63,7 +63,7 @@ mod test { let varbin = varbin.finish(dtype.clone()); let canonical = varbin.into_canonical().unwrap().into_varbinview().unwrap(); - assert_eq!(canonical.dtype(), &dtype); + assert_eq!(canonical.dtype().as_ref(), &dtype); assert!(!canonical.is_valid(0)); assert!(!canonical.is_valid(1)); diff --git a/vortex-array/src/array/varbin/compute/filter.rs b/vortex-array/src/array/varbin/compute/filter.rs index 671aa075c8..ffdadc7e7b 100644 --- a/vortex-array/src/array/varbin/compute/filter.rs +++ b/vortex-array/src/array/varbin/compute/filter.rs @@ -34,7 +34,8 @@ fn filter_select_var_bin_by_slice( let offsets = values.offsets().into_primitive()?; match_each_integer_ptype!(offsets.ptype(), |$O| { filter_select_var_bin_by_slice_primitive_offset( - values.dtype().clone(), + // TODO(aduffy): fix clones. 
+ values.dtype().as_ref().clone(), offsets.as_slice::<$O>(), values.bytes().as_slice(), mask, @@ -134,7 +135,8 @@ fn filter_select_var_bin_by_index( let offsets = values.offsets().into_primitive()?; match_each_integer_ptype!(offsets.ptype(), |$O| { filter_select_var_bin_by_index_primitive_offset( - values.dtype().clone(), + // TODO(aduffy): fix clones. + values.dtype().as_ref().clone(), offsets.as_slice::<$O>(), values.bytes().as_slice(), mask, @@ -186,6 +188,7 @@ mod test { use crate::array::varbin::VarBinArray; use crate::array::BoolArray; use crate::compute::{scalar_at, FilterMask}; + use crate::dtypes::DTYPE_STRING_NULL; use crate::validity::Validity; use crate::ToArrayData; @@ -255,7 +258,8 @@ mod test { let offsets = PrimitiveArray::from_iter([0, 3, 6, 11, 15, 19, 22]).to_array(); let validity = Validity::Array(BoolArray::from_iter([true, false, true, true, true, true]).to_array()); - let arr = VarBinArray::try_new(offsets, bytes, DType::Utf8(Nullable), validity).unwrap(); + let arr = + VarBinArray::try_new(offsets, bytes, DTYPE_STRING_NULL.clone(), validity).unwrap(); let filter = FilterMask::from_iter([true, true, true, false, true, true]); let buf = filter_select_var_bin_by_slice(&arr, &filter, 5) diff --git a/vortex-array/src/array/varbin/compute/take.rs b/vortex-array/src/array/varbin/compute/take.rs index d7a1920c67..a6abcf46a7 100644 --- a/vortex-array/src/array/varbin/compute/take.rs +++ b/vortex-array/src/array/varbin/compute/take.rs @@ -19,7 +19,8 @@ impl TakeFn for VarBinEncoding { match_each_integer_ptype!(offsets.ptype(), |$O| { match_each_integer_ptype!(indices.ptype(), |$I| { Ok(take( - array.dtype().clone(), + // TODO(aduffy): fix cloning. + array.dtype().as_ref().clone(), offsets.as_slice::<$O>(), data.as_slice(), indices.as_slice::<$I>(), diff --git a/vortex-array/src/array/varbin/mod.rs b/vortex-array/src/array/varbin/mod.rs index fe7f3ee136..51712b6dbd 100644 --- a/vortex-array/src/array/varbin/mod.rs +++ b/vortex-array/src/array/varbin/mod.rs @@ -14,6 +14,7 @@ use vortex_scalar::Scalar; use crate::array::primitive::PrimitiveArray; use crate::array::varbin::builder::VarBinBuilder; use crate::compute::scalar_at; +use crate::dtypes::{primitive_dtype_ref, DTYPE_BOOL_NONNULL}; use crate::encoding::ids; use crate::stats::StatsSet; use crate::validity::{Validity, ValidityMetadata}; @@ -48,7 +49,7 @@ impl VarBinArray { pub fn try_new( offsets: ArrayData, bytes: ByteBuffer, - dtype: DType, + dtype: Arc, validity: Validity, ) -> VortexResult { if !offsets.dtype().is_int() || offsets.dtype().is_nullable() { @@ -56,7 +57,7 @@ impl VarBinArray { } let offsets_ptype = PType::try_from(offsets.dtype()).vortex_unwrap(); - if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) { + if !matches!(dtype.as_ref(), DType::Binary(_) | DType::Utf8(_)) { vortex_bail!(MismatchedTypes: "utf8 or binary", dtype); } if dtype.is_nullable() == (validity == Validity::NonNullable) { @@ -96,7 +97,7 @@ impl VarBinArray { self.as_ref() .child( 0, - &DType::Primitive(self.metadata().offsets_ptype, Nullability::NonNullable), + primitive_dtype_ref!(self.metadata().offsets_ptype, Nullability::NonNullable), self.len() + 1, ) .vortex_expect("Missing offsets in VarBinArray") @@ -105,7 +106,7 @@ impl VarBinArray { pub fn validity(&self) -> Validity { self.metadata().validity.to_validity(|| { self.as_ref() - .child(1, &Validity::DTYPE, self.len()) + .child(1, &DTYPE_BOOL_NONNULL, self.len()) .vortex_expect("VarBinArray: validity child") }) } @@ -207,7 +208,7 @@ impl VarBinArray { /// Consumes self, returning a 
tuple containing the `DType`, the `bytes` array, /// the `offsets` array, and the `validity`. - pub fn into_parts(self) -> (DType, ByteBuffer, ArrayData, Validity) { + pub fn into_parts(self) -> (Arc, ByteBuffer, ArrayData, Validity) { ( self.dtype().clone(), self.bytes(), @@ -281,11 +282,11 @@ pub fn varbin_scalar(value: ByteBuffer, dtype: &DType) -> Scalar { mod test { use rstest::{fixture, rstest}; use vortex_buffer::Buffer; - use vortex_dtype::{DType, Nullability}; use crate::array::primitive::PrimitiveArray; use crate::array::varbin::VarBinArray; use crate::compute::{scalar_at, slice}; + use crate::dtypes::DTYPE_STRING_NONNULL; use crate::validity::Validity; use crate::{ArrayData, IntoArrayData}; @@ -297,7 +298,7 @@ mod test { VarBinArray::try_new( offsets.into_array(), values, - DType::Utf8(Nullability::NonNullable), + DTYPE_STRING_NONNULL.clone(), Validity::NonNullable, ) .unwrap() diff --git a/vortex-array/src/array/varbinview/compute/mod.rs b/vortex-array/src/array/varbinview/compute/mod.rs index 6336d5491a..376e4af030 100644 --- a/vortex-array/src/array/varbinview/compute/mod.rs +++ b/vortex-array/src/array/varbinview/compute/mod.rs @@ -45,7 +45,8 @@ impl SliceFn for VarBinViewEncoding { (0..array.metadata().buffer_lens.len()) .map(|i| array.buffer(i)) .collect::>(), - array.dtype().clone(), + // TODO(aduffy): fix cloning + array.dtype().as_ref().clone(), array.validity().slice(start, stop)?, )? .into_array()) @@ -66,7 +67,8 @@ impl TakeFn for VarBinViewEncoding { Ok(VarBinViewArray::try_new( views_buffer, array.buffers().collect(), - array.dtype().clone(), + // TODO(aduffy): fix cloning. + array.dtype().as_ref().clone(), validity, )? .into_array()) @@ -88,7 +90,8 @@ impl TakeFn for VarBinViewEncoding { Ok(VarBinViewArray::try_new( views_buffer, array.buffers().collect(), - array.dtype().clone(), + // TODO(aduffy): fix cloning. + array.dtype().as_ref().clone(), validity, )? .into_array()) diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index 99a7a4677b..0387722cb7 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -16,6 +16,7 @@ use vortex_error::{ }; use crate::arrow::FromArrowArray; +use crate::dtypes::DTYPE_BOOL_NONNULL; use crate::encoding::ids; use crate::stats::StatsSet; use crate::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata, ValidityVTable}; @@ -243,7 +244,8 @@ impl VarBinViewArray { Self::try_from(ArrayData::try_new_owned( &VarBinViewEncoding, - dtype, + // TODO(aduffy): fix cloning. + Arc::new(dtype), array_len, Arc::new(metadata), array_buffers.into(), @@ -303,7 +305,7 @@ impl VarBinViewArray { pub fn validity(&self) -> Validity { self.metadata().validity.to_validity(|| { self.as_ref() - .child(0, &Validity::DTYPE, self.len()) + .child(0, &DTYPE_BOOL_NONNULL, self.len()) .vortex_expect("VarBinViewArray: validity child") }) } @@ -462,7 +464,7 @@ pub(crate) fn varbinview_as_arrow(var_bin_view: &VarBinViewArray) -> ArrayRef { .collect::>(); // Switch on Arrow DType. 
- match var_bin_view.dtype() { + match var_bin_view.dtype().as_ref() { DType::Binary(_) => Arc::new(unsafe { BinaryViewArray::new_unchecked( ScalarBuffer::::from(views.into_byte_buffer().into_arrow_buffer()), diff --git a/vortex-array/src/arrow/array.rs b/vortex-array/src/arrow/array.rs index 7f268eaa57..df4e1d55b2 100644 --- a/vortex-array/src/arrow/array.rs +++ b/vortex-array/src/arrow/array.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow_array::array::{ Array as ArrowArray, ArrayRef as ArrowArrayRef, ArrowPrimitiveType, BooleanArray as ArrowBooleanArray, GenericByteArray, NullArray as ArrowNullArray, @@ -124,7 +126,8 @@ where VarBinArray::try_new( value.offsets().clone().into(), ByteBuffer::from_arrow_buffer(value.values().clone(), Alignment::of::()), - dtype, + // TODO(aduffy): see if we can elide this extra Arc::new(). + Arc::new(dtype), nulls(value.nulls(), nullable), ) .vortex_expect("Failed to convert Arrow GenericByteArray to Vortex VarBinArray") diff --git a/vortex-array/src/builders/struct_.rs b/vortex-array/src/builders/struct_.rs index 7ff652982b..ba6abfc622 100644 --- a/vortex-array/src/builders/struct_.rs +++ b/vortex-array/src/builders/struct_.rs @@ -140,6 +140,6 @@ mod tests { let struct_ = builder.finish().unwrap(); assert_eq!(struct_.len(), 1); - assert_eq!(struct_.dtype(), &dtype); + assert_eq!(struct_.dtype().as_ref(), &dtype); } } diff --git a/vortex-array/src/compute/binary_numeric.rs b/vortex-array/src/compute/binary_numeric.rs index d092298f31..80c3fbd4d9 100644 --- a/vortex-array/src/compute/binary_numeric.rs +++ b/vortex-array/src/compute/binary_numeric.rs @@ -104,9 +104,10 @@ pub fn binary_numeric( rhs.len() ) } - if !matches!(lhs.dtype(), DType::Primitive(_, _)) - || !matches!(rhs.dtype(), DType::Primitive(_, _)) - || lhs.dtype() != rhs.dtype() + + if !matches!(lhs.dtype().as_ref(), DType::Primitive(_, _)) + || !matches!(rhs.dtype().as_ref(), DType::Primitive(_, _)) + || lhs.dtype().as_ref() != rhs.dtype().as_ref() { vortex_bail!( "Numeric operations are only supported on two arrays sharing the same primitive-type: {} {}", @@ -180,7 +181,7 @@ fn check_numeric_result(result: &ArrayData, lhs: &ArrayData, rhs: &ArrayData) { rhs.encoding().id() ); debug_assert_eq!( - result.dtype(), + result.dtype().as_ref(), &DType::Primitive( PType::try_from(lhs.dtype()) .vortex_expect("Numeric operation DType failed to convert to PType"), diff --git a/vortex-array/src/compute/boolean.rs b/vortex-array/src/compute/boolean.rs index c3ea27a0dd..c562663671 100644 --- a/vortex-array/src/compute/boolean.rs +++ b/vortex-array/src/compute/boolean.rs @@ -111,7 +111,7 @@ pub fn binary_boolean( lhs.encoding().id() ); debug_assert_eq!( - result.dtype(), + result.dtype().as_ref(), &DType::Bool((lhs.dtype().is_nullable() || rhs.dtype().is_nullable()).into()), "Boolean operation dtype mismatch {}", lhs.encoding().id() @@ -132,7 +132,7 @@ pub fn binary_boolean( rhs.encoding().id() ); debug_assert_eq!( - result.dtype(), + result.dtype().as_ref(), &DType::Bool((lhs.dtype().is_nullable() || rhs.dtype().is_nullable()).into()), "Boolean operation dtype mismatch {}", rhs.encoding().id() diff --git a/vortex-array/src/compute/cast.rs b/vortex-array/src/compute/cast.rs index 3e2f1bae52..86df70a7ee 100644 --- a/vortex-array/src/compute/cast.rs +++ b/vortex-array/src/compute/cast.rs @@ -23,8 +23,9 @@ where /// /// Some array support the ability to narrow or upcast. pub fn try_cast(array: impl AsRef, dtype: &DType) -> VortexResult { + // TODO(aduffy): have this receive an &Arc instead. 
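+ // Note (editorial, not from the original patch): `ArrayDType::dtype` now returns an
+ // `&Arc<DType>`, so the comparisons below go through `as_ref()` to match the borrowed
+ // `&DType` parameter.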
let array = array.as_ref(); - if array.dtype() == dtype { + if array.dtype().as_ref() == dtype { return Ok(array.clone()); } @@ -37,7 +38,7 @@ pub fn try_cast(array: impl AsRef, dtype: &DType) -> VortexResult, fill_value: Scalar) -> VortexResu array.encoding().id() ); debug_assert_eq!( - filled.dtype(), + filled.dtype().as_ref(), &array.dtype().with_nullability(fill_value_nullability), "FillNull dtype mismatch {}", array.encoding().id() diff --git a/vortex-array/src/compute/filter.rs b/vortex-array/src/compute/filter.rs index 61738210bf..4a50b63ab9 100644 --- a/vortex-array/src/compute/filter.rs +++ b/vortex-array/src/compute/filter.rs @@ -532,7 +532,7 @@ impl TryFrom for FilterMask { type Error = VortexError; fn try_from(array: ArrayData) -> Result { - if array.dtype() != &DType::Bool(Nullability::NonNullable) { + if array.dtype().as_ref() != &DType::Bool(Nullability::NonNullable) { vortex_bail!( "mask must be non-nullable bool, has dtype {}", array.dtype(), diff --git a/vortex-array/src/compute/invert.rs b/vortex-array/src/compute/invert.rs index 9aaa25cb95..e1904557c3 100644 --- a/vortex-array/src/compute/invert.rs +++ b/vortex-array/src/compute/invert.rs @@ -22,7 +22,7 @@ where /// Logically invert a boolean array. pub fn invert(array: &ArrayData) -> VortexResult { - if !matches!(array.dtype(), DType::Bool(..)) { + if !matches!(array.dtype().as_ref(), DType::Bool(..)) { vortex_bail!("Expected boolean array, got {}", array.dtype()); } diff --git a/vortex-array/src/compute/like.rs b/vortex-array/src/compute/like.rs index 540919f510..0976c1299d 100644 --- a/vortex-array/src/compute/like.rs +++ b/vortex-array/src/compute/like.rs @@ -47,10 +47,10 @@ pub fn like( pattern: &ArrayData, options: LikeOptions, ) -> VortexResult { - if !matches!(array.dtype(), DType::Utf8(..)) { + if !matches!(array.dtype().as_ref(), DType::Utf8(..)) { vortex_bail!("Expected utf8 array, got {}", array.dtype()); } - if !matches!(pattern.dtype(), DType::Utf8(..)) { + if !matches!(pattern.dtype().as_ref(), DType::Utf8(..)) { vortex_bail!("Expected utf8 pattern, got {}", array.dtype()); } @@ -76,7 +76,7 @@ fn check_like_result(result: &ArrayData, array: &ArrayData, pattern: &ArrayData) array.encoding().id() ); debug_assert_eq!( - result.dtype(), + result.dtype().as_ref(), &DType::Bool((array.dtype().is_nullable() || pattern.dtype().is_nullable()).into()), "Like dtype mismatch {}", array.encoding().id() diff --git a/vortex-array/src/compute/scalar_at.rs b/vortex-array/src/compute/scalar_at.rs index b9afb78bfc..5fd13e8208 100644 --- a/vortex-array/src/compute/scalar_at.rs +++ b/vortex-array/src/compute/scalar_at.rs @@ -30,7 +30,8 @@ pub fn scalar_at(array: impl AsRef, index: usize) -> VortexResult, index: usize) -> VortexResult for ArrayData { impl ArrayData { pub fn try_new_owned( encoding: EncodingRef, - dtype: DType, + dtype: Arc, len: usize, metadata: Arc, buffers: Box<[ByteBuffer]>, @@ -81,7 +81,7 @@ impl ArrayData { pub fn try_new_viewed( ctx: ContextRef, - dtype: DType, + dtype: Arc, len: usize, flatbuffer: FlatBuffer, flatbuffer_init: F, @@ -126,7 +126,7 @@ impl ArrayData { // Sanity check that the encoding implements the correct array trait debug_assert!( - match array.dtype() { + match array.dtype().as_ref() { DType::Null => array.as_null_array().is_some(), DType::Bool(_) => array.as_bool_array().is_some(), DType::Primitive(..) 
=> array.as_primitive_array().is_some(), @@ -203,11 +203,11 @@ impl ArrayData { .then(|| scalar_at(self, 0).vortex_expect("expected a scalar value")) } - pub fn child<'a>(&'a self, idx: usize, dtype: &'a DType, len: usize) -> VortexResult { + pub fn child(&self, idx: usize, dtype: &Arc, len: usize) -> VortexResult { match self.0.as_ref() { InnerArrayData::Owned(d) => d.child(idx, dtype, len).cloned(), InnerArrayData::Viewed(v) => v - .child(idx, dtype, len) + .child(idx, Arc::clone(dtype), len) .map(|view| ArrayData(Arc::new(InnerArrayData::Viewed(view)))), } } @@ -362,12 +362,14 @@ impl ArrayData { let iter = ChunkedArray::maybe_from(self.clone()) .map(|chunked| ArrayDataIterator::Chunked(chunked, 0)) .unwrap_or_else(|| ArrayDataIterator::Single(Some(self))); - ArrayIteratorAdapter::new(dtype, iter) + // TODO(aduffy): fix cloning. + ArrayIteratorAdapter::new(dtype.as_ref().clone(), iter) } pub fn into_array_stream(self) -> impl ArrayStream { ArrayStreamAdapter::new( - self.dtype().clone(), + // TODO(aduffy): fix cloning. + self.dtype().as_ref().clone(), futures_util::stream::iter(self.into_array_iterator()), ) } @@ -431,7 +433,7 @@ impl Display for ArrayData { } impl> ArrayDType for T { - fn dtype(&self) -> &DType { + fn dtype(&self) -> &Arc { match self.as_ref().0.as_ref() { InnerArrayData::Owned(d) => &d.dtype, InnerArrayData::Viewed(v) => &v.dtype, diff --git a/vortex-array/src/data/owned.rs b/vortex-array/src/data/owned.rs index 1052100c30..a80f595db7 100644 --- a/vortex-array/src/data/owned.rs +++ b/vortex-array/src/data/owned.rs @@ -12,7 +12,7 @@ use crate::{ArrayDType, ArrayData, ArrayMetadata}; #[derive(Debug)] pub(super) struct OwnedArrayData { pub(super) encoding: EncodingRef, - pub(super) dtype: DType, + pub(super) dtype: Arc, pub(super) len: usize, pub(super) metadata: Arc, pub(super) buffers: Box<[ByteBuffer]>, @@ -33,7 +33,7 @@ impl OwnedArrayData { // We want to allow these panics because they are indicative of implementation error. 
#[allow(clippy::panic_in_result_fn)] - pub fn child(&self, index: usize, dtype: &DType, len: usize) -> VortexResult<&ArrayData> { + pub fn child(&self, index: usize, dtype: &Arc, len: usize) -> VortexResult<&ArrayData> { match self.children.get(index) { None => vortex_bail!( "ArrayData::child({}): child {index} not found", diff --git a/vortex-array/src/data/statistics.rs b/vortex-array/src/data/statistics.rs index 4e0a139756..99589422fa 100644 --- a/vortex-array/src/data/statistics.rs +++ b/vortex-array/src/data/statistics.rs @@ -28,12 +28,14 @@ impl Statistics for ArrayData { Stat::Max => { let max = v.flatbuffer().stats()?.max(); max.and_then(|v| ScalarValue::try_from(v).ok()) - .map(|v| Scalar::new(self.dtype().clone(), v)) + // TODO(aduffy): fix cloning + .map(|v| Scalar::new(self.dtype().as_ref().clone(), v)) } Stat::Min => { let min = v.flatbuffer().stats()?.min(); min.and_then(|v| ScalarValue::try_from(v).ok()) - .map(|v| Scalar::new(self.dtype().clone(), v)) + // TODO(aduffy): fix cloning + .map(|v| Scalar::new(self.dtype().as_ref().clone(), v)) } Stat::IsConstant => v.flatbuffer().stats()?.is_constant().map(bool::into), Stat::IsSorted => v.flatbuffer().stats()?.is_sorted().map(bool::into), diff --git a/vortex-array/src/data/viewed.rs b/vortex-array/src/data/viewed.rs index 0d189eae4d..c19e9de324 100644 --- a/vortex-array/src/data/viewed.rs +++ b/vortex-array/src/data/viewed.rs @@ -14,7 +14,7 @@ use crate::{flatbuffers as fb, ArrayMetadata, ContextRef}; /// Zero-copy view over flatbuffer-encoded array data, created without eager serialization. pub(super) struct ViewedArrayData { pub(super) encoding: EncodingRef, - pub(super) dtype: DType, + pub(super) dtype: Arc, pub(super) len: usize, pub(super) metadata: Arc, pub(super) flatbuffer: FlatBuffer, @@ -46,7 +46,7 @@ impl ViewedArrayData { } // TODO(ngates): should we separate self and DType lifetimes? Should DType be cloned? - pub fn child(&self, idx: usize, dtype: &DType, len: usize) -> VortexResult { + pub fn child(&self, idx: usize, dtype: Arc, len: usize) -> VortexResult { let child = self .array_child(idx) .ok_or_else(|| vortex_err!("ArrayView: array_child({idx}) not found"))?; @@ -67,7 +67,7 @@ impl ViewedArrayData { Ok(Self { encoding, - dtype: dtype.clone(), + dtype, len, metadata, flatbuffer: self.flatbuffer.clone(), diff --git a/vortex-array/src/dtypes.rs b/vortex-array/src/dtypes.rs new file mode 100644 index 0000000000..d97e4e81d9 --- /dev/null +++ b/vortex-array/src/dtypes.rs @@ -0,0 +1,236 @@ +//! Pre-created DTypes. +//! +//! Many DTypes are reused throughout the codebase. We statically enumerate the common DTypes, and +//! use them in place of direct construction. +//! +//! While a `DType` is at the time of writing this, 40 bytes, an Arc is only 8 bytes, +//! and can be shared/copied without any extra allocations. 
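+//!
+//! Example (an editorial sketch, not part of the original patch; it only assumes the
+//! statics and the `primitive_dtype!`/`bool_dtype!` macros defined in this module):
+//!
+//! ```ignore
+//! use vortex_dtype::{Nullability, PType};
+//!
+//! // Cloning a pre-created handle only bumps an Arc refcount; no DType is rebuilt.
+//! let bool_dt = vortex_array::dtypes::DTYPE_BOOL_NONNULL.clone();
+//! // The macros map a (PType, Nullability) pair onto the matching static.
+//! let u64_dt = vortex_array::primitive_dtype!(PType::U64, Nullability::NonNullable);
+//! let nullable_bool = vortex_array::bool_dtype!(Nullability::Nullable);
+//! ```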
+ +use std::sync::{Arc, LazyLock}; + +use vortex_dtype::{DType, Nullability, PType}; + +pub static DTYPE_BOOL_NONNULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Bool(Nullability::NonNullable))); +pub static DTYPE_BOOL_NULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Bool(Nullability::Nullable))); + +pub static DTYPE_U8_NONNULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable))); +pub static DTYPE_U8_NULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::U8, Nullability::Nullable))); + +pub static DTYPE_U16_NONNULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::U16, Nullability::NonNullable))); +pub static DTYPE_U16_NULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::U16, Nullability::Nullable))); + +pub static DTYPE_U32_NONNULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::U32, Nullability::NonNullable))); +pub static DTYPE_U32_NULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::U32, Nullability::Nullable))); + +pub static DTYPE_U64_NONNULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::U64, Nullability::NonNullable))); +pub static DTYPE_U64_NULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::U64, Nullability::Nullable))); + +pub static DTYPE_I8_NONNULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::I8, Nullability::NonNullable))); +pub static DTYPE_I8_NULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::I8, Nullability::Nullable))); + +pub static DTYPE_I16_NONNULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::I16, Nullability::NonNullable))); +pub static DTYPE_I16_NULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::I16, Nullability::Nullable))); + +pub static DTYPE_I32_NONNULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::I32, Nullability::NonNullable))); +pub static DTYPE_I32_NULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::I32, Nullability::Nullable))); + +pub static DTYPE_I64_NONNULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::I64, Nullability::NonNullable))); +pub static DTYPE_I64_NULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::I64, Nullability::Nullable))); + +pub static DTYPE_F16_NONNULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::F16, Nullability::NonNullable))); +pub static DTYPE_F16_NULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::F16, Nullability::Nullable))); + +pub static DTYPE_F32_NONNULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::F32, Nullability::NonNullable))); +pub static DTYPE_F32_NULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::F32, Nullability::Nullable))); + +pub static DTYPE_F64_NONNULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::F64, Nullability::NonNullable))); +pub static DTYPE_F64_NULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Primitive(PType::F64, Nullability::Nullable))); + +pub static DTYPE_STRING_NONNULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Utf8(Nullability::NonNullable))); +pub static DTYPE_STRING_NULL: LazyLock> = + LazyLock::new(|| Arc::new(DType::Utf8(Nullability::Nullable))); + +#[macro_export] +macro_rules! 
primitive_dtype { + ($ptype:expr, $nullability:expr) => { + match ($ptype, $nullability) { + (vortex_dtype::PType::U8, vortex_dtype::Nullability::NonNullable) => { + $crate::dtypes::DTYPE_U8_NONNULL.clone() + } + (vortex_dtype::PType::U8, vortex_dtype::Nullability::Nullable) => { + $crate::dtypes::DTYPE_U8_NULL.clone() + } + (vortex_dtype::PType::U16, vortex_dtype::Nullability::NonNullable) => { + $crate::dtypes::DTYPE_U16_NONNULL.clone() + } + (vortex_dtype::PType::U16, vortex_dtype::Nullability::Nullable) => { + $crate::dtypes::DTYPE_U16_NULL.clone() + } + (vortex_dtype::PType::U32, vortex_dtype::Nullability::NonNullable) => { + $crate::dtypes::DTYPE_U32_NONNULL.clone() + } + (vortex_dtype::PType::U32, vortex_dtype::Nullability::Nullable) => { + $crate::dtypes::DTYPE_U32_NULL.clone() + } + (vortex_dtype::PType::U64, vortex_dtype::Nullability::NonNullable) => { + $crate::dtypes::DTYPE_U64_NONNULL.clone() + } + (vortex_dtype::PType::U64, vortex_dtype::Nullability::Nullable) => { + $crate::dtypes::DTYPE_U64_NULL.clone() + } + (vortex_dtype::PType::I8, vortex_dtype::Nullability::NonNullable) => { + $crate::dtypes::DTYPE_I8_NONNULL.clone() + } + (vortex_dtype::PType::I8, vortex_dtype::Nullability::Nullable) => { + $crate::dtypes::DTYPE_I8_NULL.clone() + } + (vortex_dtype::PType::I16, vortex_dtype::Nullability::NonNullable) => { + $crate::dtypes::DTYPE_I16_NONNULL.clone() + } + (vortex_dtype::PType::I16, vortex_dtype::Nullability::Nullable) => { + $crate::dtypes::DTYPE_I16_NULL.clone() + } + (vortex_dtype::PType::I32, vortex_dtype::Nullability::NonNullable) => { + $crate::dtypes::DTYPE_I32_NONNULL.clone() + } + (vortex_dtype::PType::I32, vortex_dtype::Nullability::Nullable) => { + $crate::dtypes::DTYPE_I32_NULL.clone() + } + (vortex_dtype::PType::I64, vortex_dtype::Nullability::NonNullable) => { + $crate::dtypes::DTYPE_I64_NONNULL.clone() + } + (vortex_dtype::PType::I64, vortex_dtype::Nullability::Nullable) => { + $crate::dtypes::DTYPE_I64_NULL.clone() + } + (vortex_dtype::PType::F16, vortex_dtype::Nullability::NonNullable) => { + $crate::dtypes::DTYPE_F16_NONNULL.clone() + } + (vortex_dtype::PType::F16, vortex_dtype::Nullability::Nullable) => { + $crate::dtypes::DTYPE_F16_NULL.clone() + } + (vortex_dtype::PType::F32, vortex_dtype::Nullability::NonNullable) => { + $crate::dtypes::DTYPE_F32_NONNULL.clone() + } + (vortex_dtype::PType::F32, vortex_dtype::Nullability::Nullable) => { + $crate::dtypes::DTYPE_F32_NULL.clone() + } + (vortex_dtype::PType::F64, vortex_dtype::Nullability::NonNullable) => { + $crate::dtypes::DTYPE_F64_NONNULL.clone() + } + (vortex_dtype::PType::F64, vortex_dtype::Nullability::Nullable) => { + $crate::dtypes::DTYPE_F64_NULL.clone() + } + } + }; +} + +#[macro_export] +macro_rules! 
primitive_dtype_ref { + ($ptype:expr, $nullability:expr) => { + match ($ptype, $nullability) { + (vortex_dtype::PType::U8, vortex_dtype::Nullability::NonNullable) => { + &$crate::dtypes::DTYPE_U8_NONNULL + } + (vortex_dtype::PType::U8, vortex_dtype::Nullability::Nullable) => { + &$crate::dtypes::DTYPE_U8_NULL + } + (vortex_dtype::PType::U16, vortex_dtype::Nullability::NonNullable) => { + &$crate::dtypes::DTYPE_U16_NONNULL + } + (vortex_dtype::PType::U16, vortex_dtype::Nullability::Nullable) => { + &$crate::dtypes::DTYPE_U16_NULL + } + (vortex_dtype::PType::U32, vortex_dtype::Nullability::NonNullable) => { + &$crate::dtypes::DTYPE_U32_NONNULL + } + (vortex_dtype::PType::U32, vortex_dtype::Nullability::Nullable) => { + &$crate::dtypes::DTYPE_U32_NULL + } + (vortex_dtype::PType::U64, vortex_dtype::Nullability::NonNullable) => { + &$crate::dtypes::DTYPE_U64_NONNULL + } + (vortex_dtype::PType::U64, vortex_dtype::Nullability::Nullable) => { + &$crate::dtypes::DTYPE_U64_NULL + } + (vortex_dtype::PType::I8, vortex_dtype::Nullability::NonNullable) => { + &$crate::dtypes::DTYPE_I8_NONNULL + } + (vortex_dtype::PType::I8, vortex_dtype::Nullability::Nullable) => { + &$crate::dtypes::DTYPE_I8_NULL + } + (vortex_dtype::PType::I16, vortex_dtype::Nullability::NonNullable) => { + &$crate::dtypes::DTYPE_I16_NONNULL + } + (vortex_dtype::PType::I16, vortex_dtype::Nullability::Nullable) => { + &$crate::dtypes::DTYPE_I16_NULL + } + (vortex_dtype::PType::I32, vortex_dtype::Nullability::NonNullable) => { + &$crate::dtypes::DTYPE_I32_NONNULL + } + (vortex_dtype::PType::I32, vortex_dtype::Nullability::Nullable) => { + &$crate::dtypes::DTYPE_I32_NULL + } + (vortex_dtype::PType::I64, vortex_dtype::Nullability::NonNullable) => { + &$crate::dtypes::DTYPE_I64_NONNULL + } + (vortex_dtype::PType::I64, vortex_dtype::Nullability::Nullable) => { + &$crate::dtypes::DTYPE_I64_NULL + } + (vortex_dtype::PType::F16, vortex_dtype::Nullability::NonNullable) => { + &$crate::dtypes::DTYPE_F16_NONNULL + } + (vortex_dtype::PType::F16, vortex_dtype::Nullability::Nullable) => { + &$crate::dtypes::DTYPE_F16_NULL + } + (vortex_dtype::PType::F32, vortex_dtype::Nullability::NonNullable) => { + &$crate::dtypes::DTYPE_F32_NONNULL + } + (vortex_dtype::PType::F32, vortex_dtype::Nullability::Nullable) => { + &$crate::dtypes::DTYPE_F32_NULL + } + (vortex_dtype::PType::F64, vortex_dtype::Nullability::NonNullable) => { + &$crate::dtypes::DTYPE_F64_NONNULL + } + (vortex_dtype::PType::F64, vortex_dtype::Nullability::Nullable) => { + &$crate::dtypes::DTYPE_F64_NULL + } + } + }; +} + +#[macro_export] +macro_rules! bool_dtype { + ($nullability:expr) => { + match ($nullability) { + vortex_dtype::Nullability::NonNullable => $crate::dtypes::DTYPE_BOOL_NONNULL.clone(), + vortex_dtype::Nullability::Nullable => $crate::dtypes::DTYPE_BOOL_NULL.clone(), + } + }; +} + +pub use {bool_dtype, primitive_dtype, primitive_dtype_ref}; diff --git a/vortex-array/src/iter/ext.rs b/vortex-array/src/iter/ext.rs index fe9ae94ff1..d8889974de 100644 --- a/vortex-array/src/iter/ext.rs +++ b/vortex-array/src/iter/ext.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use itertools::Itertools; use vortex_error::VortexResult; @@ -26,7 +28,8 @@ pub trait ArrayIteratorExt: ArrayIterator { if chunks.len() == 1 { Ok(chunks.remove(0)) } else { - Ok(ChunkedArray::try_new(chunks, dtype)?.into_array()) + // TODO(aduffy): fix cloning. 
+ Ok(ChunkedArray::try_new(chunks, Arc::new(dtype))?.into_array()) } } } diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs index bc64075527..4e61db7068 100644 --- a/vortex-array/src/lib.rs +++ b/vortex-array/src/lib.rs @@ -12,6 +12,8 @@ //! arrays can be [canonicalized](Canonical) into for ease of access in compute functions. //! +use std::sync::Arc; + pub use canonical::*; pub use children::*; pub use context::*; @@ -36,6 +38,7 @@ pub mod compress; pub mod compute; mod context; mod data; +pub mod dtypes; pub mod encoding; pub mod iter; mod macros; @@ -106,7 +109,7 @@ pub trait ArrayTrait: pub trait ArrayDType { // TODO(ngates): move into ArrayTrait? - fn dtype(&self) -> &DType; + fn dtype(&self) -> &Arc<DType>; } pub trait ArrayLen { diff --git a/vortex-array/src/macros.rs b/vortex-array/src/macros.rs index 78c8564c87..2c92472b88 100644 --- a/vortex-array/src/macros.rs +++ b/vortex-array/src/macros.rs @@ -43,7 +43,7 @@ macro_rules! impl_encoding { #[allow(dead_code)] fn try_from_parts( - dtype: vortex_dtype::DType, + dtype: std::sync::Arc<vortex_dtype::DType>, len: usize, metadata: [<$Name Metadata>], children: Box<[$crate::ArrayData]>, diff --git a/vortex-array/src/parts.rs b/vortex-array/src/parts.rs index 7153d2267e..ef8ae1078d 100644 --- a/vortex-array/src/parts.rs +++ b/vortex-array/src/parts.rs @@ -1,4 +1,5 @@ use std::fmt::{Debug, Formatter}; +use std::sync::Arc; use flatbuffers::{FlatBufferBuilder, Follow, WIPOffset}; use itertools::Itertools; @@ -70,7 +71,8 @@ impl ArrayParts { pub fn decode(self, ctx: ContextRef, dtype: DType) -> VortexResult<ArrayData> { ArrayData::try_new_viewed( ctx, - dtype, + // TODO(aduffy): fix cloning. + Arc::new(dtype), self.row_count, self.flatbuffer, // SAFETY: ArrayComponents guarantees the buffers are valid. diff --git a/vortex-array/src/patches.rs b/vortex-array/src/patches.rs index 3122e8e3ee..49adf4b98a 100644 --- a/vortex-array/src/patches.rs +++ b/vortex-array/src/patches.rs @@ -1,4 +1,5 @@ use std::fmt::Debug; +use std::sync::Arc; use itertools::Itertools as _; use serde::{Deserialize, Serialize}; @@ -14,6 +15,7 @@ use crate::compute::{ scalar_at, search_sorted, search_sorted_usize, search_sorted_usize_many, slice, sub_scalar, take, FilterMask, SearchResult, SearchSortedSide, }; +use crate::dtypes::primitive_dtype_ref; use crate::stats::{ArrayStatistics, Stat}; use crate::variants::PrimitiveArrayTrait; use crate::{ArrayDType, ArrayData, ArrayLen as _, IntoArrayData, IntoArrayVariant}; @@ -40,12 +42,12 @@ impl PatchesMetadata { } #[inline] - pub fn indices_dtype(&self) -> DType { + pub fn indices_dtype(&self) -> &Arc<DType> { assert!( self.indices_ptype.is_unsigned_int(), "Patch indices must be unsigned integers" ); - DType::Primitive(self.indices_ptype, NonNullable) + primitive_dtype_ref!(self.indices_ptype, NonNullable) } } @@ -100,7 +102,7 @@ impl Patches { self.indices.len() } - pub fn dtype(&self) -> &DType { + pub fn dtype(&self) -> &Arc<DType> { self.values.dtype() } @@ -132,7 +134,7 @@ impl Patches { len ); } - if self.values.dtype() != dtype { + if self.values.dtype().as_ref() != dtype { vortex_bail!( "Patch values dtype {} does not match array dtype {}", self.values.dtype(), diff --git a/vortex-array/src/stream/ext.rs b/vortex-array/src/stream/ext.rs index e6ac71dcd6..5978e49456 100644 --- a/vortex-array/src/stream/ext.rs +++ b/vortex-array/src/stream/ext.rs @@ -1,4 +1,5 @@ use std::future::Future; +use std::sync::Arc; use futures_util::TryStreamExt; use vortex_error::VortexResult; @@ -22,7 +23,8 @@ pub trait ArrayStreamExt: ArrayStream { if chunks.len() == 1 {
Ok(chunks.remove(0)) } else { - Ok(ChunkedArray::try_new(chunks, dtype)?.into_array()) + // TODO(aduffy): fix cloning. + Ok(ChunkedArray::try_new(chunks, Arc::new(dtype))?.into_array()) } } } diff --git a/vortex-array/src/validity.rs b/vortex-array/src/validity.rs index 04f31480b1..75f7029b0b 100644 --- a/vortex-array/src/validity.rs +++ b/vortex-array/src/validity.rs @@ -5,13 +5,14 @@ use std::ops::BitAnd; use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder, NullBuffer}; use serde::{Deserialize, Serialize}; -use vortex_dtype::{DType, Nullability}; +use vortex_dtype::Nullability; use vortex_error::{ vortex_bail, vortex_err, vortex_panic, VortexError, VortexExpect as _, VortexResult, }; use crate::array::{BoolArray, ConstantArray}; use crate::compute::{filter, scalar_at, slice, take, FilterMask}; +use crate::dtypes::DTYPE_BOOL_NONNULL; use crate::encoding::Encoding; use crate::patches::Patches; use crate::stats::ArrayStatistics; @@ -92,7 +93,7 @@ pub enum Validity { impl Validity { /// The [`DType`] of the underlying validity array (if it exists). - pub const DTYPE: DType = DType::Bool(Nullability::NonNullable); + // pub const DTYPE: Arc<DType> = Arc::new(DType::Bool(Nullability::NonNullable)); pub fn to_metadata(&self, length: usize) -> VortexResult<ValidityMetadata> { match self { @@ -442,7 +443,7 @@ pub enum LogicalValidity { impl LogicalValidity { pub fn try_new_from_array(array: ArrayData) -> VortexResult<Self> { - if !matches!(array.dtype(), &Validity::DTYPE) { + if array.dtype().as_ref() != DTYPE_BOOL_NONNULL.as_ref() { vortex_bail!("Expected a non-nullable boolean array"); } diff --git a/vortex-array/src/variants.rs b/vortex-array/src/variants.rs index 17515a6a21..6144eeedf5 100644 --- a/vortex-array/src/variants.rs +++ b/vortex-array/src/variants.rs @@ -111,49 +111,50 @@ where /// Provide functions on type-erased ArrayData to downcast into dtype-specific array variants.
impl ArrayData { pub fn as_null_array(&self) -> Option<&dyn NullArrayTrait> { - matches!(self.dtype(), DType::Null) + matches!(self.dtype().as_ref(), DType::Null) .then(|| self.encoding().as_null_array(self)) .flatten() } pub fn as_bool_array(&self) -> Option<&dyn BoolArrayTrait> { - matches!(self.dtype(), DType::Bool(..)) + self.dtype() + .is_boolean() .then(|| self.encoding().as_bool_array(self)) .flatten() } pub fn as_primitive_array(&self) -> Option<&dyn PrimitiveArrayTrait> { - matches!(self.dtype(), DType::Primitive(..)) + matches!(self.dtype().as_ref(), DType::Primitive(..)) .then(|| self.encoding().as_primitive_array(self)) .flatten() } pub fn as_utf8_array(&self) -> Option<&dyn Utf8ArrayTrait> { - matches!(self.dtype(), DType::Utf8(..)) + matches!(self.dtype().as_ref(), DType::Utf8(..)) .then(|| self.encoding().as_utf8_array(self)) .flatten() } pub fn as_binary_array(&self) -> Option<&dyn BinaryArrayTrait> { - matches!(self.dtype(), DType::Binary(..)) + matches!(self.dtype().as_ref(), DType::Binary(..)) .then(|| self.encoding().as_binary_array(self)) .flatten() } pub fn as_struct_array(&self) -> Option<&dyn StructArrayTrait> { - matches!(self.dtype(), DType::Struct(..)) + matches!(self.dtype().as_ref(), DType::Struct(..)) .then(|| self.encoding().as_struct_array(self)) .flatten() } pub fn as_list_array(&self) -> Option<&dyn ListArrayTrait> { - matches!(self.dtype(), DType::List(..)) + matches!(self.dtype().as_ref(), DType::List(..)) .then(|| self.encoding().as_list_array(self)) .flatten() } pub fn as_extension_array(&self) -> Option<&dyn ExtensionArrayTrait> { - matches!(self.dtype(), DType::Extension(..)) + matches!(self.dtype().as_ref(), DType::Extension(..)) .then(|| self.encoding().as_extension_array(self)) .flatten() } @@ -169,7 +170,7 @@ pub trait PrimitiveArrayTrait: ArrayTrait { /// This is a type that can safely be converted into a `NativePType` for use in /// `maybe_null_slice` or `into_maybe_null_slice`. fn ptype(&self) -> PType { - if let DType::Primitive(ptype, ..) = self.dtype() { + if let DType::Primitive(ptype, ..) = self.dtype().as_ref() { *ptype } else { vortex_panic!("array must have primitive data type"); @@ -183,14 +184,14 @@ pub trait BinaryArrayTrait: ArrayTrait {} pub trait StructArrayTrait: ArrayTrait { fn names(&self) -> &FieldNames { - let DType::Struct(st, _) = self.dtype() else { + let DType::Struct(st, _) = self.dtype().as_ref() else { unreachable!() }; st.names() } fn field_info(&self, field: &Field) -> VortexResult<FieldInfo> { - let DType::Struct(st, _) = self.dtype() else { + let DType::Struct(st, _) = self.dtype().as_ref() else { unreachable!() }; @@ -198,7 +199,7 @@ pub trait StructArrayTrait: ArrayTrait { } fn dtypes(&self) -> Vec<DType> { - let DType::Struct(st, _) = self.dtype() else { + let DType::Struct(st, _) = self.dtype().as_ref() else { unreachable!() }; st.dtypes().collect() @@ -236,7 +237,7 @@ pub trait ListArrayTrait: ArrayTrait {} pub trait ExtensionArrayTrait: ArrayTrait { /// Returns the extension logical [`DType`].
fn ext_dtype(&self) -> &Arc<ExtDType> { - let DType::Extension(ext_dtype) = self.dtype() else { + let DType::Extension(ext_dtype) = self.dtype().as_ref() else { vortex_panic!("Expected ExtDType") }; ext_dtype diff --git a/vortex-dtype/src/extension.rs b/vortex-dtype/src/extension.rs index bdc4382237..495b2daa41 100644 --- a/vortex-dtype/src/extension.rs +++ b/vortex-dtype/src/extension.rs @@ -119,6 +119,7 @@ impl ExtDType { /// Returns the `ExtMetadata` for this extension type, if it exists #[inline] pub fn storage_dtype(&self) -> &DType { + // TODO(aduffy): remove as_ref self.storage_dtype.as_ref() } diff --git a/vortex-dtype/src/ptype.rs b/vortex-dtype/src/ptype.rs index 301a2fdca6..d40e263858 100644 --- a/vortex-dtype/src/ptype.rs +++ b/vortex-dtype/src/ptype.rs @@ -4,6 +4,7 @@ use std::cmp::Ordering; use std::fmt::{Debug, Display, Formatter}; use std::hash::Hash; use std::panic::RefUnwindSafe; +use std::sync::Arc; use num_traits::bounds::UpperBounded; use num_traits::{FromPrimitive, Num, NumCast, ToPrimitive}; @@ -316,6 +317,14 @@ impl TryFrom<&DType> for PType { } } +impl TryFrom<&Arc<DType>> for PType { + type Error = VortexError; + + fn try_from(value: &Arc<DType>) -> VortexResult<Self> { + Self::try_from(value.as_ref()) + } +} + impl From<PType> for &DType { fn from(item: PType) -> Self { // We expand this match statement so that we can return a static reference. diff --git a/vortex-expr/src/get_item.rs b/vortex-expr/src/get_item.rs index 51d69c701d..cfe8130342 100644 --- a/vortex-expr/src/get_item.rs +++ b/vortex-expr/src/get_item.rs @@ -99,7 +99,7 @@ mod tests { let st = test_array(); let get_item = get_item("a", ident()); let item = get_item.evaluate(st.as_ref()).unwrap(); - assert_eq!(item.dtype(), &DType::from(I32)) + assert_eq!(item.dtype().as_ref(), &DType::from(I32)) } #[test] diff --git a/vortex-expr/src/lib.rs b/vortex-expr/src/lib.rs index 9305327ed3..8bf57a6fc3 100644 --- a/vortex-expr/src/lib.rs +++ b/vortex-expr/src/lib.rs @@ -56,7 +56,7 @@ pub trait VortexExpr: Debug + Send + Sync + DynEq + DynHash + Display { /// fn evaluate(&self, batch: &ArrayData) -> VortexResult<ArrayData> { let result = self.unchecked_evaluate(batch)?; - debug_assert_eq!(result.dtype(), &self.return_dtype(batch.dtype())?); + debug_assert_eq!(result.dtype().as_ref(), &self.return_dtype(batch.dtype())?); Ok(result) } @@ -75,7 +75,8 @@ pub trait VortexExpr: Debug + Send + Sync + DynEq + DynHash + Display { fn return_dtype(&self, scope_dtype: &DType) -> VortexResult<DType> { let empty = Canonical::empty(scope_dtype)?.into_array(); self.unchecked_evaluate(&empty) - .map(|array| array.dtype().clone()) + // TODO(aduffy): fix extra clone + .map(|array| array.dtype().as_ref().clone()) } } diff --git a/vortex-file/src/read/stream.rs b/vortex-file/src/read/stream.rs index 921baf8c73..84e64b56b7 100644 --- a/vortex-file/src/read/stream.rs +++ b/vortex-file/src/read/stream.rs @@ -70,7 +70,8 @@ impl VortexReadArrayStream { ) }) } else { - ChunkedArray::try_new(arrays, dtype).map(|e| e.into_array()) + // TODO(aduffy): fix extra clone + ChunkedArray::try_new(arrays, Arc::new(dtype)).map(|e| e.into_array()) } } } diff --git a/vortex-file/src/tests.rs b/vortex-file/src/tests.rs index 308cc4f850..199cc5ff4d 100644 --- a/vortex-file/src/tests.rs +++ b/vortex-file/src/tests.rs @@ -201,9 +201,12 @@ async fn test_read_projection() { .unwrap(); assert_eq!( - array.dtype(), + array.dtype().as_ref(), &DType::Struct( StructDType::new( vec!["strings".into()].into(),
vec![strings_dtype.as_ref().clone()] + ), Nullability::NonNullable, ) ); @@ -233,9 +236,12 @@ async fn test_read_projection() { .unwrap(); assert_eq!( - array.dtype(), + array.dtype().as_ref(), &DType::Struct( - StructDType::new(vec!["strings".into()].into(), vec![strings_dtype.clone()]), + StructDType::new( + vec!["strings".into()].into(), + vec![strings_dtype.as_ref().clone()] + ), Nullability::NonNullable, ) ); @@ -265,9 +271,12 @@ async fn test_read_projection() { .unwrap(); assert_eq!( - array.dtype(), + array.dtype().as_ref(), &DType::Struct( - StructDType::new(vec!["numbers".into()].into(), vec![numbers_dtype.clone()]), + StructDType::new( + vec!["numbers".into()].into(), + vec![numbers_dtype.as_ref().clone()] + ), Nullability::NonNullable, ) ); @@ -293,9 +302,12 @@ async fn test_read_projection() { .unwrap(); assert_eq!( - array.dtype(), + array.dtype().as_ref(), &DType::Struct( - StructDType::new(vec!["numbers".into()].into(), vec![numbers_dtype.clone()]), + StructDType::new( + vec!["numbers".into()].into(), + vec![numbers_dtype.as_ref().clone()] + ), Nullability::NonNullable, ) ); diff --git a/vortex-ipc/src/iterator.rs b/vortex-ipc/src/iterator.rs index 88decc284e..dc9d1f07f7 100644 --- a/vortex-ipc/src/iterator.rs +++ b/vortex-ipc/src/iterator.rs @@ -47,7 +47,7 @@ impl Iterator for SyncIPCReader { array_parts .decode(self.ctx.clone(), self.dtype.clone()) .and_then(|array| { - if array.dtype() != self.dtype() { + if array.dtype().as_ref() != self.dtype() { Err(vortex_err!( "Array data type mismatch: expected {:?}, got {:?}", self.dtype(), @@ -170,7 +170,7 @@ mod test { let reader = SyncIPCReader::try_new(Cursor::new(ipc_buffer), Default::default()).unwrap(); - assert_eq!(reader.dtype(), array.dtype()); + assert_eq!(reader.dtype(), array.dtype().as_ref()); let result = reader.into_array_data().unwrap().into_primitive().unwrap(); assert_eq!(array.as_slice::(), result.as_slice::()); } diff --git a/vortex-ipc/src/messages/decoder.rs b/vortex-ipc/src/messages/decoder.rs index eb133042aa..ed51d54337 100644 --- a/vortex-ipc/src/messages/decoder.rs +++ b/vortex-ipc/src/messages/decoder.rs @@ -249,7 +249,7 @@ mod test { // Decode the array parts with the context let actual = array_parts - .decode(Default::default(), expected.dtype().clone()) + .decode(Default::default(), expected.dtype().as_ref().clone()) .unwrap(); assert_eq!(expected.len(), actual.len()); diff --git a/vortex-ipc/src/stream.rs b/vortex-ipc/src/stream.rs index 686b732080..b2cd320bcd 100644 --- a/vortex-ipc/src/stream.rs +++ b/vortex-ipc/src/stream.rs @@ -62,7 +62,7 @@ impl Stream for AsyncIPCReader { array_parts .decode(this.ctx.clone(), this.dtype.clone()) .and_then(|array| { - if array.dtype() != this.dtype { + if array.dtype().as_ref() != this.dtype { Err(vortex_err!( "Array data type mismatch: expected {:?}, got {:?}", this.dtype, @@ -207,7 +207,7 @@ mod test { .await .unwrap(); - assert_eq!(reader.dtype(), array.dtype()); + assert_eq!(reader.dtype(), array.dtype().as_ref()); let result = reader .into_array_data() .await diff --git a/vortex-layout/src/layouts/chunked/stats_table.rs b/vortex-layout/src/layouts/chunked/stats_table.rs index 43e5c5844d..ca1fb1c674 100644 --- a/vortex-layout/src/layouts/chunked/stats_table.rs +++ b/vortex-layout/src/layouts/chunked/stats_table.rs @@ -29,7 +29,7 @@ impl StatsTable { array: ArrayData, stats: Arc<[Stat]>, ) -> VortexResult { - if &Self::dtype_for_stats_table(&column_dtype, &stats) != array.dtype() { + if &Self::dtype_for_stats_table(&column_dtype, &stats) != 
array.dtype().as_ref() { vortex_bail!("Array dtype does not match expected stats table dtype"); } Ok(Self { diff --git a/vortex-layout/src/layouts/chunked/writer.rs b/vortex-layout/src/layouts/chunked/writer.rs index 3a41dd973b..c738827c1e 100644 --- a/vortex-layout/src/layouts/chunked/writer.rs +++ b/vortex-layout/src/layouts/chunked/writer.rs @@ -82,8 +82,10 @@ impl LayoutWriter for ChunkedLayoutWriter { let metadata: Option = match stats_table { Some(stats_table) => { // Write the stats array as the final layout. - let stats_layout = FlatLayoutWriter::new(stats_table.array().dtype().clone()) - .push_one(segments, stats_table.array().clone())?; + // TODO(aduffy): fix extra clone + let stats_layout = + FlatLayoutWriter::new(stats_table.array().dtype().as_ref().clone()) + .push_one(segments, stats_table.array().clone())?; children.push(stats_layout); // We store a bit-set of the statistics in the layout metadata so we can infer the diff --git a/vortex-layout/src/layouts/flat/eval_expr.rs b/vortex-layout/src/layouts/flat/eval_expr.rs index dc583ccb1b..9dae012111 100644 --- a/vortex-layout/src/layouts/flat/eval_expr.rs +++ b/vortex-layout/src/layouts/flat/eval_expr.rs @@ -90,7 +90,8 @@ mod test { block_on(async { let mut segments = TestSegments::default(); let array = PrimitiveArray::new(buffer![1, 2, 3, 4, 5], Validity::AllValid); - let layout = FlatLayoutWriter::new(array.dtype().clone()) + // TODO(aduffy): fix extra clone + let layout = FlatLayoutWriter::new(array.dtype().as_ref().clone()) .push_one(&mut segments, array.to_array()) .unwrap(); @@ -115,7 +116,8 @@ mod test { block_on(async { let mut segments = TestSegments::default(); let array = PrimitiveArray::new(buffer![1, 2, 3, 4, 5], Validity::AllValid); - let layout = FlatLayoutWriter::new(array.dtype().clone()) + // TODO(aduffy): fix extra clone + let layout = FlatLayoutWriter::new(array.dtype().as_ref().clone()) .push_one(&mut segments, array.to_array()) .unwrap(); @@ -141,7 +143,8 @@ mod test { block_on(async { let mut segments = TestSegments::default(); let array = PrimitiveArray::new(buffer![1, 2, 3, 4, 5], Validity::AllValid); - let layout = FlatLayoutWriter::new(array.dtype().clone()) + // TODO(aduffy): fix extra clone + let layout = FlatLayoutWriter::new(array.dtype().as_ref().clone()) .push_one(&mut segments, array.to_array()) .unwrap(); diff --git a/vortex-sampling-compressor/src/compressors/date_time_parts.rs b/vortex-sampling-compressor/src/compressors/date_time_parts.rs index d7ae7c4c97..4a0f29b412 100644 --- a/vortex-sampling-compressor/src/compressors/date_time_parts.rs +++ b/vortex-sampling-compressor/src/compressors/date_time_parts.rs @@ -62,7 +62,8 @@ impl EncodingCompressor for DateTimePartsCompressor { )?; Ok(CompressedArray::compressed( DateTimePartsArray::try_new( - array.dtype().clone(), + // TODO(aduffy): fix extra clone + array.dtype().as_ref().clone(), days.array, seconds.array, subsecond.array, diff --git a/vortex-sampling-compressor/src/compressors/fsst.rs b/vortex-sampling-compressor/src/compressors/fsst.rs index 0e570771ca..1d224dc8fb 100644 --- a/vortex-sampling-compressor/src/compressors/fsst.rs +++ b/vortex-sampling-compressor/src/compressors/fsst.rs @@ -44,7 +44,7 @@ impl EncodingCompressor for FSSTCompressor { // FSST arrays must have DType::Utf8. // // Note that while it can accept binary data, it is unlikely to perform well. 
- if !matches!(array.dtype(), &DType::Utf8(_)) { + if !matches!(array.dtype().as_ref(), &DType::Utf8(_)) { return None; } @@ -115,7 +115,8 @@ impl EncodingCompressor for FSSTCompressor { Ok(CompressedArray::compressed( FSSTArray::try_new( - fsst_array.dtype().clone(), + // TODO(aduffy): fix extra clone + fsst_array.dtype().as_ref().clone(), fsst_array.symbols(), fsst_array.symbol_lengths(), compressed_codes.array, diff --git a/vortex-sampling-compressor/src/compressors/roaring_bool.rs b/vortex-sampling-compressor/src/compressors/roaring_bool.rs index 946bdd424e..8112baa5aa 100644 --- a/vortex-sampling-compressor/src/compressors/roaring_bool.rs +++ b/vortex-sampling-compressor/src/compressors/roaring_bool.rs @@ -29,7 +29,7 @@ impl EncodingCompressor for RoaringBoolCompressor { } // Only support non-nullable bool arrays - if array.dtype() != &DType::Bool(NonNullable) { + if array.dtype().as_ref() != &DType::Bool(NonNullable) { return None; } diff --git a/vortex-sampling-compressor/src/compressors/struct_.rs b/vortex-sampling-compressor/src/compressors/struct_.rs index 11ccb39939..7eb96fba89 100644 --- a/vortex-sampling-compressor/src/compressors/struct_.rs +++ b/vortex-sampling-compressor/src/compressors/struct_.rs @@ -24,8 +24,8 @@ impl EncodingCompressor for StructCompressor { } fn can_compress(&self, array: &ArrayData) -> Option<&dyn EncodingCompressor> { - let is_struct = - matches!(array.dtype(), DType::Struct(..)) && array.is_encoding(StructEncoding::ID); + let is_struct = matches!(array.dtype().as_ref(), DType::Struct(..)) + && array.is_encoding(StructEncoding::ID); is_struct.then_some(self) } diff --git a/vortex-scan/src/lib.rs b/vortex-scan/src/lib.rs index adbe422613..9bf5354c76 100644 --- a/vortex-scan/src/lib.rs +++ b/vortex-scan/src/lib.rs @@ -48,7 +48,8 @@ impl Scanner { dtype, projection, filter, - projection_dtype: result_dtype, + // TODO(aduffy): fix extra clone + projection_dtype: result_dtype.as_ref().clone(), }) }
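
A minimal usage sketch of the new `vortex_array::dtypes` statics and macros introduced above (illustrative only, not part of the diff). It assumes the caller depends on both `vortex-array` and `vortex-dtype`, since the macros spell out `vortex_dtype::` paths; `make_validity_dtype` is a hypothetical helper.

use std::sync::Arc;

use vortex_array::dtypes::{DTYPE_BOOL_NONNULL, DTYPE_STRING_NONNULL};
use vortex_array::{bool_dtype, primitive_dtype};
use vortex_dtype::{DType, Nullability, PType};

// Hypothetical helper: code that used to build DType::Bool(NonNullable) inline
// can hand out a cheap clone of the shared Arc instead of allocating a new DType.
fn make_validity_dtype() -> Arc<DType> {
    bool_dtype!(Nullability::NonNullable)
}

fn main() {
    // The macro selects the matching static and clones the Arc out of it.
    let u32_null: Arc<DType> = primitive_dtype!(PType::U32, Nullability::Nullable);
    assert_eq!(
        u32_null.as_ref(),
        &DType::Primitive(PType::U32, Nullability::Nullable)
    );

    // Comparisons go through as_ref(), mirroring the dtype().as_ref() checks
    // introduced throughout the diff.
    assert_eq!(
        DTYPE_STRING_NONNULL.as_ref(),
        &DType::Utf8(Nullability::NonNullable)
    );
    assert_eq!(make_validity_dtype().as_ref(), DTYPE_BOOL_NONNULL.as_ref());
}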