Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: compare dict by value #2085

Merged
merged 33 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
48ef5e5
compare dict by value
joseph-isaacs Jan 27, 2025
64ce966
update
joseph-isaacs Jan 27, 2025
8ef9937
update
joseph-isaacs Jan 27, 2025
48e7187
update
joseph-isaacs Jan 28, 2025
671291b
update
joseph-isaacs Jan 28, 2025
e7fb64f
update
joseph-isaacs Jan 28, 2025
ce549ff
Merge branch 'develop' into ji/dict-compare-by-value
joseph-isaacs Jan 28, 2025
a7bda1b
update
joseph-isaacs Jan 28, 2025
45f1b1a
fmt
joseph-isaacs Jan 28, 2025
04b8051
non null codes
joseph-isaacs Jan 28, 2025
07eaf82
fix
joseph-isaacs Jan 28, 2025
1a3fcc6
Merge branch 'develop' into ji/dict-compare-by-value
joseph-isaacs Jan 28, 2025
91870d4
fmt
joseph-isaacs Jan 28, 2025
3ac5cd9
Merge branch 'develop' into ji/dict-compare-by-value
joseph-isaacs Jan 28, 2025
b5bac82
feat: nullable take (#2103)
joseph-isaacs Jan 31, 2025
c58515c
Merge remote-tracking branch 'origin/ji/dict-compare-by-value' into j…
joseph-isaacs Feb 5, 2025
dcfe935
fixup
joseph-isaacs Feb 5, 2025
7a42ada
Merge branch 'develop' into ji/dict-compare-by-value
joseph-isaacs Feb 5, 2025
ff9e3ba
fixup
joseph-isaacs Feb 5, 2025
e0f98ff
fixup
joseph-isaacs Feb 5, 2025
718b66a
fixup
joseph-isaacs Feb 5, 2025
1d0df06
fixup
joseph-isaacs Feb 5, 2025
631a4f0
fixup
joseph-isaacs Feb 5, 2025
0bf9bc6
fixup
joseph-isaacs Feb 5, 2025
2305347
fixup
joseph-isaacs Feb 5, 2025
d3291ca
tests
joseph-isaacs Feb 5, 2025
7f4dfe4
tests
joseph-isaacs Feb 5, 2025
113e314
tests
joseph-isaacs Feb 5, 2025
1fe37e8
fixup
joseph-isaacs Feb 5, 2025
ed2ed24
fix
joseph-isaacs Feb 5, 2025
7ad08d2
Fix dtypes
gatesn Feb 5, 2025
729fcb8
fix
joseph-isaacs Feb 6, 2025
4ee4e13
fix
joseph-isaacs Feb 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions encodings/dict/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ pub struct DictMetadata {

impl DictArray {
pub fn try_new(codes: ArrayData, values: ArrayData) -> VortexResult<Self> {
if !codes.dtype().is_unsigned_int() || codes.dtype().is_nullable() {
if !codes.dtype().is_unsigned_int() {
vortex_bail!(MismatchedTypes: "non-nullable unsigned int", codes.dtype());
joseph-isaacs marked this conversation as resolved.
Show resolved Hide resolved
}
Self::try_from_parts(
Expand All @@ -47,7 +47,11 @@ impl DictArray {
#[inline]
pub fn codes(&self) -> ArrayData {
self.as_ref()
.child(0, &DType::from(self.metadata().codes_ptype), self.len())
.child(
0,
&DType::Primitive(self.metadata().codes_ptype, self.dtype().nullability()),
self.len(),
)
.vortex_expect("DictArray is missing its codes child array")
}

Expand Down Expand Up @@ -80,10 +84,14 @@ impl IntoCanonical for DictArray {

impl ValidityVTable<DictArray> for DictEncoding {
fn is_valid(&self, array: &DictArray, index: usize) -> VortexResult<bool> {
let values_index = scalar_at(array.codes(), index)
.unwrap_or_else(|err| {
vortex_panic!(err, "Failed to get index {} from DictArray codes", index)
})
let scalar = scalar_at(array.codes(), index).unwrap_or_else(|err| {
vortex_panic!(err, "Failed to get index {} from DictArray codes", index)
joseph-isaacs marked this conversation as resolved.
Show resolved Hide resolved
});

if scalar.is_null() {
return Ok(false);
};
let values_index: usize = scalar
.as_ref()
.try_into()
.vortex_expect("Failed to convert dictionary code to usize");
Expand All @@ -94,6 +102,7 @@ impl ValidityVTable<DictArray> for DictEncoding {
if array.dtype().is_nullable() {
let primitive_codes = array.codes().into_primitive()?;
match_each_integer_ptype!(primitive_codes.ptype(), |$P| {
// This is correct since the code will be 0 if the value is null.
let is_valid = primitive_codes
.as_slice::<$P>();
let is_valid_buffer = BooleanBuffer::collect_bool(is_valid.len(), |idx| {
Expand Down
71 changes: 48 additions & 23 deletions encodings/dict/src/compress.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::hash::{BuildHasher, Hash, Hasher};

use arrow_buffer::BooleanBufferBuilder;
use num_traits::AsPrimitive;
use vortex_array::accessor::ArrayAccessor;
use vortex_array::aliases::hash_map::{DefaultHashBuilder, Entry, HashMap, HashTable, RandomState};
Expand Down Expand Up @@ -107,20 +108,30 @@ impl<T: NativePType> DictEncoder for PrimitiveDictBuilder<T> {
}

let mut codes = BufferMut::<u64>::with_capacity(array.len());

let primitive = array.clone().into_primitive()?;
primitive.with_iterator(|it| {
for value in it {
let code = if let Some(&v) = value {
self.encode_value(v)
} else {
NULL_CODE
};
unsafe { codes.push_unchecked(code) }
}
})?;

Ok(PrimitiveArray::new(codes, Validity::NonNullable).into_array())
let (codes, validity) = if array.dtype().is_nullable() {
let mut bool_buf = BooleanBufferBuilder::new(array.len());
primitive.with_iterator(|it| {
for value in it {
let (code, validity) = value
.map(|v| (self.encode_value(*v), true))
.unwrap_or((NULL_CODE, false));
bool_buf.append(validity);
unsafe { codes.push_unchecked(code) }
}
})?;
(codes, Validity::Array(bool_buf.finish().into()))
} else {
primitive.with_iterator(|it| {
for value in it {
let code = value.map(|v| self.encode_value(*v)).unwrap_or(NULL_CODE);
joseph-isaacs marked this conversation as resolved.
Show resolved Hide resolved
unsafe { codes.push_unchecked(code) }
}
})?;
(codes, Validity::NonNullable)
};
Ok(PrimitiveArray::new(codes, validity).into_array())
}

fn values(&mut self) -> ArrayData {
Expand Down Expand Up @@ -199,20 +210,34 @@ impl BytesDictBuilder {
let mut local_lookup = self.lookup.take().vortex_expect("Must have a lookup dict");
let mut codes: BufferMut<u64> = BufferMut::with_capacity(len);

accessor.with_iterator(|it| {
for value in it {
let code = if let Some(v) = value {
self.encode_value(&mut local_lookup, v)
} else {
NULL_CODE
};
unsafe { codes.push_unchecked(code) }
}
})?;
let (codes, validity) = if self.dtype.is_nullable() {
let mut bool_buf = BooleanBufferBuilder::new(len);

accessor.with_iterator(|it| {
for value in it {
let (code, validity) = value
.map(|v| (self.encode_value(&mut local_lookup, v), true))
.unwrap_or((NULL_CODE, false));
bool_buf.append(validity);
unsafe { codes.push_unchecked(code) }
}
})?;
(codes, Validity::Array(bool_buf.finish().into()))
} else {
accessor.with_iterator(|it| {
for value in it {
let code = value
.map(|v| self.encode_value(&mut local_lookup, v))
.unwrap_or(NULL_CODE);
unsafe { codes.push_unchecked(code) }
}
})?;
(codes, Validity::NonNullable)
};

// Restore lookup dictionary back into the struct
self.lookup = Some(local_lookup);
Ok(PrimitiveArray::new(codes, Validity::NonNullable).into_array())
Ok(PrimitiveArray::new(codes, validity).into_array())
}
}

Expand Down
76 changes: 74 additions & 2 deletions encodings/dict/src/compute/compare.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
use vortex_array::array::ConstantArray;
use vortex_array::compute::{compare, take, CompareFn, Operator};
use vortex_array::ArrayData;
use vortex_array::compute::{compare, take, try_cast, CompareFn, Operator};
use vortex_array::{ArrayDType, ArrayData, ArrayLen, IntoArrayData, IntoArrayVariant};
use vortex_dtype::DType;
use vortex_error::VortexResult;
use vortex_scalar::Scalar;

use crate::{DictArray, DictEncoding};

Expand All @@ -14,6 +16,10 @@ impl CompareFn<DictArray> for DictEncoding {
) -> VortexResult<Option<ArrayData>> {
// If the RHS is constant, then we just need to compare against our encoded values.
if let Some(const_scalar) = rhs.as_constant() {
// TODO: support other operations if the dict is sorted.
if matches!(operator, Operator::Eq) {
return compare_eq_by_code(lhs, const_scalar);
}
// Ensure the other is the same length as the dictionary
let compare_result = compare(
lhs.values(),
Expand All @@ -28,3 +34,69 @@ impl CompareFn<DictArray> for DictEncoding {
Ok(None)
}
}

fn compare_eq_by_code(lhs: &DictArray, rhs: Scalar) -> VortexResult<Option<ArrayData>> {
// If the RHS is constant, then we just need to compare against our encoded values.
let compare_result = compare(
lhs.values(),
ConstantArray::new(rhs, lhs.values().len()),
Operator::Eq,
)?;

let bool = compare_result.into_bool()?;

// Couldn't find a value match, so the result is all false
let Some(code) = bool.boolean_buffer().set_indices().next() else {
return Ok(Some(
ConstantArray::new(
Scalar::bool(false, lhs.dtype().nullability()),
lhs.codes().len(),
)
.into_array(),
));
};

// The codes include nullability so we can just compare the codes directly, to the found code.
let compare_result = try_cast(
compare(
lhs.codes(),
try_cast(ConstantArray::new(code, lhs.len()), lhs.codes().dtype())?,
Operator::Eq,
)?,
&DType::Bool(lhs.dtype().nullability()),
)?;
Ok(Some(compare_result))
joseph-isaacs marked this conversation as resolved.
Show resolved Hide resolved
}

#[cfg(test)]
mod tests {
use vortex_array::array::ConstantArray;
use vortex_array::compute::{compare, Operator};
use vortex_array::{ArrayLen, IntoArrayData, IntoArrayVariant};
use vortex_buffer::buffer;
use vortex_scalar::Scalar;

use crate::DictArray;

#[test]
fn test_compare_value() {
let dict = DictArray::try_new(
buffer![0u32, 1, 2].into_array(),
buffer![1i32, 2, 3].into_array(),
)
.unwrap();

let res = compare(
&dict,
ConstantArray::new(Scalar::from(1i32), 3),
Operator::Eq,
)
.unwrap();
let res = res.into_bool().unwrap();
assert_eq!(res.len(), 3);
assert_eq!(
res.boolean_buffer().iter().collect::<Vec<_>>(),
vec![true, false, false]
);
}
}
21 changes: 21 additions & 0 deletions vortex-array/src/array/bool/compute/cast.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
use arrow_array::BooleanArray;
use vortex_dtype::DType;
use vortex_error::VortexResult;

use crate::array::{BoolArray, BoolEncoding};
use crate::compute::CastFn;
use crate::{ArrayDType, ArrayData, IntoArrayData};

impl CastFn<BoolArray> for BoolEncoding {
fn cast(&self, array: &BoolArray, dtype: &DType) -> VortexResult<ArrayData> {
assert!(matches!(dtype, DType::Bool(_)));
joseph-isaacs marked this conversation as resolved.
Show resolved Hide resolved

// If the types are the same, return the array,
// otherwise set the array nullability as the dtype nullability.
if array.dtype() != dtype {
joseph-isaacs marked this conversation as resolved.
Show resolved Hide resolved
Ok(BoolArray::new(array.boolean_buffer(), dtype.nullability()).into_array())
joseph-isaacs marked this conversation as resolved.
Show resolved Hide resolved
} else {
Ok(array.clone().into_array())
}
}
}
9 changes: 7 additions & 2 deletions vortex-array/src/array/bool/compute/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
use crate::array::BoolEncoding;
use crate::compute::{
BinaryBooleanFn, ComputeVTable, FillForwardFn, FillNullFn, FilterFn, InvertFn, ScalarAtFn,
SliceFn, TakeFn,
BinaryBooleanFn, CastFn, ComputeVTable, FillForwardFn, FillNullFn, FilterFn, InvertFn,
ScalarAtFn, SliceFn, TakeFn,
};
use crate::ArrayData;

mod cast;
mod fill_forward;
mod fill_null;
pub mod filter;
Expand Down Expand Up @@ -50,4 +51,8 @@ impl ComputeVTable for BoolEncoding {
fn take_fn(&self) -> Option<&dyn TakeFn<ArrayData>> {
Some(self)
}

fn cast_fn(&self) -> Option<&dyn CastFn<ArrayData>> {
Some(self)
}
}
4 changes: 2 additions & 2 deletions vortex-array/src/compute/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ pub fn take(
let array = array.as_ref();
let indices = indices.as_ref();

if !indices.dtype().is_int() || indices.dtype().is_nullable() {
if !indices.dtype().is_int() {
vortex_bail!(
"Take indices must be a non-nullable integer type, got {}",
"Take indices must be an integer type, got {}",
indices.dtype()
);
}
Expand Down
5 changes: 5 additions & 0 deletions vortex-buffer/src/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,11 @@ impl<T> Buffer<T> {

/// Returns the underlying aligned buffer.
pub fn into_inner(self) -> Bytes {
debug_assert_eq!(
self.length * size_of::<T>(),
self.bytes.len(),
"Own length has to be the same as the underlying bytes length"
);
self.bytes
}

Expand Down
1 change: 1 addition & 0 deletions vortex-buffer/src/buffer_mut.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ impl<T> BufferMut<T> {
pub fn zeroed_aligned(len: usize, alignment: Alignment) -> Self {
let mut bytes = BytesMut::zeroed((len * size_of::<T>()) + *alignment);
bytes.advance(bytes.as_ptr().align_offset(*alignment));
unsafe { bytes.set_len(len * size_of::<T>()) };
Self {
bytes,
length: len,
Expand Down
2 changes: 1 addition & 1 deletion vortex-sampling-compressor/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ vortex-dict = { workspace = true }
vortex-dtype = { workspace = true }
vortex-error = { workspace = true }
vortex-fastlanes = { workspace = true }
vortex-scalar = { workspace = true }
vortex-fsst = { workspace = true }
vortex-runend = { workspace = true }
vortex-sparse = { workspace = true }
vortex-scalar = { workspace = true }
vortex-zigzag = { workspace = true }

[dev-dependencies]
Expand Down
27 changes: 16 additions & 11 deletions vortex-sampling-compressor/src/downscale.rs
Original file line number Diff line number Diff line change
@@ -1,27 +1,32 @@
use vortex_array::array::{PrimitiveArray, PrimitiveEncoding};
use vortex_array::array::{ConstantArray, PrimitiveArray, PrimitiveEncoding};
use vortex_array::compute::try_cast;
use vortex_array::encoding::EncodingVTable;
use vortex_array::stats::{ArrayStatistics, Stat};
use vortex_array::{ArrayDType, ArrayData, IntoArrayData, IntoArrayVariant};
use vortex_array::{ArrayDType, ArrayData, ArrayLen, IntoArrayData, IntoArrayVariant};
use vortex_dtype::{DType, PType};
use vortex_error::{vortex_err, VortexExpect, VortexResult};
use vortex_error::{VortexExpect, VortexResult};
use vortex_scalar::Scalar;

/// Downscale a primitive array to the narrowest PType that fits all the values.
pub fn downscale_integer_array(array: ArrayData) -> VortexResult<ArrayData> {
if !array.is_encoding(PrimitiveEncoding.id()) {
// This can happen if e.g. the array is ConstantArray.
return Ok(array);
}
if array.is_empty() {
return Ok(array);
}
let array = PrimitiveArray::maybe_from(array).vortex_expect("Checked earlier");

let min = array
.statistics()
.compute(Stat::Min)
.ok_or_else(|| vortex_err!("Failed to compute min on primitive array"))?;
let max = array
.statistics()
.compute(Stat::Max)
.ok_or_else(|| vortex_err!("Failed to compute max on primitive array"))?;
let min = array.statistics().compute(Stat::Min);
let max = array.statistics().compute(Stat::Max);

let (Some(min), Some(max)) = (min, max) else {
// This array but be all nulls.
return Ok(
ConstantArray::new(Scalar::null(array.dtype().clone()), array.len()).into_array(),
);
};

// If we can't cast to i64, then leave the array as its original type.
// It's too big to downcast anyway.
Expand Down