diff --git a/crates/polars-arrow/src/compute/cast/utf8_to.rs b/crates/polars-arrow/src/compute/cast/utf8_to.rs index 2fdb979597d9..314e3632ef95 100644 --- a/crates/polars-arrow/src/compute/cast/utf8_to.rs +++ b/crates/polars-arrow/src/compute/cast/utf8_to.rs @@ -79,8 +79,10 @@ type OffsetType = i8; // chunks so that we don't overflow the offset u32. fn truncate_buffer(buf: &Buffer) -> Buffer { // * 2, as it must be able to hold u32::MAX offset + u32::MAX len. - buf.clone() - .sliced(0, std::cmp::min(buf.len(), OffsetType::MAX as usize * 2)) + buf.clone().sliced( + 0, + std::cmp::min(buf.len(), ((OffsetType::MAX as u64) * 2) as usize), + ) } pub fn binary_to_binview(arr: &BinaryArray) -> BinaryViewArray { diff --git a/crates/polars-row/src/encode.rs b/crates/polars-row/src/encode.rs index 840f6617bbf4..275699cc909a 100644 --- a/crates/polars-row/src/encode.rs +++ b/crates/polars-row/src/encode.rs @@ -297,7 +297,7 @@ fn allocate_rows_buf( columns: &mut [Encoder], fields: &[EncodingField], values: &mut Vec, - offsets: &mut Vec, + offsets: &mut Vec, ) -> usize { let has_variable = columns.iter().any(|enc| enc.is_variable()); @@ -372,13 +372,13 @@ fn allocate_rows_buf( for opt_val in iter { unsafe { lengths.push_unchecked( - row_size_fixed + crate::variable::encoded_len(opt_val, &field), + (row_size_fixed + crate::variable::encoded_len(opt_val, &field)) as u64, ); } } } else { for (opt_val, row_length) in iter.zip(lengths.iter_mut()) { - *row_length += crate::variable::encoded_len(opt_val, &field) + *row_length += crate::variable::encoded_len(opt_val, &field) as u64; } } processed_count += 1; @@ -389,18 +389,18 @@ fn allocate_rows_buf( let array = array.as_any().downcast_ref::().unwrap(); if processed_count == 0 { for opt_val in array.into_iter() { + let next_length = row_size_fixed + + crate::variable::encoded_len(opt_val, enc_field); unsafe { - lengths.push_unchecked( - row_size_fixed - + crate::variable::encoded_len(opt_val, enc_field), - ); + lengths.push_unchecked(next_length as u64); } } } else { for (opt_val, row_length) in array.into_iter().zip(lengths.iter_mut()) { - *row_length += crate::variable::encoded_len(opt_val, enc_field) + *row_length += + crate::variable::encoded_len(opt_val, enc_field) as u64 } } processed_count += 1; @@ -409,18 +409,18 @@ fn allocate_rows_buf( let array = array.as_any().downcast_ref::>().unwrap(); if processed_count == 0 { for opt_val in array.into_iter() { + let next_length = row_size_fixed + + crate::variable::encoded_len(opt_val, enc_field); unsafe { - lengths.push_unchecked( - row_size_fixed - + crate::variable::encoded_len(opt_val, enc_field), - ); + lengths.push_unchecked(next_length as u64); } } } else { for (opt_val, row_length) in array.into_iter().zip(lengths.iter_mut()) { - *row_length += crate::variable::encoded_len(opt_val, enc_field) + *row_length += + crate::variable::encoded_len(opt_val, enc_field) as u64 } } processed_count += 1; @@ -436,16 +436,14 @@ fn allocate_rows_buf( .map(|opt_s| opt_s.map(|s| s.as_bytes())); if processed_count == 0 { for opt_val in iter { - unsafe { - lengths.push_unchecked( - row_size_fixed - + crate::variable::encoded_len(opt_val, enc_field), - ) - } + let next_length = row_size_fixed + + crate::variable::encoded_len(opt_val, enc_field); + unsafe { lengths.push_unchecked(next_length as u64) } } } else { for (opt_val, row_length) in iter.zip(lengths.iter_mut()) { - *row_length += crate::variable::encoded_len(opt_val, enc_field) + *row_length += + crate::variable::encoded_len(opt_val, enc_field) as u64 } } processed_count += 1; @@ -466,12 +464,12 @@ fn allocate_rows_buf( for length in offsets.iter_mut() { let to_write = lagged_offset; lagged_offset = current_offset; - current_offset += *length; + current_offset += *length as usize; - *length = to_write; + *length = to_write as u64; } // ensure we have len + 1 offsets - offsets.push(lagged_offset); + offsets.push(lagged_offset as u64); // Only reserve. The init will be done later values.reserve(current_offset); @@ -496,10 +494,10 @@ fn allocate_rows_buf( // 0, 2, 4, 6 offsets.clear(); offsets.reserve(num_rows + 1); - let mut current_offset = 0; - offsets.push(current_offset); + let mut current_offset = 0_usize; + offsets.push(current_offset as u64); for _ in 0..num_rows { - offsets.push(current_offset); + offsets.push(current_offset as u64); current_offset += row_size; } n_bytes diff --git a/crates/polars-row/src/fixed.rs b/crates/polars-row/src/fixed.rs index 315eada42ae4..bec427b0e86f 100644 --- a/crates/polars-row/src/fixed.rs +++ b/crates/polars-row/src/fixed.rs @@ -144,12 +144,13 @@ impl FixedLengthEncoding for f64 { #[inline] fn encode_value( value: &T, - offset: &mut usize, + offset: &mut u64, descending: bool, buf: &mut [MaybeUninit], ) { - let end_offset = *offset + T::ENCODED_LEN; - let dst = unsafe { buf.get_unchecked_mut(*offset..end_offset) }; + let usize_offset = *offset as usize; + let end_offset = usize_offset + T::ENCODED_LEN; + let dst = unsafe { buf.get_unchecked_mut(usize_offset..end_offset) }; // set valid dst[0] = MaybeUninit::new(1); let mut encoded = value.encode(); @@ -162,7 +163,7 @@ fn encode_value( } dst[1..].copy_from_slice(encoded.as_ref().as_uninit()); - *offset = end_offset; + *offset = end_offset as u64; } pub(crate) unsafe fn encode_slice( @@ -197,16 +198,18 @@ pub(crate) unsafe fn encode_iter>, T: FixedLengthEn if let Some(value) = opt_value { encode_value(&value, offset, field.descending, values); } else { + let usize_offset = *offset as usize; unsafe { - *values.get_unchecked_mut(*offset) = MaybeUninit::new(get_null_sentinel(field)) + *values.get_unchecked_mut(usize_offset) = MaybeUninit::new(get_null_sentinel(field)) }; - let end_offset = *offset + T::ENCODED_LEN; + + let end_offset = usize_offset + T::ENCODED_LEN; // initialize remaining bytes - let remainder = values.get_unchecked_mut(*offset + 1..end_offset); + let remainder = values.get_unchecked_mut(usize_offset + 1..end_offset); remainder.fill(MaybeUninit::new(0)); - *offset = end_offset; + *offset = end_offset as u64; } } } diff --git a/crates/polars-row/src/row.rs b/crates/polars-row/src/row.rs index 1aa50e8b0e43..a991791963f6 100644 --- a/crates/polars-row/src/row.rs +++ b/crates/polars-row/src/row.rs @@ -35,26 +35,23 @@ impl EncodingField { #[derive(Default, Clone)] pub struct RowsEncoded { pub(crate) values: Vec, - pub(crate) offsets: Vec, + + // This vector is in practice a vec of usize's. + // However, since the vec is eventually passed to arrow as i64's, + // we need to make sure the right number of bytes are reserved. + // Usize's take 4 bytes of memory on 32bit systems, whereas i64 takes 8 bytes. + pub(crate) offsets: Vec, } -fn checks(offsets: &[usize]) { - assert_eq!( - size_of::(), - size_of::(), - "only supported on 64bit arch" - ); - assert!( - (*offsets.last().unwrap() as u64) < i64::MAX as u64, - "overflow" - ); +fn checks(offsets: &[u64]) { + assert!(*offsets.last().unwrap() < i64::MAX as u64, "overflow"); } -unsafe fn rows_to_array(buf: Vec, offsets: Vec) -> BinaryArray { +unsafe fn rows_to_array(buf: Vec, offsets: Vec) -> BinaryArray { checks(&offsets); // SAFETY: we checked overflow - let offsets = bytemuck::cast_vec::(offsets); + let offsets = bytemuck::cast_vec::(offsets); // SAFETY: monotonically increasing let offsets = Offsets::new_unchecked(offsets); @@ -63,13 +60,13 @@ unsafe fn rows_to_array(buf: Vec, offsets: Vec) -> BinaryArray { } impl RowsEncoded { - pub(crate) fn new(values: Vec, offsets: Vec) -> Self { + pub(crate) fn new(values: Vec, offsets: Vec) -> Self { RowsEncoded { values, offsets } } pub fn iter(&self) -> RowsEncodedIter { let iter = self.offsets[1..].iter(); - let offset = self.offsets[0]; + let offset = self.offsets[0] as usize; RowsEncodedIter { offset, end: iter, @@ -87,7 +84,7 @@ impl RowsEncoded { unsafe { let (_, values, _) = mmap::slice(&self.values).into_inner(); - let offsets = bytemuck::cast_slice::(self.offsets.as_slice()); + let offsets = bytemuck::cast_slice::(self.offsets.as_slice()); let (_, offsets, _) = mmap::slice(offsets).into_inner(); let offsets = OffsetsBuffer::new_unchecked(offsets); @@ -115,7 +112,7 @@ impl RowsEncoded { pub struct RowsEncodedIter<'a> { offset: usize, - end: std::slice::Iter<'a, usize>, + end: std::slice::Iter<'a, u64>, values: &'a [u8], } @@ -123,7 +120,7 @@ impl<'a> Iterator for RowsEncodedIter<'a> { type Item = &'a [u8]; fn next(&mut self) -> Option { - let new_offset = *self.end.next()?; + let new_offset = *self.end.next()? as usize; let payload = unsafe { self.values.get_unchecked(self.offset..new_offset) }; self.offset = new_offset; Some(payload) diff --git a/crates/polars-row/src/variable.rs b/crates/polars-row/src/variable.rs index f7485d44704a..a6ef8fe870bf 100644 --- a/crates/polars-row/src/variable.rs +++ b/crates/polars-row/src/variable.rs @@ -172,19 +172,21 @@ pub(crate) unsafe fn encode_iter<'a, I: Iterator>>( if field.no_order { for (offset, opt_value) in out.offsets.iter_mut().skip(1).zip(input) { - let dst = values.get_unchecked_mut(*offset..); + let dst: &mut [MaybeUninit] = + values.get_unchecked_mut((*offset as usize)..); let written_len = encode_one_no_order(dst, opt_value.map(|v| v.as_uninit()), field); - *offset += written_len; + *offset += written_len as u64; } } else { for (offset, opt_value) in out.offsets.iter_mut().skip(1).zip(input) { - let dst = values.get_unchecked_mut(*offset..); + let dst: &mut [MaybeUninit] = + values.get_unchecked_mut((*offset as usize)..); let written_len = encode_one(dst, opt_value.map(|v| v.as_uninit()), field); - *offset += written_len; + *offset += written_len as u64; } } - let offset = out.offsets.last().unwrap(); - let dst = values.get_unchecked_mut(*offset..); + let offset = *out.offsets.last().unwrap() as usize; + let dst: &mut [MaybeUninit] = values.get_unchecked_mut(offset..); // write remainder as zeros dst.fill(MaybeUninit::new(0)); out.values.set_len(out.values.capacity())