From 6c45ab8120f8f5b518faabad2083560ee031036b Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Mon, 9 Sep 2024 12:52:39 +0100 Subject: [PATCH 1/6] upgrade to smallvec 2 to fix invariant lifetimes --- crates/jiter/Cargo.toml | 2 +- crates/jiter/src/lazy_index_map.rs | 4 ++-- crates/jiter/src/python.rs | 4 ++-- crates/jiter/src/value.rs | 8 +++++--- crates/jiter/tests/main.rs | 9 +++++++++ 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/crates/jiter/Cargo.toml b/crates/jiter/Cargo.toml index 782f8b0b..ddcef5bd 100644 --- a/crates/jiter/Cargo.toml +++ b/crates/jiter/Cargo.toml @@ -15,7 +15,7 @@ repository = {workspace = true} num-bigint = "0.4.4" num-traits = "0.2.16" ahash = "0.8.0" -smallvec = "1.11.0" +smallvec = "2.0.0-alpha.7" pyo3 = { workspace = true, optional = true, features = ["num-bigint"] } lexical-parse-float = { version = "0.8.5", features = ["format"] } bitvec = "1.0.1" diff --git a/crates/jiter/src/lazy_index_map.rs b/crates/jiter/src/lazy_index_map.rs index b57bdcb0..5b40fea9 100644 --- a/crates/jiter/src/lazy_index_map.rs +++ b/crates/jiter/src/lazy_index_map.rs @@ -10,7 +10,7 @@ use smallvec::SmallVec; /// Like [IndexMap](https://docs.rs/indexmap/latest/indexmap/) but only builds the lookup map when it's needed. pub struct LazyIndexMap { - vec: SmallVec<[(K, V); 8]>, + vec: SmallVec<(K, V), 8>, map: OnceLock>, last_find: AtomicUsize, } @@ -150,7 +150,7 @@ impl PartialEq for LazyIndexMap { } struct IterUnique<'a, K, V> { - vec: &'a SmallVec<[(K, V); 8]>, + vec: &'a SmallVec<(K, V), 8>, map: &'a AHashMap, index: usize, } diff --git a/crates/jiter/src/python.rs b/crates/jiter/src/python.rs index b811d1c3..6cf7d95b 100644 --- a/crates/jiter/src/python.rs +++ b/crates/jiter/src/python.rs @@ -148,7 +148,7 @@ impl<'j, StringCache: StringMaybeCache, KeyCheck: MaybeKeyCheck, ParseNumber: Ma Ok(None) | Err(_) => return Ok(PyList::empty_bound(py).into_any()), }; - let mut vec: SmallVec<[Bound<'_, PyAny>; 8]> = SmallVec::with_capacity(8); + let mut vec: SmallVec, 8> = SmallVec::with_capacity(8); if let Err(e) = self._parse_array(py, peek_first, &mut vec) { if !self._allow_partial_err(&e) { return Err(e); @@ -174,7 +174,7 @@ impl<'j, StringCache: StringMaybeCache, KeyCheck: MaybeKeyCheck, ParseNumber: Ma &mut self, py: Python<'py>, peek_first: Peek, - vec: &mut SmallVec<[Bound<'py, PyAny>; 8]>, + vec: &mut SmallVec, 8>, ) -> JsonResult<()> { let v = self._check_take_value(py, peek_first)?; vec.push(v); diff --git a/crates/jiter/src/value.rs b/crates/jiter/src/value.rs index 78f3e024..0335cba6 100644 --- a/crates/jiter/src/value.rs +++ b/crates/jiter/src/value.rs @@ -23,7 +23,7 @@ pub enum JsonValue<'s> { Object(JsonObject<'s>), } -pub type JsonArray<'s> = Arc; 8]>>; +pub type JsonArray<'s> = Arc, 8>>; pub type JsonObject<'s> = Arc, JsonValue<'s>>>; #[cfg(feature = "python")] @@ -81,7 +81,9 @@ fn value_static(v: JsonValue<'_>) -> JsonValue<'static> { JsonValue::BigInt(b) => JsonValue::BigInt(b), JsonValue::Float(f) => JsonValue::Float(f), JsonValue::Str(s) => JsonValue::Str(s.into_owned().into()), - JsonValue::Array(v) => JsonValue::Array(Arc::new(v.iter().map(JsonValue::to_static).collect::>())), + JsonValue::Array(v) => { + JsonValue::Array(Arc::new(v.iter().map(JsonValue::to_static).collect::>())) + } JsonValue::Object(o) => JsonValue::Object(Arc::new(o.to_static())), } } @@ -173,7 +175,7 @@ fn take_value<'j, 's>( } Peek::Array => { // we could do something clever about guessing the size of the array - let mut array: SmallVec<[JsonValue<'s>; 8]> = SmallVec::new(); + let mut array: SmallVec, 8> = SmallVec::new(); if let Some(peek_first) = parser.array_first()? { check_recursion!(recursion_limit, parser.index, let v = take_value(peek_first, parser, tape, recursion_limit, allow_inf_nan, create_cow)?; diff --git a/crates/jiter/tests/main.rs b/crates/jiter/tests/main.rs index 3d672cd6..65d87708 100644 --- a/crates/jiter/tests/main.rs +++ b/crates/jiter/tests/main.rs @@ -1653,3 +1653,12 @@ fn test_unicode_roundtrip() { assert_eq!(cow, "中文"); assert!(matches!(cow, Cow::Owned(_))); } + +#[test] +fn test_invariant_lifetimes() { + let v1 = JsonValue::Str("foobar".into()); + + let s = "foobar".to_string(); + let v2 = JsonValue::Str(s.as_str().into()); + assert_eq!(v1, v2); +} From 66f0bfe4bf063ca59bc9425fa6b30e23202ffb7b Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Mon, 9 Sep 2024 16:04:49 +0100 Subject: [PATCH 2/6] try reworking LazyIndexMap to fix lifetimes & other bugs --- crates/jiter/Cargo.toml | 1 + crates/jiter/src/lazy_index_map.rs | 366 ++++++++++++++++++++++------- crates/jiter/tests/main.rs | 8 +- 3 files changed, 286 insertions(+), 89 deletions(-) diff --git a/crates/jiter/Cargo.toml b/crates/jiter/Cargo.toml index ddcef5bd..9d137cce 100644 --- a/crates/jiter/Cargo.toml +++ b/crates/jiter/Cargo.toml @@ -19,6 +19,7 @@ smallvec = "2.0.0-alpha.7" pyo3 = { workspace = true, optional = true, features = ["num-bigint"] } lexical-parse-float = { version = "0.8.5", features = ["format"] } bitvec = "1.0.1" +indexmap = "2.5.0" [features] python = ["dep:pyo3", "dep:pyo3-build-config"] diff --git a/crates/jiter/src/lazy_index_map.rs b/crates/jiter/src/lazy_index_map.rs index 5b40fea9..922f8f9f 100644 --- a/crates/jiter/src/lazy_index_map.rs +++ b/crates/jiter/src/lazy_index_map.rs @@ -1,18 +1,25 @@ use std::borrow::{Borrow, Cow}; +use std::cell::Cell; use std::fmt; -use std::hash::Hash; +use std::hash::{DefaultHasher, Hash, Hasher}; +use std::mem::MaybeUninit; use std::slice::Iter as SliceIter; use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::OnceLock; -use ahash::AHashMap; -use smallvec::SmallVec; +use ahash::RandomState; +use bitvec::order::Lsb0; +use indexmap::IndexMap; /// Like [IndexMap](https://docs.rs/indexmap/latest/indexmap/) but only builds the lookup map when it's needed. +#[derive(Clone)] pub struct LazyIndexMap { - vec: SmallVec<(K, V), 8>, - map: OnceLock>, - last_find: AtomicUsize, + inner: LazyIndexMapInner, +} + +#[derive(Clone)] +enum LazyIndexMapInner { + Array(LazyIndexMapArray), + Map(IndexMap), } impl Default for LazyIndexMap @@ -25,23 +32,13 @@ where } } -impl Clone for LazyIndexMap { - fn clone(&self) -> Self { - Self { - vec: self.vec.clone(), - map: self.map.clone(), - last_find: AtomicUsize::new(0), - } - } -} - impl fmt::Debug for LazyIndexMap where K: Clone + fmt::Debug + Eq + Hash, V: fmt::Debug, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_map().entries(self.iter_unique()).finish() + f.debug_map().entries(self.iter()).finish() } } @@ -51,30 +48,68 @@ const HASHMAP_THRESHOLD: usize = 16; /// Like [IndexMap](https://docs.rs/indexmap/latest/indexmap/) but only builds the lookup map when it's needed. impl LazyIndexMap where - K: Clone + fmt::Debug + Eq + Hash, + K: fmt::Debug + Eq + Hash, V: fmt::Debug, { pub fn new() -> Self { Self { - vec: SmallVec::new(), - map: OnceLock::new(), - last_find: AtomicUsize::new(0), + inner: LazyIndexMapInner::Array(LazyIndexMapArray::new()), } } pub fn insert(&mut self, key: K, value: V) { - if let Some(map) = self.map.get_mut() { - map.insert(key.clone(), self.vec.len()); + match self { + Self { + inner: LazyIndexMapInner::Array(vec), + } => { + if let Some((key, value)) = vec.insert(key, value) { + // array is full, convert to map + let LazyIndexMapInner::Array(vec) = std::mem::replace( + &mut self.inner, + LazyIndexMapInner::Map(IndexMap::with_capacity_and_hasher( + HASHMAP_THRESHOLD + 1, + RandomState::default(), + )), + ) else { + unreachable!("known to be a vec"); + }; + let LazyIndexMapInner::Map(map) = &mut self.inner else { + unreachable!("just set to be a map"); + }; + for (k, v) in vec.into_complete_data() { + map.insert(k, v); + } + map.insert(key, value); + } + } + Self { + inner: LazyIndexMapInner::Map(map), + } => { + map.insert(key, value); + } } - self.vec.push((key, value)); } pub fn len(&self) -> usize { - self.get_map().len() + match self { + Self { + inner: LazyIndexMapInner::Array(vec), + } => vec.duplicates_mask()[..vec.data().len()].count_ones(), + Self { + inner: LazyIndexMapInner::Map(map), + } => map.len(), + } } pub fn is_empty(&self) -> bool { - self.vec.is_empty() + match self { + Self { + inner: LazyIndexMapInner::Array(vec), + } => vec.data().is_empty(), + Self { + inner: LazyIndexMapInner::Map(map), + } => map.is_empty(), + } } pub fn get(&self, key: &Q) -> Option<&V> @@ -82,93 +117,254 @@ where K: Borrow + PartialEq, Q: Hash + Eq + ?Sized, { - let vec_len = self.vec.len(); - // if the vec is longer than the threshold, we use the hashmap for lookups - if vec_len > HASHMAP_THRESHOLD { - self.get_map().get(key).map(|&i| &self.vec[i].1) - } else { - // otherwise we find the value in the vec - // we assume the most likely position for the match is at `last_find + 1` + match &self.inner { + LazyIndexMapInner::Array(vec) => vec.get(key), + LazyIndexMapInner::Map(map) => map.get(key), + } + } + + pub fn keys(&self) -> impl Iterator { + self.iter().map(|(k, _)| k) + } + + pub fn iter(&self) -> impl Iterator { + match &self.inner { + LazyIndexMapInner::Array(vec) => { + // SAFETY: data is known to be initialized up to len + let data = vec.data(); + let mask = vec.duplicates_mask().clone(); + LazyIndexMapIter::Vec { + iter: data.iter(), + mask: mask.into_iter(), + } + } + LazyIndexMapInner::Map(map) => LazyIndexMapIter::Map(map.iter()), + } + } +} + +mod index_map_vec { + use super::*; + + pub(super) struct LazyIndexMapArray { + data: Box<[MaybeUninit<(K, V)>; HASHMAP_THRESHOLD]>, + len: usize, + last_find: AtomicUsize, + duplicates_mask: DuplicatesMask, + } + + type DuplicatesMask = bitvec::BitArr!(for HASHMAP_THRESHOLD, in Cell); + + impl LazyIndexMapArray { + pub fn new() -> Self { + Self { + data: boxed_uninit_array(), + len: 0, + last_find: AtomicUsize::new(0), + duplicates_mask: DuplicatesMask::ZERO, + } + } + + pub fn data(&self) -> &[(K, V)] { + // SAFETY: data is known to be initialized up to len + unsafe { std::slice::from_raw_parts(self.data.as_ptr().cast::<(K, V)>(), self.len) } + } + } + + impl LazyIndexMapArray { + /// If the vec is full, returns the key-value pair that was not inserted. + pub fn insert(&mut self, key: K, value: V) -> Option<(K, V)> { + if self.len >= HASHMAP_THRESHOLD { + return Some((key, value)); + } + self.data[self.len].write((key, value)); + self.len += 1; + // clear cached mask + self.duplicates_mask = DuplicatesMask::ZERO; + None + } + + pub fn get(&self, key: &Q) -> Option<&V> + where + K: Borrow + PartialEq, + Q: Hash + Eq + ?Sized, + { + if self.len == 0 { + return None; + } + + let data = self.data(); + let mask = self.duplicates_mask(); + let first_try = self.last_find.load(Ordering::Relaxed) + 1; - for i in first_try..first_try + vec_len { - let index = i % vec_len; - let (k, v) = &self.vec[index]; - if k == key { + for i in first_try..first_try + data.len() { + let index = i % data.len(); + if !mask[index] { + continue; + } + let (k, v) = &data[index]; + if k.borrow() == key { self.last_find.store(index, Ordering::Relaxed); return Some(v); } } None } - } - pub fn keys(&self) -> impl Iterator { - self.vec.iter().map(|(k, _)| k) - } + pub fn into_complete_data(self) -> impl IntoIterator { + self.data + .into_iter() + .take(self.len) + // SAFETY: reading initialized section only + .map(|x| unsafe { x.assume_init() }) + } - pub fn iter(&self) -> SliceIter<'_, (K, V)> { - self.vec.iter() + pub fn duplicates_mask(&self) -> &DuplicatesMask { + let data = self.data(); + if self.duplicates_mask == DuplicatesMask::ZERO { + let new_mask = build_duplicates_mask(data); + // FIXME: is there a way to write the whole thing at once? + for i in 0..data.len() { + self.duplicates_mask.set_aliased(i, new_mask[i]); + } + } + &self.duplicates_mask + } } - pub fn iter_unique(&self) -> impl Iterator { - IterUnique { - vec: &self.vec, - map: self.get_map(), - index: 0, + impl<'j> LazyIndexMapArray, crate::JsonValue<'j>> { + pub fn to_static(&self) -> LazyIndexMapArray, crate::JsonValue<'static>> { + let mut new_data = boxed_uninit_array(); + for (i, (k, v)) in self.data().iter().enumerate() { + new_data[i] = MaybeUninit::new((Cow::Owned(k.to_string()), v.to_static())); + } + LazyIndexMapArray { + data: new_data, + len: self.len, + last_find: AtomicUsize::new(self.last_find.load(Ordering::Relaxed)), + duplicates_mask: self.duplicates_mask.clone(), + } } } - fn get_map(&self) -> &AHashMap { - self.map.get_or_init(|| { - self.vec - .iter() - .enumerate() - .map(|(index, (key, _))| (key.clone(), index)) - .collect() - }) + impl Clone for LazyIndexMapArray { + fn clone(&self) -> Self { + let mut new_data = boxed_uninit_array(); + for (i, value) in self.data().iter().enumerate() { + // SAFETY: initialized up to i + new_data[i] = MaybeUninit::new(value.clone()); + } + LazyIndexMapArray { + data: new_data, + len: self.len, + last_find: AtomicUsize::new(self.last_find.load(Ordering::Relaxed)), + duplicates_mask: self.duplicates_mask.clone(), + } + } } -} -impl<'j> LazyIndexMap, crate::JsonValue<'j>> { - pub(crate) fn to_static(&self) -> LazyIndexMap, crate::JsonValue<'static>> { - LazyIndexMap { - vec: self - .vec - .iter() - .map(|(k, v)| (k.to_string().into(), v.to_static())) - .collect(), - map: OnceLock::new(), - last_find: AtomicUsize::new(0), + fn build_duplicates_mask(data: &[(K, V)]) -> bitvec::BitArr!(for HASHMAP_THRESHOLD, in u16) { + let hashes_and_indices: &mut [(u64, usize)] = &mut [(0u64, 0usize); HASHMAP_THRESHOLD][..data.len()]; + let mut mask = bitvec::bitarr![u16, Lsb0; 1; HASHMAP_THRESHOLD]; + for (i, (k, _)) in data.iter().enumerate() { + // SAFETY: data is known to be initialized + let mut hasher = DefaultHasher::new(); + k.hash(&mut hasher); + let hash = hasher.finish(); + hashes_and_indices[i] = (hash, i); } + hashes_and_indices.sort_unstable(); + + for i in 0..data.len() { + let (hash, index) = hashes_and_indices[i]; + for (next_hash, next_index) in hashes_and_indices.iter().skip(i + 1) { + if *next_hash != hash { + break; + } + // is a duplicate key; prefer the later element + if data[*next_index].0 == data[index].0 { + mask.set(index, false); + break; + } + } + } + mask } -} -impl PartialEq for LazyIndexMap { - fn eq(&self, other: &Self) -> bool { - self.vec == other.vec + // in the future this should be Box::new([const { MaybeUninit::::uninit() }; 16]); + // waiting on inline const expressions to be on stable + fn boxed_uninit_array() -> Box<[MaybeUninit; 16]> { + Box::new([ + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + MaybeUninit::uninit(), + ]) } } -struct IterUnique<'a, K, V> { - vec: &'a SmallVec<(K, V), 8>, - map: &'a AHashMap, - index: usize, +use index_map_vec::LazyIndexMapArray; + +enum LazyIndexMapIter<'a, K, V> { + Vec { + iter: SliceIter<'a, (K, V)>, + // to mask duplicate entries + mask: ) as IntoIterator>::IntoIter, + }, + Map(indexmap::map::Iter<'a, K, V>), } -impl<'a, K: Hash + Eq, V> Iterator for IterUnique<'a, K, V> { +impl<'a, K, V> Iterator for LazyIndexMapIter<'a, K, V> { type Item = (&'a K, &'a V); fn next(&mut self) -> Option { - while self.index < self.vec.len() { - let (k, v) = &self.vec[self.index]; - if let Some(map_index) = self.map.get(k) { - if *map_index == self.index { - self.index += 1; - return Some((k, v)); + match self { + LazyIndexMapIter::Vec { iter, mask } => { + while let Some((k, v)) = iter.next() { + let is_not_duplicate = mask.next().expect("mask covers array length"); + if is_not_duplicate { + return Some((k, v)); + } } + None } - self.index += 1; + LazyIndexMapIter::Map(iter) => iter.next(), } - None + } +} + +impl<'j> LazyIndexMap, crate::JsonValue<'j>> { + pub(crate) fn to_static(&self) -> LazyIndexMap, crate::JsonValue<'static>> { + let inner = match &self.inner { + LazyIndexMapInner::Array(vec) => LazyIndexMapInner::Array(vec.to_static()), + LazyIndexMapInner::Map(map) => LazyIndexMapInner::Map( + map.iter() + .map(|(k, v)| (Cow::Owned(k.to_string()), v.to_static())) + .collect(), + ), + }; + LazyIndexMap { inner } + } +} + +impl PartialEq for LazyIndexMap +where + K: fmt::Debug + Eq + Hash, + V: fmt::Debug, +{ + fn eq(&self, other: &Self) -> bool { + self.iter().eq(other.iter()) } } diff --git a/crates/jiter/tests/main.rs b/crates/jiter/tests/main.rs index 65d87708..4284395e 100644 --- a/crates/jiter/tests/main.rs +++ b/crates/jiter/tests/main.rs @@ -919,11 +919,11 @@ fn test_crazy_massive_int() { } #[test] -fn unique_iter_object() { +fn iter_object() { let value = JsonValue::parse(br#" {"x": 1, "x": 2} "#, false).unwrap(); if let JsonValue::Object(obj) = value { assert_eq!(obj.len(), 1); - let mut unique = obj.iter_unique(); + let mut unique = obj.iter(); let first = unique.next().unwrap(); assert_eq!(first.0, "x"); assert_eq!(first.1, &JsonValue::Int(2)); @@ -934,11 +934,11 @@ fn unique_iter_object() { } #[test] -fn unique_iter_object_repeat() { +fn iter_object_repeat() { let value = JsonValue::parse(br#" {"x": 1, "x": 1} "#, false).unwrap(); if let JsonValue::Object(obj) = value { assert_eq!(obj.len(), 1); - let mut unique = obj.iter_unique(); + let mut unique = obj.iter(); let first = unique.next().unwrap(); assert_eq!(first.0, "x"); assert_eq!(first.1, &JsonValue::Int(1)); From 9c67b85399604fcaaafc37f1003b40469207b496 Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Mon, 9 Sep 2024 16:08:43 +0100 Subject: [PATCH 3/6] use atomic mask --- crates/jiter/src/lazy_index_map.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/crates/jiter/src/lazy_index_map.rs b/crates/jiter/src/lazy_index_map.rs index 922f8f9f..31324c72 100644 --- a/crates/jiter/src/lazy_index_map.rs +++ b/crates/jiter/src/lazy_index_map.rs @@ -4,7 +4,7 @@ use std::fmt; use std::hash::{DefaultHasher, Hash, Hasher}; use std::mem::MaybeUninit; use std::slice::Iter as SliceIter; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicU16, AtomicUsize, Ordering}; use ahash::RandomState; use bitvec::order::Lsb0; @@ -144,6 +144,8 @@ where } mod index_map_vec { + use std::sync::atomic::AtomicU16; + use super::*; pub(super) struct LazyIndexMapArray { @@ -153,7 +155,7 @@ mod index_map_vec { duplicates_mask: DuplicatesMask, } - type DuplicatesMask = bitvec::BitArr!(for HASHMAP_THRESHOLD, in Cell); + type DuplicatesMask = bitvec::BitArr!(for HASHMAP_THRESHOLD, in AtomicU16); impl LazyIndexMapArray { pub fn new() -> Self { @@ -321,7 +323,7 @@ enum LazyIndexMapIter<'a, K, V> { Vec { iter: SliceIter<'a, (K, V)>, // to mask duplicate entries - mask: ) as IntoIterator>::IntoIter, + mask: ::IntoIter, }, Map(indexmap::map::Iter<'a, K, V>), } From 1534be3ece2b7b9860e78a8f510b854d8108f506 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Mon, 9 Sep 2024 16:30:47 +0100 Subject: [PATCH 4/6] fix linting --- crates/fuzz/fuzz_targets/compare_to_serde.rs | 2 +- crates/jiter/src/lazy_index_map.rs | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/crates/fuzz/fuzz_targets/compare_to_serde.rs b/crates/fuzz/fuzz_targets/compare_to_serde.rs index 53aad191..900932e8 100644 --- a/crates/fuzz/fuzz_targets/compare_to_serde.rs +++ b/crates/fuzz/fuzz_targets/compare_to_serde.rs @@ -30,7 +30,7 @@ pub fn values_equal(jiter_value: &JiterValue, serde_value: &SerdeValue) -> bool if o1.len() != o2.len() { return false; } - for (k1, v1) in o1.iter_unique() { + for (k1, v1) in o1.iter() { if let Some(v2) = o2.get::(k1.as_ref()) { if !values_equal(v1, v2) { return false; diff --git a/crates/jiter/src/lazy_index_map.rs b/crates/jiter/src/lazy_index_map.rs index 31324c72..3c439cb9 100644 --- a/crates/jiter/src/lazy_index_map.rs +++ b/crates/jiter/src/lazy_index_map.rs @@ -1,13 +1,10 @@ use std::borrow::{Borrow, Cow}; -use std::cell::Cell; use std::fmt; -use std::hash::{DefaultHasher, Hash, Hasher}; -use std::mem::MaybeUninit; +use std::hash::Hash; use std::slice::Iter as SliceIter; -use std::sync::atomic::{AtomicU16, AtomicUsize, Ordering}; +use std::sync::atomic::AtomicU16; use ahash::RandomState; -use bitvec::order::Lsb0; use indexmap::IndexMap; /// Like [IndexMap](https://docs.rs/indexmap/latest/indexmap/) but only builds the lookup map when it's needed. @@ -144,9 +141,13 @@ where } mod index_map_vec { - use std::sync::atomic::AtomicU16; + use bitvec::order::Lsb0; + use std::borrow::{Borrow, Cow}; + use std::hash::{DefaultHasher, Hash, Hasher}; + use std::mem::MaybeUninit; + use std::sync::atomic::{AtomicU16, AtomicUsize, Ordering}; - use super::*; + use super::HASHMAP_THRESHOLD; pub(super) struct LazyIndexMapArray { data: Box<[MaybeUninit<(K, V)>; HASHMAP_THRESHOLD]>, @@ -334,7 +335,7 @@ impl<'a, K, V> Iterator for LazyIndexMapIter<'a, K, V> { fn next(&mut self) -> Option { match self { LazyIndexMapIter::Vec { iter, mask } => { - while let Some((k, v)) = iter.next() { + for (k, v) in iter.by_ref() { let is_not_duplicate = mask.next().expect("mask covers array length"); if is_not_duplicate { return Some((k, v)); From e4a1041b75bda502cf5cf941fe1b4b5b8fb143cf Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Tue, 10 Sep 2024 12:28:27 +0100 Subject: [PATCH 5/6] fix memory and perf bugs --- crates/jiter/src/lazy_index_map.rs | 79 ++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 20 deletions(-) diff --git a/crates/jiter/src/lazy_index_map.rs b/crates/jiter/src/lazy_index_map.rs index 3c439cb9..3d3e9125 100644 --- a/crates/jiter/src/lazy_index_map.rs +++ b/crates/jiter/src/lazy_index_map.rs @@ -2,7 +2,6 @@ use std::borrow::{Borrow, Cow}; use std::fmt; use std::hash::Hash; use std::slice::Iter as SliceIter; -use std::sync::atomic::AtomicU16; use ahash::RandomState; use indexmap::IndexMap; @@ -73,7 +72,7 @@ where let LazyIndexMapInner::Map(map) = &mut self.inner else { unreachable!("just set to be a map"); }; - for (k, v) in vec.into_complete_data() { + for (k, v) in vec { map.insert(k, v); } map.insert(key, value); @@ -129,7 +128,7 @@ where LazyIndexMapInner::Array(vec) => { // SAFETY: data is known to be initialized up to len let data = vec.data(); - let mask = vec.duplicates_mask().clone(); + let mask = vec.duplicates_mask(); LazyIndexMapIter::Vec { iter: data.iter(), mask: mask.into_iter(), @@ -142,9 +141,11 @@ where mod index_map_vec { use bitvec::order::Lsb0; + use bitvec::view::BitViewSized; + use bitvec::BitArr; use std::borrow::{Borrow, Cow}; use std::hash::{DefaultHasher, Hash, Hasher}; - use std::mem::MaybeUninit; + use std::mem::{ManuallyDrop, MaybeUninit}; use std::sync::atomic::{AtomicU16, AtomicUsize, Ordering}; use super::HASHMAP_THRESHOLD; @@ -156,7 +157,7 @@ mod index_map_vec { duplicates_mask: DuplicatesMask, } - type DuplicatesMask = bitvec::BitArr!(for HASHMAP_THRESHOLD, in AtomicU16); + type DuplicatesMask = BitArr!(for HASHMAP_THRESHOLD, in AtomicU16); impl LazyIndexMapArray { pub fn new() -> Self { @@ -214,24 +215,15 @@ mod index_map_vec { None } - pub fn into_complete_data(self) -> impl IntoIterator { - self.data - .into_iter() - .take(self.len) - // SAFETY: reading initialized section only - .map(|x| unsafe { x.assume_init() }) - } - - pub fn duplicates_mask(&self) -> &DuplicatesMask { + pub fn duplicates_mask(&self) -> BitArr!(for HASHMAP_THRESHOLD, in u16) { let data = self.data(); if self.duplicates_mask == DuplicatesMask::ZERO { let new_mask = build_duplicates_mask(data); - // FIXME: is there a way to write the whole thing at once? - for i in 0..data.len() { - self.duplicates_mask.set_aliased(i, new_mask[i]); - } + self.duplicates_mask.data[0].store(new_mask.data[0], Ordering::Relaxed); + new_mask + } else { + [self.duplicates_mask.data[0].load(Ordering::Relaxed)].into_bitarray() } - &self.duplicates_mask } } @@ -266,6 +258,53 @@ mod index_map_vec { } } + impl IntoIterator for LazyIndexMapArray { + type Item = (K, V); + type IntoIter = LazyIndexMapArrayIterator; + + fn into_iter(self) -> Self::IntoIter { + LazyIndexMapArrayIterator { + data: ManuallyDrop::new(self), + index: 0, + } + } + } + + impl Drop for LazyIndexMapArray { + fn drop(&mut self) { + for entry in self.data.iter_mut().take(self.len) { + // SAFETY: initialized up to len + unsafe { entry.assume_init_drop() } + } + } + } + + pub struct LazyIndexMapArrayIterator { + data: ManuallyDrop>, + index: usize, + } + + impl Iterator for LazyIndexMapArrayIterator { + type Item = (K, V); + + fn next(&mut self) -> Option { + if self.index < self.data.len { + let value = unsafe { self.data.data[self.index].assume_init_read() }; + self.index += 1; + Some(value) + } else { + None + } + } + } + + impl Drop for LazyIndexMapArrayIterator { + fn drop(&mut self) { + // consume the remaining values + for _ in self {} + } + } + fn build_duplicates_mask(data: &[(K, V)]) -> bitvec::BitArr!(for HASHMAP_THRESHOLD, in u16) { let hashes_and_indices: &mut [(u64, usize)] = &mut [(0u64, 0usize); HASHMAP_THRESHOLD][..data.len()]; let mut mask = bitvec::bitarr![u16, Lsb0; 1; HASHMAP_THRESHOLD]; @@ -324,7 +363,7 @@ enum LazyIndexMapIter<'a, K, V> { Vec { iter: SliceIter<'a, (K, V)>, // to mask duplicate entries - mask: ::IntoIter, + mask: ::IntoIter, }, Map(indexmap::map::Iter<'a, K, V>), } From e054b28f5e5bf4935f6a3debd783cbb93684b8fa Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Tue, 10 Sep 2024 12:29:12 +0100 Subject: [PATCH 6/6] clippy --- crates/jiter/src/lazy_index_map.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/jiter/src/lazy_index_map.rs b/crates/jiter/src/lazy_index_map.rs index 3d3e9125..3beee6de 100644 --- a/crates/jiter/src/lazy_index_map.rs +++ b/crates/jiter/src/lazy_index_map.rs @@ -217,7 +217,7 @@ mod index_map_vec { pub fn duplicates_mask(&self) -> BitArr!(for HASHMAP_THRESHOLD, in u16) { let data = self.data(); - if self.duplicates_mask == DuplicatesMask::ZERO { + if self.duplicates_mask == { DuplicatesMask::ZERO } { let new_mask = build_duplicates_mask(data); self.duplicates_mask.data[0].store(new_mask.data[0], Ordering::Relaxed); new_mask