diff --git a/Cargo.toml b/Cargo.toml index 7e50b438f..68a2e2de2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,6 +46,9 @@ doc-comment = "0.3.1" bumpalo = { version = "3.13.0", features = ["allocator-api2"] } rkyv = { version = "0.7.42", features = ["validation"] } +[target.'cfg(unix)'.dev-dependencies] +libc = "0.2" + [features] default = ["ahash", "inline-more", "allocator-api2"] diff --git a/src/map.rs b/src/map.rs index 88a826582..7f3b58067 100644 --- a/src/map.rs +++ b/src/map.rs @@ -8958,3 +8958,113 @@ mod test_map { assert_eq!(dropped.load(Ordering::SeqCst), 0); } } + +#[cfg(all(test, unix))] +mod test_map_with_mmap_allocations { + use super::HashMap; + use crate::raw::prev_pow2; + use allocator_api2::alloc::{AllocError, Allocator}; + use core::alloc::Layout; + use core::ptr::{null_mut, NonNull}; + + /// This is not a production quality allocator, just good enough for + /// some basic tests. + #[derive(Clone, Copy, Debug)] + struct MmapAllocator { + /// Guarantee this is a power of 2. + page_size: usize, + } + + impl MmapAllocator { + fn new() -> Result { + let result = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }; + if result < 1 { + return Err(AllocError); + } + + let page_size = result as usize; + if !page_size.is_power_of_two() { + Err(AllocError) + } else { + Ok(Self { page_size }) + } + } + + fn fit_to_page_size(&self, n: usize) -> Result { + // If n=0, give a single page (wasteful, I know). 
+ let n = if n == 0 { self.page_size } else { n }; + + match n & (self.page_size - 1) { + 0 => Ok(n), + rem => n.checked_add(self.page_size - rem).ok_or(AllocError), + } + } + } + + unsafe impl Allocator for MmapAllocator { + fn allocate(&self, layout: Layout) -> Result, AllocError> { + if layout.align() > self.page_size { + return Err(AllocError); + } + + let size = self.fit_to_page_size(layout.size())?; + let null = null_mut(); + let len = size as libc::size_t; + let prot = libc::PROT_READ | libc::PROT_WRITE; + let flags = libc::MAP_PRIVATE | libc::MAP_ANON; + let result = unsafe { libc::mmap(null, len, prot, flags, -1, 0) }; + + if result == libc::MAP_FAILED { + return Err(AllocError); + } + + let addr = NonNull::new(result.cast()).ok_or(AllocError)?; + Ok(NonNull::slice_from_raw_parts(addr, size)) + } + + unsafe fn deallocate(&self, ptr: NonNull, layout: Layout) { + // If they allocated it with this layout, it must round correctly. + let size = self.fit_to_page_size(layout.size()).unwrap(); + _ = libc::munmap(ptr.as_ptr().cast(), size); + } + } + + #[test] + fn test_tiny_allocation_gets_rounded_to_page_size() { + let alloc = MmapAllocator::new().unwrap(); + let mut map: HashMap = HashMap::with_capacity_in(1, alloc); + + // Size of an element plus its control byte. + let rough_bucket_size = core::mem::size_of::<(usize, ())>() + 1; + + // Accounting for some misc. padding that's likely in the allocation + // due to rounding to group width, etc. + let overhead = 3 * core::mem::size_of::(); + let num_buckets = (alloc.page_size - overhead) / rough_bucket_size; + // Buckets are always powers of 2. + let min_elems = prev_pow2(num_buckets); + // Real load-factor is 7/8, but this is a lower estimation, so 1/2. + let min_capacity = min_elems >> 1; + let capacity = map.capacity(); + assert!( + capacity >= min_capacity, + "failed: {capacity} >= {min_capacity}" + ); + + // Fill it up. 
+ for i in 0..capacity { + map.insert(i, ()); + } + // Capacity should not have changed and it should be full. + assert_eq!(capacity, map.len()); + assert_eq!(capacity, map.capacity()); + + // Alright, make it grow. + map.insert(capacity, ()); + assert!( + capacity < map.capacity(), + "failed: {capacity} < {}", + map.capacity() + ); + } +} diff --git a/src/raw/alloc.rs b/src/raw/alloc.rs index 15299e7b0..a9f88c259 100644 --- a/src/raw/alloc.rs +++ b/src/raw/alloc.rs @@ -1,4 +1,14 @@ -pub(crate) use self::inner::{do_alloc, Allocator, Global}; +pub(crate) use self::inner::{Allocator, Global}; +use crate::alloc::alloc::Layout; +use core::ptr::NonNull; + +#[allow(clippy::map_err_ignore)] +pub(crate) fn do_alloc(alloc: &A, layout: Layout) -> Result, ()> { + match alloc.allocate(layout) { + Ok(ptr) => Ok(ptr), + Err(_) => Err(()), + } +} // Nightly-case. // Use unstable `allocator_api` feature. @@ -6,17 +16,7 @@ pub(crate) use self::inner::{do_alloc, Allocator, Global}; // This is used when building for `std`. #[cfg(feature = "nightly")] mod inner { - use crate::alloc::alloc::Layout; pub use crate::alloc::alloc::{Allocator, Global}; - use core::ptr::NonNull; - - #[allow(clippy::map_err_ignore)] - pub(crate) fn do_alloc(alloc: &A, layout: Layout) -> Result, ()> { - match alloc.allocate(layout) { - Ok(ptr) => Ok(ptr.as_non_null_ptr()), - Err(_) => Err(()), - } - } } // Basic non-nightly case. @@ -27,17 +27,7 @@ mod inner { // `core::alloc::Allocator`. #[cfg(all(not(feature = "nightly"), feature = "allocator-api2"))] mod inner { - use crate::alloc::alloc::Layout; pub use allocator_api2::alloc::{Allocator, Global}; - use core::ptr::NonNull; - - #[allow(clippy::map_err_ignore)] - pub(crate) fn do_alloc(alloc: &A, layout: Layout) -> Result, ()> { - match alloc.allocate(layout) { - Ok(ptr) => Ok(ptr.cast()), - Err(_) => Err(()), - } - } } // No-defaults case. 
@@ -55,7 +45,7 @@ mod inner { #[allow(clippy::missing_safety_doc)] // not exposed outside of this crate pub unsafe trait Allocator { - fn allocate(&self, layout: Layout) -> Result, ()>; + fn allocate(&self, layout: Layout) -> Result, ()>; unsafe fn deallocate(&self, ptr: NonNull, layout: Layout); } @@ -64,8 +54,11 @@ mod inner { unsafe impl Allocator for Global { #[inline] - fn allocate(&self, layout: Layout) -> Result, ()> { - unsafe { NonNull::new(alloc(layout)).ok_or(()) } + fn allocate(&self, layout: Layout) -> Result, ()> { + match unsafe { NonNull::new(alloc(layout)) } { + Some(ptr) => Ok(NonNull::slice_from_raw_parts(ptr, layout.size())), + None => Err(()), + } } #[inline] unsafe fn deallocate(&self, ptr: NonNull, layout: Layout) { @@ -79,8 +72,4 @@ mod inner { Global } } - - pub(crate) fn do_alloc(alloc: &A, layout: Layout) -> Result, ()> { - alloc.allocate(layout) - } } diff --git a/src/raw/mod.rs b/src/raw/mod.rs index c8e8e2912..f4fccde9a 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -192,14 +192,40 @@ impl ProbeSeq { // Workaround for emscripten bug emscripten-core/emscripten-fastcomp#258 #[cfg_attr(target_os = "emscripten", inline(never))] #[cfg_attr(not(target_os = "emscripten"), inline)] -fn capacity_to_buckets(cap: usize) -> Option { +fn capacity_to_buckets(cap: usize, table_layout: TableLayout) -> Option { debug_assert_ne!(cap, 0); + // todo: what happens with ZSTs? + // Consider a small layout like TableLayout { size: 1, ctrl_align: 16 } on + // a platform with Group::WIDTH of 16 (like x86_64 with SSE2). For small + // bucket sizes, this ends up wasting quite a few bytes just to pad to the + // relatively larger ctrl_align: + // + // | capacity | buckets | bytes allocated | bytes per item | + // | -------- | ------- | --------------- | -------------- | + // | 3 | 4 | 36 | (Yikes!) 
12.0 | + // | 7 | 8 | 40 | (Poor) 5.7 | + // | 14 | 16 | 48 | 3.4 | + // | 28 | 32 | 80 | 2.9 | + // + // The ratio of ctrl_align / size is used to set a minimum number of buckets so + // that padding to the alignment isn't dominating the total allocation. + let cap = { + let q = table_layout.ctrl_align / table_layout.size.max(1); + match q.checked_mul(7) { + None => cap, + Some(x) => { + let adjusted = x / 8; + adjusted.max(cap) + } + } + }; + // For small tables we require at least 1 empty bucket so that lookups are // guaranteed to terminate if an element doesn't exist in the table. if cap < 8 { // We don't bother with a table size of 2 buckets since that can only - // hold a single element. Instead we skip directly to a 4 bucket table + // hold a single element. Instead, skip directly to a 4 bucket table // which can hold 3 elements. return Some(if cap < 4 { 4 } else { 8 }); } @@ -1126,7 +1152,7 @@ impl RawTable { // elements. If the calculation overflows then the requested bucket // count must be larger than what we have right and nothing needs to be // done. - let min_buckets = match capacity_to_buckets(min_size) { + let min_buckets = match capacity_to_buckets(min_size, Self::TABLE_LAYOUT) { Some(buckets) => buckets, None => return, }; @@ -1257,14 +1283,8 @@ impl RawTable { /// * If `self.table.items != 0`, calling of this function with `capacity` /// equal to 0 (`capacity == 0`) results in [`undefined behavior`]. /// - /// * If `capacity_to_buckets(capacity) < Group::WIDTH` and - /// `self.table.items > capacity_to_buckets(capacity)` - /// calling this function results in [`undefined behavior`]. - /// - /// * If `capacity_to_buckets(capacity) >= Group::WIDTH` and - /// `self.table.items > capacity_to_buckets(capacity)` - /// calling this function are never return (will go into an - /// infinite loop). + /// * If `self.table.items > capacity_to_buckets(capacity, Self::TABLE_LAYOUT)` + /// calling this function will never return (will loop infinitely).
/// /// See [`RawTableInner::find_insert_slot`] for more information. /// @@ -1716,6 +1736,40 @@ impl RawTableInner { } } +/// Find the previous power of 2. If it's already a power of 2, it's unchanged. +/// Zero is not a valid input: the shift amount underflows (panics with overflow checks on). +pub(crate) fn prev_pow2(z: usize) -> usize { + let shift = mem::size_of::() * 8 - 1; + 1 << (shift - (z.leading_zeros() as usize)) +} + +fn maximum_buckets_in( + allocation_size: usize, + table_layout: TableLayout, + group_width: usize, +) -> usize { + // Given an equation like: + // z >= x * y + x + g + // x can be maximized by doing: + // x = (z - g) / (y + 1) + // If you squint: + // x is the number of buckets + // y is the table_layout.size + // z is the size of the allocation + // g is the group width + // But this is ignoring the padding needed for ctrl_align. + // If we remember these restrictions: + // x is always a power of 2 + // Layout size for T must always be a multiple of T's alignment + // Then the alignment can be ignored if we add the constraint: + // x * y >= table_layout.ctrl_align + // This is taken care of by `capacity_to_buckets`. + let numerator = allocation_size - group_width; + let denominator = table_layout.size + 1; // todo: ZSTs? + let quotient = numerator / denominator; + prev_pow2(quotient) +} + impl RawTableInner { /// Allocates a new [`RawTableInner`] with the given number of buckets. /// The control bytes and buckets are left uninitialized. @@ -1733,7 +1787,7 @@ impl RawTableInner { unsafe fn new_uninitialized( alloc: &A, table_layout: TableLayout, - buckets: usize, + mut buckets: usize, fallibility: Fallibility, ) -> Result where @@ -1742,13 +1796,29 @@ impl RawTableInner { debug_assert!(buckets.is_power_of_two()); // Avoid `Option::ok_or_else` because it bloats LLVM IR.
- let (layout, ctrl_offset) = match table_layout.calculate_layout_for(buckets) { + let (layout, mut ctrl_offset) = match table_layout.calculate_layout_for(buckets) { Some(lco) => lco, None => return Err(fallibility.capacity_overflow()), }; let ptr: NonNull = match do_alloc(alloc, layout) { - Ok(block) => block.cast(), + Ok(block) => { + // Utilize over-sized allocations. + let x = maximum_buckets_in(block.len(), table_layout, Group::WIDTH); + debug_assert!(x >= buckets); + // Calculate the new ctrl_offset. + let (_oversized_layout, oversized_ctrl_offset) = + match table_layout.calculate_layout_for(x) { + Some(lco) => lco, + None => unsafe { hint::unreachable_unchecked() }, + }; + debug_assert!(_oversized_layout.size() <= block.len()); + debug_assert!(oversized_ctrl_offset >= ctrl_offset); + ctrl_offset = oversized_ctrl_offset; + buckets = x; + + block.cast() + } Err(_) => return Err(fallibility.alloc_err(layout)), }; @@ -1782,8 +1852,8 @@ impl RawTableInner { // SAFETY: We checked that we could successfully allocate the new table, and then // initialized all control bytes with the constant `EMPTY` byte. unsafe { - let buckets = - capacity_to_buckets(capacity).ok_or_else(|| fallibility.capacity_overflow())?; + let buckets = capacity_to_buckets(capacity, table_layout) + .ok_or_else(|| fallibility.capacity_overflow())?; let result = Self::new_uninitialized(alloc, table_layout, buckets, fallibility)?; // SAFETY: We checked that the table is allocated and therefore the table already has @@ -4566,6 +4636,43 @@ impl RawExtractIf<'_, T, A> { mod test_map { use super::*; + #[test] + fn test_prev_pow2() { + // Skip 0, not defined for that input. + let mut pow2: usize = 1; + while (pow2 << 1) > 0 { + let next_pow2 = pow2 << 1; + assert_eq!(pow2, prev_pow2(pow2)); + // Need to skip 2, because it's also a power of 2, so it doesn't + // return the previous power of 2. 
+ if next_pow2 > 2 { + assert_eq!(pow2, prev_pow2(pow2 + 1)); + assert_eq!(pow2, prev_pow2(next_pow2 - 1)); + } + pow2 = next_pow2; + } + } + + #[test] + fn test_minimum_capacity_for_small_types() { + #[track_caller] + fn test_t() { + let raw_table: RawTable = RawTable::with_capacity(1); + let actual_buckets = raw_table.buckets(); + let min_buckets = Group::WIDTH / core::mem::size_of::(); + assert!( + actual_buckets >= min_buckets, + "expected at least {min_buckets} buckets, got {actual_buckets} buckets" + ); + } + + test_t::(); + + // This is only "small" for some platforms, like x86_64 with SSE2, but + // there's no harm in running it on other platforms. + test_t::(); + } + fn rehash_in_place(table: &mut RawTable, hasher: impl Fn(&T) -> u64) { unsafe { table.table.rehash_in_place(