From f63722094b92019f365135bc2792f8a5997afbeb Mon Sep 17 00:00:00 2001
From: Levi Morrison
Date: Tue, 7 May 2024 22:16:35 -0600
Subject: [PATCH 1/3] perf: increase min buckets on very small types

Consider `HashSet<u8>` on x86_64 with SSE with various bucket sizes,
and how many bytes each allocation ends up being:

| buckets | capacity | allocated bytes |
| ------- | -------- | --------------- |
| 4       | 3        | 36              |
| 8       | 7        | 40              |
| 16      | 14       | 48              |
| 32      | 28       | 80              |

In general, doubling the number of buckets should roughly double the
number of bytes used. However, for small bucket counts on these small
TableLayouts (4 -> 8, 8 -> 16), it doesn't happen. This edge case
arises from the padding of the control bytes and the added
Group::WIDTH. Taking the buckets from 4 to 16 (4x) only takes the
allocated bytes from 36 to 48 (~1.3x).

This platform isn't the only one with such edges. Here's aarch64 on an
M1 for the same `HashSet<u8>`:

| buckets | capacity | allocated bytes |
| ------- | -------- | --------------- |
| 4       | 3        | 20              |
| 8       | 7        | 24              |
| 16      | 14       | 40              |

Notice that going from 4 to 8 buckets adds only 4 bytes (20 -> 24)
instead of roughly doubling the allocation.

Generalized, `buckets * table_layout.size` needs to be at least as big
as `table_layout.ctrl_align`. For the cases listed above, that yields
these new minimum bucket counts:

 - x86_64 with SSE: 16
 - aarch64: 8

This is a niche optimization. However, it also removes a possible
undefined-behavior edge case in resize operations. In addition, it may
be a useful property for utilizing over-sized allocations (see
rust-lang/hashbrown#523).
---
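Notes: a standalone sketch of the new minimum applied to `cap` (a
hypothetical helper with the overflow handling omitted; the real logic
lives in `capacity_to_buckets` below):

    /// Raise a requested capacity so that the resulting power-of-two
    /// bucket count satisfies `buckets * size >= ctrl_align`.
    fn adjust_cap(cap: usize, size: usize, ctrl_align: usize) -> usize {
        // The minimum number of buckets for this layout...
        let min_buckets = ctrl_align / size.max(1);
        // ...expressed as a capacity at the 7/8 maximum load factor.
        let min_cap = min_buckets * 7 / 8;
        min_cap.max(cap)
    }

For x86_64 with SSE (size = 1, ctrl_align = 16), adjust_cap(1, 1, 16)
returns 14, which maps to 16 buckets; for aarch64 (size = 1,
ctrl_align = 8), it returns 7, which maps to 8 buckets.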
 src/raw/mod.rs | 66 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 13 deletions(-)

diff --git a/src/raw/mod.rs b/src/raw/mod.rs
index c8e8e2912..5cfd4a3dc 100644
--- a/src/raw/mod.rs
+++ b/src/raw/mod.rs
@@ -192,14 +192,40 @@ impl ProbeSeq {
 // Workaround for emscripten bug emscripten-core/emscripten-fastcomp#258
 #[cfg_attr(target_os = "emscripten", inline(never))]
 #[cfg_attr(not(target_os = "emscripten"), inline)]
-fn capacity_to_buckets(cap: usize) -> Option<usize> {
+fn capacity_to_buckets(cap: usize, table_layout: TableLayout) -> Option<usize> {
     debug_assert_ne!(cap, 0);
 
+    // todo: what happens with ZSTs?
+    // Consider a small layout like TableLayout { size: 1, ctrl_align: 16 } on
+    // a platform with Group::WIDTH of 16 (like x86_64 with SSE2). For small
+    // bucket sizes, this ends up wasting quite a few bytes just to pad to the
+    // relatively larger ctrl_align:
+    //
+    // | capacity | buckets | bytes allocated | bytes per item |
+    // | -------- | ------- | --------------- | -------------- |
+    // |        3 |       4 |              36 | (Yikes!)  12.0 |
+    // |        7 |       8 |              40 | (Poor)     5.7 |
+    // |       14 |      16 |              48 |            3.4 |
+    // |       28 |      32 |              80 |            3.3 |
+    //
+    // The ratio of ctrl_align / size is used to set a minimum bucket count
+    // so that padding to the alignment doesn't dominate the allocation.
+    let cap = {
+        let q = table_layout.ctrl_align / table_layout.size.max(1);
+        match q.checked_mul(7) {
+            None => cap,
+            Some(x) => {
+                let adjusted = x / 8;
+                adjusted.max(cap)
+            }
+        }
+    };
+
     // For small tables we require at least 1 empty bucket so that lookups are
     // guaranteed to terminate if an element doesn't exist in the table.
     if cap < 8 {
         // We don't bother with a table size of 2 buckets since that can only
-        // hold a single element. Instead we skip directly to a 4 bucket table
+        // hold a single element. Instead, skip directly to a 4 bucket table
         // which can hold 3 elements.
         return Some(if cap < 4 { 4 } else { 8 });
     }
 
@@ -1126,7 +1152,7 @@ impl<T, A: Allocator> RawTable<T, A> {
         // elements. If the calculation overflows then the requested bucket
         // count must be larger than what we have right now and nothing needs
         // to be done.
-        let min_buckets = match capacity_to_buckets(min_size) {
+        let min_buckets = match capacity_to_buckets(min_size, Self::TABLE_LAYOUT) {
             Some(buckets) => buckets,
             None => return,
         };
@@ -1257,14 +1283,8 @@ impl<T, A: Allocator> RawTable<T, A> {
     /// * If `self.table.items != 0`, calling of this function with `capacity`
     ///   equal to 0 (`capacity == 0`) results in [`undefined behavior`].
     ///
-    /// * If `capacity_to_buckets(capacity) < Group::WIDTH` and
-    ///   `self.table.items > capacity_to_buckets(capacity)`
-    ///   calling this function results in [`undefined behavior`].
-    ///
-    /// * If `capacity_to_buckets(capacity) >= Group::WIDTH` and
-    ///   `self.table.items > capacity_to_buckets(capacity)`
-    ///   calling this function are never return (will go into an
-    ///   infinite loop).
+    /// * If `self.table.items > capacity_to_buckets(capacity, Self::TABLE_LAYOUT)`,
+    ///   calling this function will never return (it will loop infinitely).
     ///
     /// See [`RawTableInner::find_insert_slot`] for more information.
 
@@ -1782,8 +1802,8 @@ impl RawTableInner {
         // SAFETY: We checked that we could successfully allocate the new table, and then
         // initialized all control bytes with the constant `EMPTY` byte.
         unsafe {
-            let buckets =
-                capacity_to_buckets(capacity).ok_or_else(|| fallibility.capacity_overflow())?;
+            let buckets = capacity_to_buckets(capacity, table_layout)
+                .ok_or_else(|| fallibility.capacity_overflow())?;
 
             let result = Self::new_uninitialized(alloc, table_layout, buckets, fallibility)?;
             // SAFETY: We checked that the table is allocated and therefore the table already has
@@ -4566,6 +4586,26 @@ impl<T, A: Allocator> RawExtractIf<'_, T, A> {
 mod test_map {
     use super::*;
 
+    #[test]
+    fn test_minimum_capacity_for_small_types() {
+        #[track_caller]
+        fn test_t<T>() {
+            let raw_table: RawTable<T> = RawTable::with_capacity(1);
+            let actual_buckets = raw_table.buckets();
+            let min_buckets = Group::WIDTH / core::mem::size_of::<T>();
+            assert!(
+                actual_buckets >= min_buckets,
+                "expected at least {min_buckets} buckets, got {actual_buckets} buckets"
+            );
+        }
+
+        test_t::<u8>();
+
+        // This is only "small" for some platforms, like x86_64 with SSE2, but
+        // there's no harm in running it on other platforms.
+        test_t::<u16>();
+    }
+
     fn rehash_in_place<T>(table: &mut RawTable<T>, hasher: impl Fn(&T) -> u64) {
         unsafe {
             table.table.rehash_in_place(

From 89f6d1f117f3f8e54bce5351767b340e0ce6e2bd Mon Sep 17 00:00:00 2001
From: Levi Morrison
Date: Wed, 8 May 2024 09:34:47 -0600
Subject: [PATCH 2/3] feat: recognize and use over-sized allocations

Allocators are allowed to return a larger memory chunk than was asked
for. If the extra amount is large enough, the hash map can use the
extra space. The Global allocator will not hit this path, because it
won't over-size enough to matter, but custom allocators may. An
example of an allocator which allocates full system pages is included
in the test suite (Unix only, because it uses `mmap`).
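As a worked example with hypothetical numbers: for a table layout with
size = 8 and ctrl_align = 16 on a platform whose Group::WIDTH is 16, a
128-bucket table asks for 128 * 8 + 128 + 16 = 1168 bytes. If the
allocator hands back a full 4096-byte page instead, that same block
can hold 256 buckets (256 * 8 + 256 + 16 = 2320 <= 4096), so the table
adopts the larger bucket count and its capacity roughly doubles at no
extra cost.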
---
 Cargo.toml       |   3 ++
 src/map.rs       | 100 +++++++++++++++++++++++++++++++++++++++++++++++
 src/raw/alloc.rs |  45 ++++++++++--------------
 src/raw/mod.rs   |  73 ++++++++++++++++++++++++++++++++--
 4 files changed, 190 insertions(+), 31 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 7e50b438f..68a2e2de2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -46,6 +46,9 @@ doc-comment = "0.3.1"
 bumpalo = { version = "3.13.0", features = ["allocator-api2"] }
 rkyv = { version = "0.7.42", features = ["validation"] }
 
+[target.'cfg(unix)'.dev-dependencies]
+libc = "0.2"
+
 [features]
 default = ["ahash", "inline-more", "allocator-api2"]
 
diff --git a/src/map.rs b/src/map.rs
index 88a826582..b1aba22c7 100644
--- a/src/map.rs
+++ b/src/map.rs
@@ -8958,3 +8958,103 @@ mod test_map {
         assert_eq!(dropped.load(Ordering::SeqCst), 0);
     }
 }
+
+#[cfg(all(test, unix))]
+mod test_map_with_mmap_allocations {
+    use super::HashMap;
+    use allocator_api2::alloc::{AllocError, Allocator};
+    use core::alloc::Layout;
+    use core::ptr::{null_mut, NonNull};
+
+    /// This is not a production-quality allocator, just good enough for
+    /// some basic tests.
+    #[derive(Clone, Copy, Debug)]
+    struct MmapAllocator {
+        /// Guaranteed to be a power of 2.
+        page_size: usize,
+    }
+
+    impl MmapAllocator {
+        fn new() -> Result<Self, AllocError> {
+            let result = unsafe { libc::sysconf(libc::_SC_PAGESIZE) };
+            if result < 1 {
+                return Err(AllocError);
+            }
+
+            let page_size = result as usize;
+            if !page_size.is_power_of_two() {
+                Err(AllocError)
+            } else {
+                Ok(Self { page_size })
+            }
+        }
+
+        fn fit_to_page_size(&self, n: usize) -> Result<usize, AllocError> {
+            // If n = 0, give a single page (wasteful, I know).
+            let n = if n == 0 { self.page_size } else { n };
+
+            match n & (self.page_size - 1) {
+                0 => Ok(n),
+                rem => n.checked_add(self.page_size - rem).ok_or(AllocError),
+            }
+        }
+    }
+
+    unsafe impl Allocator for MmapAllocator {
+        fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, AllocError> {
+            if layout.align() > self.page_size {
+                return Err(AllocError);
+            }
+
+            let size = self.fit_to_page_size(layout.size())?;
+            let null = null_mut();
+            let len = size as libc::size_t;
+            let prot = libc::PROT_READ | libc::PROT_WRITE;
+            let flags = libc::MAP_PRIVATE | libc::MAP_ANON;
+            let result = unsafe { libc::mmap(null, len, prot, flags, -1, 0) };
+
+            if result == libc::MAP_FAILED {
+                return Err(AllocError);
+            }
+
+            let addr = NonNull::new(result.cast()).ok_or(AllocError)?;
+            Ok(NonNull::slice_from_raw_parts(addr, size))
+        }
+
+        unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) {
+            // If they allocated it with this layout, it must round correctly.
+            let size = self.fit_to_page_size(layout.size()).unwrap();
+            _ = libc::munmap(ptr.as_ptr().cast(), size);
+        }
+    }
+
+    #[test]
+    fn test_tiny_allocation_gets_rounded_to_page_size() {
+        let alloc = MmapAllocator::new().unwrap();
+        let mut map: HashMap<usize, (), _, _> = HashMap::with_capacity_in(1, alloc);
+
+        let rough_bucket_size = core::mem::size_of::<(usize, (), usize)>();
+        let x = alloc.page_size / rough_bucket_size;
+        // x * ¾ should account for control bytes and also load factor, at
+        // least for realistic page sizes (4096+).
+        let min_elems = x / 4 * 3;
+        let capacity = map.capacity();
+        assert!(capacity > min_elems, "failed: {capacity} > {min_elems}");
+
+        // Fill it up.
+        for i in 0..capacity {
+            map.insert(i, ());
+        }
+        // Capacity should not have changed and it should be full.
+        assert_eq!(capacity, map.len());
+        assert_eq!(capacity, map.capacity());
+
+        // Alright, make it grow.
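+        // (One insert past the reported capacity must force a resize; the
+        // resized table's new, larger allocation is itself page-rounded.)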
+        map.insert(capacity, ());
+        assert!(
+            capacity < map.capacity(),
+            "failed: {capacity} < {}",
+            map.capacity()
+        );
+    }
+}
diff --git a/src/raw/alloc.rs b/src/raw/alloc.rs
index 15299e7b0..a9f88c259 100644
--- a/src/raw/alloc.rs
+++ b/src/raw/alloc.rs
@@ -1,4 +1,14 @@
-pub(crate) use self::inner::{do_alloc, Allocator, Global};
+pub(crate) use self::inner::{Allocator, Global};
+use crate::alloc::alloc::Layout;
+use core::ptr::NonNull;
+
+#[allow(clippy::map_err_ignore)]
+pub(crate) fn do_alloc<A: Allocator>(alloc: &A, layout: Layout) -> Result<NonNull<[u8]>, ()> {
+    match alloc.allocate(layout) {
+        Ok(ptr) => Ok(ptr),
+        Err(_) => Err(()),
+    }
+}
 
 // Nightly-case.
 // Use unstable `allocator_api` feature.
@@ -6,17 +16,7 @@ pub(crate) use self::inner::{do_alloc, Allocator, Global};
 // This is used when building for `std`.
 #[cfg(feature = "nightly")]
 mod inner {
-    use crate::alloc::alloc::Layout;
     pub use crate::alloc::alloc::{Allocator, Global};
-    use core::ptr::NonNull;
-
-    #[allow(clippy::map_err_ignore)]
-    pub(crate) fn do_alloc<A: Allocator>(alloc: &A, layout: Layout) -> Result<NonNull<u8>, ()> {
-        match alloc.allocate(layout) {
-            Ok(ptr) => Ok(ptr.as_non_null_ptr()),
-            Err(_) => Err(()),
-        }
-    }
 }
 
 // Basic non-nightly case.
@@ -27,17 +27,7 @@ mod inner {
-    use crate::alloc::alloc::Layout;
     pub use allocator_api2::alloc::{Allocator, Global};
-    use core::ptr::NonNull;
-
-    #[allow(clippy::map_err_ignore)]
-    pub(crate) fn do_alloc<A: Allocator>(alloc: &A, layout: Layout) -> Result<NonNull<u8>, ()> {
-        match alloc.allocate(layout) {
-            Ok(ptr) => Ok(ptr.cast()),
-            Err(_) => Err(()),
-        }
-    }
 }
 
 // No-defaults case.
@@ -55,7 +45,7 @@ mod inner {
     #[allow(clippy::missing_safety_doc)] // not exposed outside of this crate
     pub unsafe trait Allocator {
-        fn allocate(&self, layout: Layout) -> Result<NonNull<u8>, ()>;
+        fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, ()>;
         unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout);
     }
 
@@ -64,8 +54,11 @@ mod inner {
     unsafe impl Allocator for Global {
         #[inline]
-        fn allocate(&self, layout: Layout) -> Result<NonNull<u8>, ()> {
-            unsafe { NonNull::new(alloc(layout)).ok_or(()) }
+        fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, ()> {
+            match unsafe { NonNull::new(alloc(layout)) } {
+                Some(ptr) => Ok(NonNull::slice_from_raw_parts(ptr, layout.size())),
+                None => Err(()),
+            }
         }
         #[inline]
         unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) {
@@ -79,8 +72,4 @@ mod inner {
             Global
         }
     }
-
-    pub(crate) fn do_alloc<A: Allocator>(alloc: &A, layout: Layout) -> Result<NonNull<u8>, ()> {
-        alloc.allocate(layout)
-    }
 }
diff --git a/src/raw/mod.rs b/src/raw/mod.rs
index 5cfd4a3dc..edad217da 100644
--- a/src/raw/mod.rs
+++ b/src/raw/mod.rs
@@ -1736,6 +1736,40 @@ impl RawTableInner {
     }
 }
 
+/// Find the previous power of 2. If it's already a power of 2, it's unchanged.
+/// Passing zero is undefined behavior.
+fn prev_pow2(z: usize) -> usize {
+    let shift = mem::size_of::<usize>() * 8 - 1;
+    1 << (shift - (z.leading_zeros() as usize))
+}
+
+fn maximum_buckets_in(
+    allocation_size: usize,
+    table_layout: TableLayout,
+    group_width: usize,
+) -> usize {
+    // Given an equation like:
+    //   z >= x * y + x + g
+    // x can be maximized by doing:
+    //   x = (z - g) / (y + 1)
+    // If you squint:
+    //   x is the number of buckets
+    //   y is the table_layout.size
+    //   z is the size of the allocation
+    //   g is the group width
+    // But this is ignoring the padding needed for ctrl_align.
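+    // (As a worked example with hypothetical numbers: for an allocation of
+    // z = 4096 bytes, a bucket size of y = 8, and a group width of g = 16,
+    // x = (4096 - 16) / (8 + 1) = 453, which prev_pow2 rounds down to 256.)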
+    // If we remember these restrictions:
+    //   x is always a power of 2
+    //   Layout size for T must always be a multiple of T
+    // Then the alignment can be ignored if we add the constraint:
+    //   x * y >= table_layout.ctrl_align
+    // This is taken care of by `capacity_to_buckets`.
+    let numerator = allocation_size - group_width;
+    let denominator = table_layout.size + 1; // todo: ZSTs?
+    let quotient = numerator / denominator;
+    prev_pow2(quotient)
+}
+
 impl RawTableInner {
     /// Allocates a new [`RawTableInner`] with the given number of buckets.
     /// The control bytes and buckets are left uninitialized.
@@ -1753,22 +1787,38 @@ impl RawTableInner {
     unsafe fn new_uninitialized<A>(
         alloc: &A,
         table_layout: TableLayout,
-        buckets: usize,
+        mut buckets: usize,
         fallibility: Fallibility,
     ) -> Result<Self, TryReserveError>
     where
         A: Allocator,
     {
         debug_assert!(buckets.is_power_of_two());
 
         // Avoid `Option::ok_or_else` because it bloats LLVM IR.
-        let (layout, ctrl_offset) = match table_layout.calculate_layout_for(buckets) {
+        let (layout, mut ctrl_offset) = match table_layout.calculate_layout_for(buckets) {
             Some(lco) => lco,
             None => return Err(fallibility.capacity_overflow()),
         };
 
         let ptr: NonNull<u8> = match do_alloc(alloc, layout) {
-            Ok(block) => block.cast(),
+            Ok(block) => {
+                // Utilize over-sized allocations.
+                let x = maximum_buckets_in(block.len(), table_layout, Group::WIDTH);
+                debug_assert!(x >= buckets);
+                // Calculate the new ctrl_offset.
+                let (_oversized_layout, oversized_ctrl_offset) =
+                    match table_layout.calculate_layout_for(x) {
+                        Some(lco) => lco,
+                        None => unsafe { hint::unreachable_unchecked() },
+                    };
+                debug_assert!(_oversized_layout.size() <= block.len());
+                debug_assert!(oversized_ctrl_offset >= ctrl_offset);
+                ctrl_offset = oversized_ctrl_offset;
+                buckets = x;
+
+                block.cast()
+            }
             Err(_) => return Err(fallibility.alloc_err(layout)),
         };
 
@@ -4586,6 +4636,23 @@ impl<T, A: Allocator> RawExtractIf<'_, T, A> {
 mod test_map {
     use super::*;
 
+    #[test]
+    fn test_prev_pow2() {
+        // Skip 0; prev_pow2 is not defined for that input.
+        let mut pow2: usize = 1;
+        while (pow2 << 1) > 0 {
+            let next_pow2 = pow2 << 1;
+            assert_eq!(pow2, prev_pow2(pow2));
+            // Need to skip 2, because it's also a power of 2, so it doesn't
+            // return the previous power of 2.
+            if next_pow2 > 2 {
+                assert_eq!(pow2, prev_pow2(pow2 + 1));
+                assert_eq!(pow2, prev_pow2(next_pow2 - 1));
+            }
+            pow2 = next_pow2;
+        }
+    }
+
     #[test]
     fn test_minimum_capacity_for_small_types() {
         #[track_caller]

From 15fcd44abb8effcce25b511e523ea8479b328b08 Mon Sep 17 00:00:00 2001
From: Levi Morrison
Date: Fri, 17 May 2024 21:27:39 -0600
Subject: [PATCH 3/3] fix test accuracy

---
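Notes: a worked check of the new lower bound, assuming a hypothetical
4096-byte page and a 64-bit target:

    rough_bucket_size = size_of::<(usize, ())>() + 1 = 9
    overhead          = 3 * size_of::<usize>()       = 24
    num_buckets       = (4096 - 24) / 9              = 452
    min_elems         = prev_pow2(452)               = 256
    min_capacity      = 256 >> 1                     = 128

The table itself ends up with prev_pow2((4096 - 16) / 9) = 256 buckets
via `maximum_buckets_in`, and its capacity at the 7/8 load factor is
224, comfortably above the 128 bound.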
 src/map.rs     | 22 ++++++++++++++++------
 src/raw/mod.rs |  2 +-
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/map.rs b/src/map.rs
index b1aba22c7..7f3b58067 100644
--- a/src/map.rs
+++ b/src/map.rs
@@ -8962,6 +8962,7 @@ mod test_map {
 #[cfg(all(test, unix))]
 mod test_map_with_mmap_allocations {
     use super::HashMap;
+    use crate::raw::prev_pow2;
     use allocator_api2::alloc::{AllocError, Allocator};
     use core::alloc::Layout;
     use core::ptr::{null_mut, NonNull};
@@ -9033,13 +9034,22 @@ mod test_map_with_mmap_allocations {
         let alloc = MmapAllocator::new().unwrap();
         let mut map: HashMap<usize, (), _, _> = HashMap::with_capacity_in(1, alloc);
 
-        let rough_bucket_size = core::mem::size_of::<(usize, (), usize)>();
-        let x = alloc.page_size / rough_bucket_size;
-        // x * ¾ should account for control bytes and also load factor, at
-        // least for realistic page sizes (4096+).
-        let min_elems = x / 4 * 3;
+        // Size of an element plus its control byte.
+        let rough_bucket_size = core::mem::size_of::<(usize, ())>() + 1;
+
+        // Accounting for some misc. padding that's likely in the allocation
+        // due to rounding to the group width, etc.
+        let overhead = 3 * core::mem::size_of::<usize>();
+        let num_buckets = (alloc.page_size - overhead) / rough_bucket_size;
+        // Buckets are always powers of 2.
+        let min_elems = prev_pow2(num_buckets);
+        // The real load factor is 7/8, but this is a lower estimate, so 1/2.
+        let min_capacity = min_elems >> 1;
         let capacity = map.capacity();
-        assert!(capacity > min_elems, "failed: {capacity} > {min_elems}");
+        assert!(
+            capacity >= min_capacity,
+            "failed: {capacity} >= {min_capacity}"
+        );
 
         // Fill it up.
         for i in 0..capacity {

diff --git a/src/raw/mod.rs b/src/raw/mod.rs
index edad217da..f4fccde9a 100644
--- a/src/raw/mod.rs
+++ b/src/raw/mod.rs
@@ -1738,7 +1738,7 @@ impl RawTableInner {
 
 /// Find the previous power of 2. If it's already a power of 2, it's unchanged.
 /// Passing zero is undefined behavior.
-fn prev_pow2(z: usize) -> usize {
+pub(crate) fn prev_pow2(z: usize) -> usize {
     let shift = mem::size_of::<usize>() * 8 - 1;
     1 << (shift - (z.leading_zeros() as usize))
 }