diff --git a/Cargo.toml b/Cargo.toml
index 7e50b438f..68a2e2de2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -46,6 +46,9 @@ doc-comment = "0.3.1"
 bumpalo = { version = "3.13.0", features = ["allocator-api2"] }
 rkyv = { version = "0.7.42", features = ["validation"] }
 
+[target.'cfg(unix)'.dev-dependencies]
+libc = "0.2"
+
 [features]
 default = ["ahash", "inline-more", "allocator-api2"]
 
diff --git a/src/map.rs b/src/map.rs
index 88a826582..7f3b58067 100644
--- a/src/map.rs
+++ b/src/map.rs
@@ -8958,3 +8958,113 @@ mod test_map {
         assert_eq!(dropped.load(Ordering::SeqCst), 0);
     }
 }
+
+#[cfg(all(test, unix, feature = "allocator-api2", not(feature = "nightly")))]
+mod test_map_with_mmap_allocations {
+    use super::HashMap;
+    use crate::raw::prev_pow2;
+    use allocator_api2::alloc::{AllocError, Allocator};
+    use core::alloc::Layout;
+    use core::ptr::{null_mut, NonNull};
+
+    /// Test-only allocator that backs every allocation with its own anonymous
+    /// `mmap` region. Not production quality, just good enough for basic tests.
+    #[derive(Clone, Copy, Debug)]
+    struct MmapAllocator {
+        /// System page size; `new` guarantees this is a power of 2.
+        page_size: usize,
+    }
+
+    impl MmapAllocator {
+        fn new() -> Result<Self, AllocError> {
+            let result = unsafe { libc::sysconf(libc::_SC_PAGESIZE) };
+            if result < 1 {
+                return Err(AllocError);
+            }
+
+            let page_size = result as usize;
+            if !page_size.is_power_of_two() {
+                Err(AllocError)
+            } else {
+                Ok(Self { page_size })
+            }
+        }
+
+        fn fit_to_page_size(&self, n: usize) -> Result<usize, AllocError> {
+            // A zero-sized request still gets one whole page (wasteful, but simple).
+            let n = if n == 0 { self.page_size } else { n };
+
+            match n & (self.page_size - 1) {
+                0 => Ok(n),
+                rem => n.checked_add(self.page_size - rem).ok_or(AllocError),
+            }
+        }
+    }
+
+    unsafe impl Allocator for MmapAllocator {
+        fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, AllocError> {
+            if layout.align() > self.page_size {
+                return Err(AllocError);
+            }
+
+            let size = self.fit_to_page_size(layout.size())?;
+            let prot = libc::PROT_READ | libc::PROT_WRITE;
+            let flags = libc::MAP_PRIVATE | libc::MAP_ANON;
+            // SAFETY: an anonymous private mapping at a kernel-chosen address aliases nothing.
+            let result = unsafe { libc::mmap(null_mut(), size, prot, flags, -1, 0) };
+
+            if result == libc::MAP_FAILED {
+                return Err(AllocError);
+            }
+
+            let addr = NonNull::new(result.cast()).ok_or(AllocError)?;
+            Ok(NonNull::slice_from_raw_parts(addr, size))
+        }
+
+        unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) {
+            // `allocate` already rounded a fitting layout successfully, so this cannot fail.
+            let size = self.fit_to_page_size(layout.size()).unwrap();
+            // SAFETY: per the trait contract, `ptr` is a live mapping of `size` bytes.
+            _ = libc::munmap(ptr.as_ptr().cast(), size);
+        }
+    }
+
+    #[test]
+    fn test_tiny_allocation_gets_rounded_to_page_size() {
+        let alloc = MmapAllocator::new().unwrap();
+        let mut map: HashMap<usize, (), _, _> = HashMap::with_capacity_in(1, alloc);
+
+        // Size of an element plus its control byte.
+        let rough_bucket_size = core::mem::size_of::<(usize, ())>() + 1;
+
+        // Accounting for some misc. padding that's likely in the allocation
+        // due to rounding to group width, etc.
+        let overhead = 3 * core::mem::size_of::<usize>();
+        let num_buckets = (alloc.page_size - overhead) / rough_bucket_size;
+        // The bucket count is always a power of 2, so round the estimate down.
+        let min_elems = prev_pow2(num_buckets);
+        // The real load factor is 7/8; 1/2 gives a conservative lower bound.
+        let min_capacity = min_elems >> 1;
+        let capacity = map.capacity();
+        assert!(
+            capacity >= min_capacity,
+            "failed: {capacity} >= {min_capacity}"
+        );
+
+        // Fill it up.
+        for i in 0..capacity {
+            map.insert(i, ());
+        }
+        // Capacity should not have changed and it should be full.
+        assert_eq!(capacity, map.len());
+        assert_eq!(capacity, map.capacity());
+
+        // Alright, make it grow.
+        map.insert(capacity, ());
+        assert!(
+            capacity < map.capacity(),
+            "failed: {capacity} < {}",
+            map.capacity()
+        );
+    }
+}
diff --git a/src/raw/alloc.rs b/src/raw/alloc.rs
index 15299e7b0..a9f88c259 100644
--- a/src/raw/alloc.rs
+++ b/src/raw/alloc.rs
@@ -1,4 +1,14 @@
-pub(crate) use self::inner::{do_alloc, Allocator, Global};
+pub(crate) use self::inner::{Allocator, Global};
+use crate::alloc::alloc::Layout;
+use core::ptr::NonNull;
+
+/// Allocates a block for `layout` from `alloc`, reducing any failure to `()`.
+#[allow(clippy::map_err_ignore)]
+pub(crate) fn do_alloc<A: Allocator>(alloc: &A, layout: Layout) -> Result<NonNull<[u8]>, ()> {
+    // The allocator's error value is never inspected by callers, so it is
+    // replaced wholesale instead of being mapped.
+    alloc.allocate(layout).or(Err(()))
+}
 
 // Nightly-case.
 // Use unstable `allocator_api` feature.
@@ -6,17 +16,7 @@ pub(crate) use self::inner::{do_alloc, Allocator, Global};
 // This is used when building for `std`.
 #[cfg(feature = "nightly")]
 mod inner {
-    use crate::alloc::alloc::Layout;
     pub use crate::alloc::alloc::{Allocator, Global};
-    use core::ptr::NonNull;
-
-    #[allow(clippy::map_err_ignore)]
-    pub(crate) fn do_alloc<A: Allocator>(alloc: &A, layout: Layout) -> Result<NonNull<u8>, ()> {
-        match alloc.allocate(layout) {
-            Ok(ptr) => Ok(ptr.as_non_null_ptr()),
-            Err(_) => Err(()),
-        }
-    }
 }
 
 // Basic non-nightly case.
@@ -27,17 +27,7 @@ mod inner {
 // `core::alloc::Allocator`.
 #[cfg(all(not(feature = "nightly"), feature = "allocator-api2"))]
 mod inner {
-    use crate::alloc::alloc::Layout;
     pub use allocator_api2::alloc::{Allocator, Global};
-    use core::ptr::NonNull;
-
-    #[allow(clippy::map_err_ignore)]
-    pub(crate) fn do_alloc<A: Allocator>(alloc: &A, layout: Layout) -> Result<NonNull<u8>, ()> {
-        match alloc.allocate(layout) {
-            Ok(ptr) => Ok(ptr.cast()),
-            Err(_) => Err(()),
-        }
-    }
 }
 
 // No-defaults case.
@@ -55,7 +45,7 @@ mod inner {
 
     #[allow(clippy::missing_safety_doc)] // not exposed outside of this crate
     pub unsafe trait Allocator {
-        fn allocate(&self, layout: Layout) -> Result<NonNull<u8>, ()>;
+        fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, ()>;
         unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout);
     }
 
@@ -64,8 +54,11 @@ mod inner {
 
     unsafe impl Allocator for Global {
         #[inline]
-        fn allocate(&self, layout: Layout) -> Result<NonNull<u8>, ()> {
-            unsafe { NonNull::new(alloc(layout)).ok_or(()) }
+        fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, ()> {
+            match unsafe { NonNull::new(alloc(layout)) } {
+                Some(ptr) => Ok(NonNull::slice_from_raw_parts(ptr, layout.size())),
+                None => Err(()),
+            }
         }
         #[inline]
         unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) {
@@ -79,8 +72,4 @@ mod inner {
             Global
         }
     }
-
-    pub(crate) fn do_alloc<A: Allocator>(alloc: &A, layout: Layout) -> Result<NonNull<u8>, ()> {
-        alloc.allocate(layout)
-    }
 }
diff --git a/src/raw/mod.rs b/src/raw/mod.rs
index c8e8e2912..f4fccde9a 100644
--- a/src/raw/mod.rs
+++ b/src/raw/mod.rs
@@ -192,14 +192,40 @@ impl ProbeSeq {
 // Workaround for emscripten bug emscripten-core/emscripten-fastcomp#258
 #[cfg_attr(target_os = "emscripten", inline(never))]
 #[cfg_attr(not(target_os = "emscripten"), inline)]
-fn capacity_to_buckets(cap: usize) -> Option<usize> {
+fn capacity_to_buckets(cap: usize, table_layout: TableLayout) -> Option<usize> {
     debug_assert_ne!(cap, 0);
 
+    // todo: what happens with ZSTs?
+    // Consider a small layout like TableLayout { size: 1, ctrl_align: 16 } on
+    // a platform with Group::WIDTH of 16 (like x86_64 with SSE2). For small
+    // bucket sizes, this ends up wasting quite a few bytes just to pad to the
+    // relatively larger ctrl_align:
+    //
+    // | capacity | buckets | bytes allocated | bytes per item |
+    // | -------- | ------- | --------------- | -------------- |
+    // |        3 |       4 |              36 | (Yikes!)  12.0 |
+    // |        7 |       8 |              40 | (Poor)     5.7 |
+    // |       14 |      16 |              48 |            3.4 |
+    // |       28 |      32 |              80 |            2.9 |
+    //
+    // The ratio ctrl_align / size is used to set a minimum number of buckets
+    // so that padding to the alignment doesn't dominate the total allocation.
+    let cap = {
+        let q = table_layout.ctrl_align / table_layout.size.max(1);
+        match q.checked_mul(7) {
+            None => cap,
+            Some(x) => {
+                let adjusted = x / 8;
+                adjusted.max(cap)
+            }
+        }
+    };
+
     // For small tables we require at least 1 empty bucket so that lookups are
     // guaranteed to terminate if an element doesn't exist in the table.
     if cap < 8 {
         // We don't bother with a table size of 2 buckets since that can only
-        // hold a single element. Instead we skip directly to a 4 bucket table
+        // hold a single element. Instead, skip directly to a 4 bucket table
         // which can hold 3 elements.
         return Some(if cap < 4 { 4 } else { 8 });
     }
@@ -1126,7 +1152,7 @@ impl<T, A: Allocator> RawTable<T, A> {
         // elements. If the calculation overflows then the requested bucket
         // count must be larger than what we have right and nothing needs to be
         // done.
-        let min_buckets = match capacity_to_buckets(min_size) {
+        let min_buckets = match capacity_to_buckets(min_size, Self::TABLE_LAYOUT) {
             Some(buckets) => buckets,
             None => return,
         };
@@ -1257,14 +1283,8 @@ impl<T, A: Allocator> RawTable<T, A> {
     /// * If `self.table.items != 0`, calling of this function with `capacity`
     ///   equal to 0 (`capacity == 0`) results in [`undefined behavior`].
     ///
-    /// * If `capacity_to_buckets(capacity) < Group::WIDTH` and
-    ///   `self.table.items > capacity_to_buckets(capacity)`
-    ///   calling this function results in [`undefined behavior`].
-    ///
-    /// * If `capacity_to_buckets(capacity) >= Group::WIDTH` and
-    ///   `self.table.items > capacity_to_buckets(capacity)`
-    ///   calling this function are never return (will go into an
-    ///   infinite loop).
+    /// * If `self.table.items > capacity_to_buckets(capacity, Self::TABLE_LAYOUT)`,
+    ///   calling this function will never return (it will loop infinitely).
     ///
     /// See [`RawTableInner::find_insert_slot`] for more information.
     ///
@@ -1716,6 +1736,40 @@ impl RawTableInner {
     }
 }
 
+/// Rounds `z` down to a power of 2 (powers of 2 are returned unchanged).
+/// Zero is a caller bug, not undefined behavior: it trips a debug assertion.
+pub(crate) fn prev_pow2(z: usize) -> usize {
+    debug_assert_ne!(z, 0, "prev_pow2 is not defined for zero");
+    1 << (usize::BITS - 1 - z.leading_zeros())
+}
+
+/// Returns the largest power-of-two bucket count that fits in an allocation
+/// of `allocation_size` bytes, given the per-bucket `table_layout.size` and
+/// the extra `group_width` control bytes.
+///
+/// `allocation_size` must exceed `group_width` by at least one bucket plus
+/// its control byte; otherwise the arithmetic below breaks down.
+fn maximum_buckets_in(
+    allocation_size: usize,
+    table_layout: TableLayout,
+    group_width: usize,
+) -> usize {
+    // The allocation must satisfy
+    //   z >= x * y + x + g
+    // where x is the number of buckets, y is table_layout.size, z is the
+    // size of the allocation and g is the group width. The largest such x is
+    //   x = (z - g) / (y + 1)
+    // This ignores the padding needed for ctrl_align. Since x is always a
+    // power of 2 and the size of T is always a multiple of its alignment,
+    // the padding can be ignored provided that
+    //   x * y >= table_layout.ctrl_align
+    // which `capacity_to_buckets` is expected to ensure.
+    let usable = allocation_size - group_width;
+    // TODO: double-check zero-sized types (table_layout.size == 0).
+    let per_bucket = table_layout.size + 1; // data plus one control byte
+    prev_pow2(usable / per_bucket)
+}
+
 impl RawTableInner {
     /// Allocates a new [`RawTableInner`] with the given number of buckets.
     /// The control bytes and buckets are left uninitialized.
@@ -1733,7 +1787,7 @@ impl RawTableInner {
     unsafe fn new_uninitialized<A>(
         alloc: &A,
         table_layout: TableLayout,
-        buckets: usize,
+        mut buckets: usize,
         fallibility: Fallibility,
     ) -> Result<Self, TryReserveError>
     where
@@ -1742,13 +1796,29 @@ impl RawTableInner {
         debug_assert!(buckets.is_power_of_two());
 
         // Avoid `Option::ok_or_else` because it bloats LLVM IR.
-        let (layout, ctrl_offset) = match table_layout.calculate_layout_for(buckets) {
+        let (layout, mut ctrl_offset) = match table_layout.calculate_layout_for(buckets) {
             Some(lco) => lco,
             None => return Err(fallibility.capacity_overflow()),
         };
 
         let ptr: NonNull<u8> = match do_alloc(alloc, layout) {
-            Ok(block) => block.cast(),
+            Ok(block) => {
+                // Utilize over-sized allocations.
+                let x = maximum_buckets_in(block.len(), table_layout, Group::WIDTH);
+                debug_assert!(x >= buckets);
+                // Calculate the new ctrl_offset.
+                let (_oversized_layout, oversized_ctrl_offset) =
+                    match table_layout.calculate_layout_for(x) {
+                        Some(lco) => lco,
+                        None => unsafe { hint::unreachable_unchecked() },
+                    };
+                debug_assert!(_oversized_layout.size() <= block.len());
+                debug_assert!(oversized_ctrl_offset >= ctrl_offset);
+                ctrl_offset = oversized_ctrl_offset;
+                buckets = x;
+
+                block.cast()
+            }
             Err(_) => return Err(fallibility.alloc_err(layout)),
         };
 
@@ -1782,8 +1852,8 @@ impl RawTableInner {
             // SAFETY: We checked that we could successfully allocate the new table, and then
             // initialized all control bytes with the constant `EMPTY` byte.
             unsafe {
-                let buckets =
-                    capacity_to_buckets(capacity).ok_or_else(|| fallibility.capacity_overflow())?;
+                let buckets = capacity_to_buckets(capacity, table_layout)
+                    .ok_or_else(|| fallibility.capacity_overflow())?;
 
                 let result = Self::new_uninitialized(alloc, table_layout, buckets, fallibility)?;
                 // SAFETY: We checked that the table is allocated and therefore the table already has
@@ -4566,6 +4636,43 @@ impl<T, A: Allocator> RawExtractIf<'_, T, A> {
 mod test_map {
     use super::*;
 
+    #[test]
+    fn test_prev_pow2() {
+        // Zero is skipped because `prev_pow2` is not defined for it. The top
+        // bit is skipped too, since `pow2 << 1` would not fit in a `usize`.
+        for shift in 0..usize::BITS - 1 {
+            let pow2: usize = 1 << shift;
+            let next_pow2 = pow2 << 1;
+            assert_eq!(pow2, prev_pow2(pow2));
+            // For pow2 == 1, pow2 + 1 is 2, which is itself a power of 2 and
+            // so maps to itself rather than down to 1.
+            if next_pow2 > 2 {
+                assert_eq!(pow2, prev_pow2(pow2 + 1));
+                assert_eq!(pow2, prev_pow2(next_pow2 - 1));
+            }
+        }
+    }
+
+    #[test]
+    fn test_minimum_capacity_for_small_types() {
+        /// Asserts that a table of `T` created with capacity 1 still gets at
+        /// least `Group::WIDTH / size_of::<T>()` buckets.
+        #[track_caller]
+        fn test_t<T>() {
+            let min_buckets = Group::WIDTH / core::mem::size_of::<T>();
+            let actual_buckets = RawTable::<T>::with_capacity(1).buckets();
+            assert!(
+                actual_buckets >= min_buckets,
+                "expected at least {min_buckets} buckets, got {actual_buckets} buckets"
+            );
+        }
+
+        // `u16` is only "small" on platforms with a wide group, like x86_64
+        // with SSE2, but there is no harm in running it everywhere.
+        test_t::<u8>();
+        test_t::<u16>();
+    }
+
     fn rehash_in_place<T>(table: &mut RawTable<T>, hasher: impl Fn(&T) -> u64) {
         unsafe {
             table.table.rehash_in_place(