From 8615a46954b35354e4ba21d10c2a555f61ca39ac Mon Sep 17 00:00:00 2001 From: Brian Smith Date: Mon, 27 Jan 2025 13:53:32 -0800 Subject: [PATCH] arithmetic: Avoid heap & simplify alignment logic in `elem_exp_consttime`. Avoid allocating on the heap. Let the compiler do the alignment instead of manually aligning the start of the table. --- src/arithmetic.rs | 1 + src/arithmetic/bigint.rs | 52 ++++++++++++++++++++++---------- src/arithmetic/limb512aligned.rs | 47 +++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 16 deletions(-) create mode 100644 src/arithmetic/limb512aligned.rs diff --git a/src/arithmetic.rs b/src/arithmetic.rs index fb35cfefa..df7138e29 100644 --- a/src/arithmetic.rs +++ b/src/arithmetic.rs @@ -24,6 +24,7 @@ mod constant; pub mod bigint; pub(crate) mod inout; +mod limb512aligned; pub mod montgomery; mod n0; diff --git a/src/arithmetic/bigint.rs b/src/arithmetic/bigint.rs index 50e4d3a14..136bbf2c1 100644 --- a/src/arithmetic/bigint.rs +++ b/src/arithmetic/bigint.rs @@ -42,14 +42,13 @@ pub(crate) use self::{ modulusvalue::OwnedModulusValue, private_exponent::PrivateExponent, }; -use super::{inout::AliasingSlices3, montgomery::*, LimbSliceError, MAX_LIMBS}; +use super::{inout::AliasingSlices3, limb512aligned, montgomery::*, LimbSliceError, MAX_LIMBS}; use crate::{ bits::BitLength, c, error::{self, LenMismatchError}, limb::{self, Limb, LIMB_BITS}, }; -use alloc::vec; use core::{ marker::PhantomData, num::{NonZeroU64, NonZeroUsize}, @@ -410,6 +409,11 @@ pub(crate) fn elem_exp_vartime( acc } +// 4096 is the maximum size we support for elem_exp_consttime. 
+const ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS: usize = 4096 / LIMB_BITS; +const _LIMBS_PER_CHUNK_DIVIDES_ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS: () = + assert!(ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS % limb512aligned::LIMBS_PER_CHUNK == 0); + #[cfg(not(target_arch = "x86_64"))] pub fn elem_exp_consttime<M>( base: Elem<M, R>, exponent: &PrivateExponent, m: &Modulus<M>, ) -> Result<Elem<M, Unencoded>, error::Unspecified> { use crate::limb::Window; const WINDOW_BITS: u32 = 5; const TABLE_ENTRIES: usize = 1 << WINDOW_BITS; let num_limbs = m.limbs().len(); + if num_limbs % limb512aligned::LIMBS_PER_CHUNK != 0 { + return Err(error::Unspecified); + } + let cpe = num_limbs / limb512aligned::LIMBS_PER_CHUNK; // chunks per entry. - let mut table = vec![0; TABLE_ENTRIES * num_limbs]; + type Storage = limb512aligned::Limb512AlignedStorage< + { ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS * TABLE_ENTRIES }, + >; + let mut table = Storage::zeroed(); + let table = table + .as_flattened_mut(TABLE_ENTRIES, cpe) + .ok_or_else(|| error::Unspecified)?; fn gather<M>(table: &[Limb], acc: &mut Elem<M, R>, i: Window) { prefixed_extern! { @@ -463,9 +477,9 @@ pub fn elem_exp_consttime<M>( } // table[0] = base**0 (i.e. 1). - m.oneR(entry_mut(&mut table, 0, num_limbs)); + m.oneR(entry_mut(table, 0, num_limbs)); - entry_mut(&mut table, 1, num_limbs).copy_from_slice(&base.limbs); + entry_mut(table, 1, num_limbs).copy_from_slice(&base.limbs); for i in 2..TABLE_ENTRIES { let (src1, src2) = if i % 2 == 0 { (i / 2, i / 2) @@ -503,7 +517,7 @@ pub fn elem_exp_consttime<M>( exponent: &PrivateExponent, m: &Modulus<M>, ) -> Result<Elem<M, Unencoded>, error::Unspecified> { - use crate::{cpu, limb::LIMB_BYTES}; + use crate::cpu; // Pretty much all the math here requires CPU feature detection to have // been done. `cpu_features` isn't threaded through all the internal // inputs `tmp`, `am`, and `np` that immediately follow the table. All the // awkwardness here stems from trying to use the assembly code like OpenSSL // does. 
+ const MOD_EXP_CTIME_ALIGN: usize = 64; use crate::limb::{LeakyWindow, Window}; @@ -523,16 +538,21 @@ pub fn elem_exp_consttime( const TABLE_ENTRIES: usize = 1 << WINDOW_BITS; let num_limbs = m.limbs().len(); - - const ALIGNMENT: usize = 64; - assert_eq!(ALIGNMENT % LIMB_BYTES, 0); - let mut table = vec![0; ((TABLE_ENTRIES + 3) * num_limbs) + ALIGNMENT]; - let (table, state) = { - let misalignment = (table.as_ptr() as usize) % ALIGNMENT; - let table = &mut table[((ALIGNMENT - misalignment) / LIMB_BYTES)..]; - assert_eq!((table.as_ptr() as usize) % ALIGNMENT, 0); - table.split_at_mut(TABLE_ENTRIES * num_limbs) - }; + if num_limbs % limb512aligned::LIMBS_PER_CHUNK != 0 { + return Err(error::Unspecified); + } + let cpe = num_limbs / limb512aligned::LIMBS_PER_CHUNK; // chunks per entry. + + const TABLE_ENTRIES_PLUS_3: usize = TABLE_ENTRIES + 3; + type Storage = limb512aligned::Limb512AlignedStorage< + { ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS * TABLE_ENTRIES_PLUS_3 }, + >; + let mut table = Storage::zeroed(); + let table = table + .as_flattened_mut(TABLE_ENTRIES_PLUS_3, cpe) + .ok_or_else(|| error::Unspecified)?; + assert_eq!((table.as_ptr() as usize) % MOD_EXP_CTIME_ALIGN, 0); + let (table, state) = table.split_at_mut(TABLE_ENTRIES * num_limbs); fn scatter(table: &mut [Limb], acc: &[Limb], i: LeakyWindow, num_limbs: usize) { prefixed_extern! { diff --git a/src/arithmetic/limb512aligned.rs b/src/arithmetic/limb512aligned.rs new file mode 100644 index 000000000..82146bd12 --- /dev/null +++ b/src/arithmetic/limb512aligned.rs @@ -0,0 +1,47 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+use crate::limb::{Limb, LIMB_BITS};
+use core::mem::size_of;
+
+// Some x86_64 assembly is written under the assumption that some of its
+// input data and/or temporary storage is aligned to `MOD_EXP_CTIME_ALIGN`
+// bytes, which was/is 64 in OpenSSL.
+//
+// We use this in the non-X86-64 implementation of exponentiation as well,
+// with the hope of converging the two implementations into one.
+#[repr(C, align(64))]
+pub struct Limb512AlignedStorage<const N: usize>([Limb; N]);
+
+const _LIMB_SIZE_DIVIDES_ALIGNMENT: () = assert!(64 % size_of::<Limb>() == 0);
+
+pub(super) const LIMBS_PER_CHUNK: usize = 512 / LIMB_BITS;
+
+impl<const N: usize> Limb512AlignedStorage<N> {
+    pub fn zeroed() -> Self {
+        assert_eq!(N % LIMBS_PER_CHUNK, 0); // TODO: const.
+        Self([0; N])
+    }
+
+    pub fn as_flattened_mut(
+        &mut self,
+        num_entries: usize,
+        chunks_per_entry: usize,
+    ) -> Option<&mut [Limb]> {
+        let total_limbs = num_entries
+            .checked_mul(chunks_per_entry)?
+            .checked_mul(LIMBS_PER_CHUNK)?;
+        self.0.get_mut(..total_limbs)
+    }
+}