From 8615a46954b35354e4ba21d10c2a555f61ca39ac Mon Sep 17 00:00:00 2001 From: Brian Smith Date: Mon, 27 Jan 2025 13:53:32 -0800 Subject: [PATCH] arithmetic: Avoid heap & simplify alignment logic in `elem_exp_consttime`. Avoid allocating on the heap. Let the compiler do the alignment instead of manually aligning the start of the table. --- src/arithmetic.rs | 1 + src/arithmetic/bigint.rs | 52 ++++++++++++++++++++++---------- src/arithmetic/limb512aligned.rs | 47 +++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 16 deletions(-) create mode 100644 src/arithmetic/limb512aligned.rs diff --git a/src/arithmetic.rs b/src/arithmetic.rs index fb35cfefa..df7138e29 100644 --- a/src/arithmetic.rs +++ b/src/arithmetic.rs @@ -24,6 +24,7 @@ mod constant; pub mod bigint; pub(crate) mod inout; +mod limb512aligned; pub mod montgomery; mod n0; diff --git a/src/arithmetic/bigint.rs b/src/arithmetic/bigint.rs index 50e4d3a14..136bbf2c1 100644 --- a/src/arithmetic/bigint.rs +++ b/src/arithmetic/bigint.rs @@ -42,14 +42,13 @@ pub(crate) use self::{ modulusvalue::OwnedModulusValue, private_exponent::PrivateExponent, }; -use super::{inout::AliasingSlices3, montgomery::*, LimbSliceError, MAX_LIMBS}; +use super::{inout::AliasingSlices3, limb512aligned, montgomery::*, LimbSliceError, MAX_LIMBS}; use crate::{ bits::BitLength, c, error::{self, LenMismatchError}, limb::{self, Limb, LIMB_BITS}, }; -use alloc::vec; use core::{ marker::PhantomData, num::{NonZeroU64, NonZeroUsize}, @@ -410,6 +409,11 @@ pub(crate) fn elem_exp_vartime( acc } +// 4096 is the maximum size we support for elem_exp_consttime. 
+const ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS: usize = 4096 / LIMB_BITS; +const _LIMBS_PER_CHUNK_DIVIDES_ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS: () = + assert!(ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS % limb512aligned::LIMBS_PER_CHUNK == 0); + #[cfg(not(target_arch = "x86_64"))] pub fn elem_exp_consttime<M>( base: Elem<M, R>, exponent: &PrivateExponent, m: &Modulus<M>, ) -> Result<Elem<M, Unencoded>, error::Unspecified> { use crate::limb::Window; const WINDOW_BITS: u32 = 5; const TABLE_ENTRIES: usize = 1 << WINDOW_BITS; let num_limbs = m.limbs().len(); + if num_limbs % limb512aligned::LIMBS_PER_CHUNK != 0 { + return Err(error::Unspecified); + } + let cpe = num_limbs / limb512aligned::LIMBS_PER_CHUNK; // chunks per entry. - let mut table = vec![0; TABLE_ENTRIES * num_limbs]; + type Storage = limb512aligned::Limb512AlignedStorage< + { ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS * TABLE_ENTRIES }, + >; + let mut table = Storage::zeroed(); + let table = table + .as_flattened_mut(TABLE_ENTRIES, cpe) + .ok_or_else(|| error::Unspecified)?; fn gather<M>(table: &[Limb], acc: &mut Elem<M, R>, i: Window) { prefixed_extern! { @@ -463,9 +477,9 @@ pub fn elem_exp_consttime<M>( } // table[0] = base**0 (i.e. 1). - m.oneR(entry_mut(&mut table, 0, num_limbs)); + m.oneR(entry_mut(table, 0, num_limbs)); - entry_mut(&mut table, 1, num_limbs).copy_from_slice(&base.limbs); + entry_mut(table, 1, num_limbs).copy_from_slice(&base.limbs); for i in 2..TABLE_ENTRIES { let (src1, src2) = if i % 2 == 0 { (i / 2, i / 2) @@ -503,7 +517,7 @@ pub fn elem_exp_consttime<M>( exponent: &PrivateExponent, m: &Modulus<M>, ) -> Result<Elem<M, Unencoded>, error::Unspecified> { - use crate::{cpu, limb::LIMB_BYTES}; + use crate::cpu; // Pretty much all the math here requires CPU feature detection to have // been done. `cpu_features` isn't threaded through all the internal // inputs `tmp`, `am`, and `np` that immediately follow the table. All the // awkwardness here stems from trying to use the assembly code like OpenSSL // does. 
+ const MOD_EXP_CTIME_ALIGN: usize = 64; use crate::limb::{LeakyWindow, Window}; @@ -523,16 +538,21 @@ pub fn elem_exp_consttime( const TABLE_ENTRIES: usize = 1 << WINDOW_BITS; let num_limbs = m.limbs().len(); - - const ALIGNMENT: usize = 64; - assert_eq!(ALIGNMENT % LIMB_BYTES, 0); - let mut table = vec![0; ((TABLE_ENTRIES + 3) * num_limbs) + ALIGNMENT]; - let (table, state) = { - let misalignment = (table.as_ptr() as usize) % ALIGNMENT; - let table = &mut table[((ALIGNMENT - misalignment) / LIMB_BYTES)..]; - assert_eq!((table.as_ptr() as usize) % ALIGNMENT, 0); - table.split_at_mut(TABLE_ENTRIES * num_limbs) - }; + if num_limbs % limb512aligned::LIMBS_PER_CHUNK != 0 { + return Err(error::Unspecified); + } + let cpe = num_limbs / limb512aligned::LIMBS_PER_CHUNK; // chunks per entry. + + const TABLE_ENTRIES_PLUS_3: usize = TABLE_ENTRIES + 3; + type Storage = limb512aligned::Limb512AlignedStorage< + { ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS * TABLE_ENTRIES_PLUS_3 }, + >; + let mut table = Storage::zeroed(); + let table = table + .as_flattened_mut(TABLE_ENTRIES_PLUS_3, cpe) + .ok_or_else(|| error::Unspecified)?; + assert_eq!((table.as_ptr() as usize) % MOD_EXP_CTIME_ALIGN, 0); + let (table, state) = table.split_at_mut(TABLE_ENTRIES * num_limbs); fn scatter(table: &mut [Limb], acc: &[Limb], i: LeakyWindow, num_limbs: usize) { prefixed_extern! { diff --git a/src/arithmetic/limb512aligned.rs b/src/arithmetic/limb512aligned.rs new file mode 100644 index 000000000..82146bd12 --- /dev/null +++ b/src/arithmetic/limb512aligned.rs @@ -0,0 +1,47 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+use crate::limb::{Limb, LIMB_BITS};
+use core::mem::size_of;
+
+// Some x86_64 assembly is written under the assumption that some of its
+// input data and/or temporary storage is aligned to `MOD_EXP_CTIME_ALIGN`
+// bytes, which was/is 64 in OpenSSL.
+//
+// We use this in the non-X86-64 implementation of exponentiation as well,
+// with the hope of converging the two implementations into one.
+#[repr(C, align(64))]
+pub struct Limb512AlignedStorage<const N: usize>([Limb; N]);
+
+const _LIMB_SIZE_DIVIDES_ALIGNMENT: () = assert!(64 % size_of::<Limb>() == 0);
+
+pub(super) const LIMBS_PER_CHUNK: usize = 512 / LIMB_BITS;
+
+impl<const N: usize> Limb512AlignedStorage<N> {
+    pub fn zeroed() -> Self {
+        assert_eq!(N % LIMBS_PER_CHUNK, 0); // TODO: const.
+        Self([0; N])
+    }
+
+    pub fn as_flattened_mut(
+        &mut self,
+        num_entries: usize,
+        chunks_per_entry: usize,
+    ) -> Option<&mut [Limb]> {
+        let total_limbs = num_entries
+            .checked_mul(chunks_per_entry)?
+            .checked_mul(LIMBS_PER_CHUNK)?;
+        self.0.get_mut(..total_limbs)
+    }
+}