From fce5ce8bc5b8414c62f625d3064615bf802dbd5c Mon Sep 17 00:00:00 2001 From: evoskuil Date: Mon, 25 Nov 2024 22:26:45 -0500 Subject: [PATCH] Add native sha compression. --- include/bitcoin/system/hash/sha/algorithm.hpp | 11 ++ .../impl/hash/sha/algorithm_compress.ipp | 1 - .../system/impl/hash/sha/algorithm_native.ipp | 141 +++++++++++++++--- .../system/intrinsics/xcpu/defines.hpp | 2 + 4 files changed, 136 insertions(+), 19 deletions(-) diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp index 84b2b00d6b..ddad253c40 100644 --- a/include/bitcoin/system/hash/sha/algorithm.hpp +++ b/include/bitcoin/system/hash/sha/algorithm.hpp @@ -365,6 +365,17 @@ class algorithm INLINE static void schedule_native(xbuffer_t& xbuffer) NOEXCEPT; INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT; + template + INLINE static void round_native(wstate_t& state, + const wbuffer_t& wk) NOEXCEPT; + + ////INLINE static void summarize_native(wstate_t& out, + //// wstate_t& in) NOEXCEPT; + + template + INLINE static void compress_native(wstate_t& state, + wbuffer_t& wbuffer) NOEXCEPT; + template INLINE static void compress_native(xstate_t& xstate, const xbuffer_t& xbuffer) NOEXCEPT; diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp index ada7bae8f4..f9da36fd0f 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp @@ -173,7 +173,6 @@ template constexpr void CLASS:: compress_(auto& state, const auto& buffer) NOEXCEPT { - // SHA-NI/256: 64/4 = 16 quad rounds, 8/4 = 2 state elements. // This is a copy (state type varies due to vectorization). const auto start = state; diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp index a7c2d20e4b..13c9c5f85f 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp @@ -35,6 +35,10 @@ namespace libbitcoin { namespace system { namespace sha { + +// schedule +// ---------------------------------------------------------------------------- +// protected TEMPLATE template @@ -43,19 +47,21 @@ prepare_native(wbuffer_t& wbuffer) NOEXCEPT { if constexpr (SHA::strength == 160) { - ////static_assert(false, "sha160 not implemented"); - } - else if constexpr (use_neon) - { - static_assert(SHA::strength == 256); - - ////static_assert(false, "neon not implemented"); + if constexpr (use_neon) + { + } + else if constexpr (use_shani) + { + } } - else if constexpr (use_shani) + else if constexpr (SHA::strength == 256) { - static_assert(SHA::strength == 256); - - wbuffer[Round] = mm_sha256msg2_epu32 + if constexpr (use_neon) + { + } + else if constexpr (use_shani) + { + wbuffer[Round] = mm_sha256msg2_epu32 ( mm_add_epi32 ( @@ -70,6 +76,7 @@ prepare_native(wbuffer_t& wbuffer) NOEXCEPT ), wbuffer[Round - 1] ); + } } } @@ -101,16 +108,12 @@ schedule(wbuffer_t& wbuffer) NOEXCEPT konstant(array_cast(wbuffer)); } -// schedule -// ---------------------------------------------------------------------------- -// protected - TEMPLATE INLINE void CLASS:: schedule_native(buffer_t& buffer) NOEXCEPT { // neon and sha160 not yet implemented, sha512 is not native. - if constexpr (SHA::strength != 160 && SHA::strength != 512 && !use_neon) + if constexpr (SHA::strength == 256 && !use_neon) { schedule(array_cast(buffer)); } @@ -133,6 +136,99 @@ schedule_native(xbuffer_t& xbuffer) NOEXCEPT // ---------------------------------------------------------------------------- // protected +TEMPLATE +template +INLINE void CLASS:: +round_native(wstate_t& state, + const wbuffer_t& wk) NOEXCEPT +{ + if constexpr (SHA::strength == 160) + { + if constexpr (use_neon) + { + } + else if constexpr (use_shani) + { + } + } + else if constexpr (SHA::strength == 256) + { + if constexpr (use_neon) + { + } + else if constexpr (use_shani) + { + state[1] = mm_sha256rnds2_epu32(state[1], state[0], wk[Round]); + state[0] = mm_sha256rnds2_epu32(state[0], state[1], + mm_shuffle_epi32(wk[Round], 0x0e)); + } + } +} + +////TEMPLATE +////INLINE void CLASS:: +////summarize_native(wstate_t& out, wstate_t& in) NOEXCEPT +////{ +//// if constexpr (SHA::strength == 160) +//// { +//// if constexpr (use_neon) +//// { +//// } +//// else if constexpr (use_shani) +//// { +//// } +//// } +//// else if constexpr (SHA::strength == 256) +//// { +//// if constexpr (use_neon) +//// { +//// } +//// else if constexpr (use_shani) +//// { +//// out[0] = mm_add_epi32(out[0], in[0]); +//// out[1] = mm_add_epi32(out[1], in[1]); +//// } +//// } +////} + +TEMPLATE +template +INLINE void CLASS:: +compress_native(wstate_t& wstate, + wbuffer_t& wbuffer) NOEXCEPT +{ + // This is a copy. + const auto start = wstate; + + round_native< 0, Lane>(wstate, wbuffer); + round_native< 1, Lane>(wstate, wbuffer); + round_native< 2, Lane>(wstate, wbuffer); + round_native< 3, Lane>(wstate, wbuffer); + round_native< 4, Lane>(wstate, wbuffer); + round_native< 5, Lane>(wstate, wbuffer); + round_native< 6, Lane>(wstate, wbuffer); + round_native< 7, Lane>(wstate, wbuffer); + round_native< 8, Lane>(wstate, wbuffer); + round_native< 9, Lane>(wstate, wbuffer); + round_native<10, Lane>(wstate, wbuffer); + round_native<11, Lane>(wstate, wbuffer); + round_native<12, Lane>(wstate, wbuffer); + round_native<13, Lane>(wstate, wbuffer); + round_native<14, Lane>(wstate, wbuffer); + round_native<15, Lane>(wstate, wbuffer); + + if constexpr (SHA::rounds == 80) + { + round_native<16, Lane>(wstate, wbuffer); + round_native<17, Lane>(wstate, wbuffer); + round_native<18, Lane>(wstate, wbuffer); + round_native<19, Lane>(wstate, wbuffer); + } + + ////summarize_native(wstate, start); + summarize(array_cast(wstate), array_cast(start)); +} + TEMPLATE template INLINE void CLASS:: @@ -157,8 +253,17 @@ template INLINE void CLASS:: compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT { - // TODO: Single block compression. - compress_(state, buffer); + // TODO: sha160 state is too small to array cast into two xwords. + // neon and sha160 not yet implemented, sha512 is not native. + if constexpr (SHA::strength == 256 && !use_neon) + { + compress_native(array_cast(state), + array_cast(buffer)); + } + else + { + compress_(state, buffer); + } } } // namespace sha diff --git a/include/bitcoin/system/intrinsics/xcpu/defines.hpp b/include/bitcoin/system/intrinsics/xcpu/defines.hpp index 79e5b06334..66c50f84c4 100644 --- a/include/bitcoin/system/intrinsics/xcpu/defines.hpp +++ b/include/bitcoin/system/intrinsics/xcpu/defines.hpp @@ -133,6 +133,7 @@ BC_POP_WARNING() #define mm_extract_epi32(a, Lane) {} #define mm_extract_epi64(a, Lane) {} #define mm_shuffle_epi8(a, mask) (a) + #define mm_shuffle_epi32(a, mask) (a) #define mm_load_si128(a) {} #define mm_loadu_si128(a) {} #define mm_store_si128(memory, a) @@ -167,6 +168,7 @@ BC_POP_WARNING() #define mm_extract_epi32(a, Lane) _mm_extract_epi32(a, Lane) #define mm_extract_epi64(a, Lane) _mm_extract_epi64(a, Lane) // undefined for X32 #define mm_shuffle_epi8(a, mask) _mm_shuffle_epi8(a, mask) + #define mm_shuffle_epi32(a, mask) _mm_shuffle_epi32(a, mask) #define mm_load_si128(a) _mm_load_si128(a) #define mm_loadu_si128(a) _mm_loadu_si128(a) #define mm_store_si128(memory, a) _mm_store_si128(memory, a)