From fce5ce8bc5b8414c62f625d3064615bf802dbd5c Mon Sep 17 00:00:00 2001
From: evoskuil <eric@voskuil.org>
Date: Mon, 25 Nov 2024 22:26:45 -0500
Subject: [PATCH] Add native sha compression.

---
 include/bitcoin/system/hash/sha/algorithm.hpp |  11 ++
 .../impl/hash/sha/algorithm_compress.ipp      |   1 -
 .../system/impl/hash/sha/algorithm_native.ipp | 141 +++++++++++++++---
 .../system/intrinsics/xcpu/defines.hpp        |   2 +
 4 files changed, 136 insertions(+), 19 deletions(-)
diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp
index 84b2b00d6b..ddad253c40 100644
--- a/include/bitcoin/system/hash/sha/algorithm.hpp
+++ b/include/bitcoin/system/hash/sha/algorithm.hpp
@@ -365,6 +365,17 @@ class algorithm
     INLINE static void schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
     INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT;
 
+    template<size_t Round, size_t Lane>
+    INLINE static void round_native(wstate_t<xint128_t>& state,
+        const wbuffer_t<xint128_t>& wk) NOEXCEPT;
+
+    ////INLINE static void summarize_native(wstate_t<xint128_t>& out,
+    ////    wstate_t<xint128_t>& in) NOEXCEPT;
+
+    template <size_t Lane>
+    INLINE static void compress_native(wstate_t<xint128_t>& state,
+        wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
+
     template <typename xWord, size_t Lane>
     INLINE static void compress_native(xstate_t<xWord>& xstate,
         const xbuffer_t<xWord>& xbuffer) NOEXCEPT;
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
index ada7bae8f4..f9da36fd0f 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
@@ -173,7 +173,6 @@ template <size_t Lane>
 constexpr void CLASS::
 compress_(auto& state, const auto& buffer) NOEXCEPT
 {
-    // SHA-NI/256: 64/4 = 16 quad rounds, 8/4 = 2 state elements.
     // This is a copy (state type varies due to vectorization).
     const auto start = state;
 
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
index a7c2d20e4b..13c9c5f85f 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
@@ -35,6 +35,10 @@
 namespace libbitcoin {
 namespace system {
 namespace sha {
+
+// schedule
+// ----------------------------------------------------------------------------
+// protected
 
 TEMPLATE
 template<size_t Round>
@@ -43,19 +47,21 @@ prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
 {
     if constexpr (SHA::strength == 160)
     {
-        ////static_assert(false, "sha160 not implemented");
-    }
-    else if constexpr (use_neon)
-    {
-        static_assert(SHA::strength == 256);
-
-        ////static_assert(false, "neon not implemented");
+        if constexpr (use_neon)
+        {
+        }
+        else if constexpr (use_shani)
+        {
+        }
     }
-    else if constexpr (use_shani)
+    else if constexpr (SHA::strength == 256)
     {
-        static_assert(SHA::strength == 256);
-
-        wbuffer[Round] = mm_sha256msg2_epu32
+        if constexpr (use_neon)
+        {
+        }
+        else if constexpr (use_shani)
+        {
+            wbuffer[Round] = mm_sha256msg2_epu32
             (
                 mm_add_epi32
                 (
@@ -70,6 +76,7 @@ prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
                 ),
                 wbuffer[Round - 1]
             );
+        }
     }
 }
 
@@ -101,16 +108,12 @@ schedule(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
     konstant(array_cast<word_t>(wbuffer));
 }
 
-// schedule
-// ----------------------------------------------------------------------------
-// protected
-
 TEMPLATE
 INLINE void CLASS::
 schedule_native(buffer_t& buffer) NOEXCEPT
 {
     // neon and sha160 not yet implemented, sha512 is not native.
-    if constexpr (SHA::strength != 160 && SHA::strength != 512 && !use_neon)
+    if constexpr (SHA::strength == 256 && !use_neon)
     {
         schedule(array_cast<xint128_t>(buffer));
     }
@@ -133,6 +136,99 @@ schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT
 // ----------------------------------------------------------------------------
 // protected
 
+TEMPLATE
+template<size_t Round, size_t Lane>
+INLINE void CLASS::
+round_native(wstate_t<xint128_t>& state,
+    const wbuffer_t<xint128_t>& wk) NOEXCEPT
+{
+    if constexpr (SHA::strength == 160)
+    {
+        if constexpr (use_neon)
+        {
+        }
+        else if constexpr (use_shani)
+        {
+        }
+    }
+    else if constexpr (SHA::strength == 256)
+    {
+        if constexpr (use_neon)
+        {
+        }
+        else if constexpr (use_shani)
+        {
+            state[1] = mm_sha256rnds2_epu32(state[1], state[0], wk[Round]);
+            state[0] = mm_sha256rnds2_epu32(state[0], state[1],
+                mm_shuffle_epi32(wk[Round], 0x0e));
+        }
+    }
+}
+
+////TEMPLATE
+////INLINE void CLASS::
+////summarize_native(wstate_t<xint128_t>& out, wstate_t<xint128_t>& in) NOEXCEPT
+////{
+////    if constexpr (SHA::strength == 160)
+////    {
+////        if constexpr (use_neon)
+////        {
+////        }
+////        else if constexpr (use_shani)
+////        {
+////        }
+////    }
+////    else if constexpr (SHA::strength == 256)
+////    {
+////        if constexpr (use_neon)
+////        {
+////        }
+////        else if constexpr (use_shani)
+////        {
+////            out[0] = mm_add_epi32(out[0], in[0]);
+////            out[1] = mm_add_epi32(out[1], in[1]);
+////        }
+////    }
+////}
+
+TEMPLATE
+template <size_t Lane>
+INLINE void CLASS::
+compress_native(wstate_t<xint128_t>& wstate,
+    wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
+{
+    // Copy of the input state, passed to summarize() after the rounds.
+    const auto start = wstate;
+
+    round_native< 0, Lane>(wstate, wbuffer);
+    round_native< 1, Lane>(wstate, wbuffer);
+    round_native< 2, Lane>(wstate, wbuffer);
+    round_native< 3, Lane>(wstate, wbuffer);
+    round_native< 4, Lane>(wstate, wbuffer);
+    round_native< 5, Lane>(wstate, wbuffer);
+    round_native< 6, Lane>(wstate, wbuffer);
+    round_native< 7, Lane>(wstate, wbuffer);
+    round_native< 8, Lane>(wstate, wbuffer);
+    round_native< 9, Lane>(wstate, wbuffer);
+    round_native<10, Lane>(wstate, wbuffer);
+    round_native<11, Lane>(wstate, wbuffer);
+    round_native<12, Lane>(wstate, wbuffer);
+    round_native<13, Lane>(wstate, wbuffer);
+    round_native<14, Lane>(wstate, wbuffer);
+    round_native<15, Lane>(wstate, wbuffer);
+
+    if constexpr (SHA::rounds == 80)
+    {
+        round_native<16, Lane>(wstate, wbuffer);
+        round_native<17, Lane>(wstate, wbuffer);
+        round_native<18, Lane>(wstate, wbuffer);
+        round_native<19, Lane>(wstate, wbuffer);
+    }
+
+    ////summarize_native(wstate, start);
+    summarize(array_cast<word_t>(wstate), array_cast<word_t>(start));
+}
+
 TEMPLATE
 template <typename xWord, size_t Lane>
 INLINE void CLASS::
@@ -157,8 +253,17 @@ template <size_t Lane>
 INLINE void CLASS::
 compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT
 {
-    // TODO: Single block compression.
-    compress_<Lane>(state, buffer);
+    // TODO: sha160 state is too small to array cast into two xwords.
+    // neon and sha160 not yet implemented, sha512 is not native.
+    if constexpr (SHA::strength == 256 && !use_neon)
+    {
+        compress_native<Lane>(array_cast<xint128_t>(state),
+            array_cast<xint128_t>(buffer));
+    }
+    else
+    {
+        compress_<Lane>(state, buffer);
+    }
 }
 
 } // namespace sha
diff --git a/include/bitcoin/system/intrinsics/xcpu/defines.hpp b/include/bitcoin/system/intrinsics/xcpu/defines.hpp
index 79e5b06334..66c50f84c4 100644
--- a/include/bitcoin/system/intrinsics/xcpu/defines.hpp
+++ b/include/bitcoin/system/intrinsics/xcpu/defines.hpp
@@ -133,6 +133,7 @@ BC_POP_WARNING()
     #define mm_extract_epi32(a, Lane)   {}
     #define mm_extract_epi64(a, Lane)   {}
     #define mm_shuffle_epi8(a, mask)    (a)
+    #define mm_shuffle_epi32(a, mask)   (a)
     #define mm_load_si128(a)            {}
     #define mm_loadu_si128(a)           {}
     #define mm_store_si128(memory, a)
@@ -167,6 +168,7 @@ BC_POP_WARNING()
     #define mm_extract_epi32(a, Lane)   _mm_extract_epi32(a, Lane)
     #define mm_extract_epi64(a, Lane)   _mm_extract_epi64(a, Lane) // undefined for X32
     #define mm_shuffle_epi8(a, mask)    _mm_shuffle_epi8(a, mask)
+    #define mm_shuffle_epi32(a, mask)   _mm_shuffle_epi32(a, mask)
     #define mm_load_si128(a)            _mm_load_si128(a)
     #define mm_loadu_si128(a)           _mm_loadu_si128(a)
     #define mm_store_si128(memory, a)   _mm_store_si128(memory, a)