From 8dbdb7d273fec285adc80a024db54514c9b1735f Mon Sep 17 00:00:00 2001 From: Chris Marslender Date: Fri, 13 Oct 2023 10:17:48 -0500 Subject: [PATCH 01/11] Swap blake3 to FetchContent --- CMakeLists.txt | 57 +- src/b3/blake3.c | 598 ------- src/b3/blake3.h | 56 - src/b3/blake3_avx2.c | 325 ---- src/b3/blake3_avx2_x86-64_unix.S | 1802 ------------------- src/b3/blake3_avx512.c | 1204 ------------- src/b3/blake3_avx512_x86-64_unix.S | 2572 ---------------------------- src/b3/blake3_dispatch.c | 245 --- src/b3/blake3_impl.h | 235 --- src/b3/blake3_portable.c | 168 -- src/b3/blake3_sse41.c | 559 ------ src/b3/blake3_sse41_x86-64_unix.S | 2014 ---------------------- src/calculate_bucket.hpp | 2 +- 13 files changed, 27 insertions(+), 9810 deletions(-) delete mode 100644 src/b3/blake3.c delete mode 100644 src/b3/blake3.h delete mode 100644 src/b3/blake3_avx2.c delete mode 100644 src/b3/blake3_avx2_x86-64_unix.S delete mode 100644 src/b3/blake3_avx512.c delete mode 100644 src/b3/blake3_avx512_x86-64_unix.S delete mode 100644 src/b3/blake3_dispatch.c delete mode 100644 src/b3/blake3_impl.h delete mode 100644 src/b3/blake3_portable.c delete mode 100644 src/b3/blake3_sse41.c delete mode 100644 src/b3/blake3_sse41_x86-64_unix.S diff --git a/CMakeLists.txt b/CMakeLists.txt index 16f37ec77..1c02d070c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,6 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(cxxopts) - option(CP_LINK_BLADEBIT_HARVESTER "Links libbladebit_harvester at build time instead of dynamically loading it." OFF) option(CP_BUILD_BLADEBIT_HARVESTER "Pulls bladebit harvester target from git and builds it as a dependency.") @@ -123,38 +122,34 @@ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -fno-omit-frame-pointer -fsanitize=thre set (CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} -fno-omit-frame-pointer -fsanitize=thread") ENDIF() -IF (WIN32) -set(BLAKE3_SRC - src/b3/blake3.c - src/b3/blake3_portable.c - src/b3/blake3_dispatch.c - src/b3/blake3_avx2.c - src/b3/blake3_avx512.c - src/b3/blake3_sse41.c -) -ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64|arm64") -set(BLAKE3_SRC - src/b3/blake3.c - src/b3/blake3_portable.c - src/b3/blake3_dispatch.c +pybind11_add_module(chiapos ${CMAKE_CURRENT_SOURCE_DIR}/python-bindings/chiapos.cpp src/chacha8.c) +add_executable(ProofOfSpace + src/cli.cpp + src/chacha8.c ) -ELSE() -set(BLAKE3_SRC - src/b3/blake3.c - src/b3/blake3_portable.c - src/b3/blake3_dispatch.c - src/b3/blake3_avx2_x86-64_unix.S - src/b3/blake3_avx512_x86-64_unix.S - src/b3/blake3_sse41_x86-64_unix.S + +FetchContent_Declare( + blake3 + GIT_REPOSITORY https://github.com/BLAKE3-team/BLAKE3.git + GIT_TAG 1.5.0 ) -ENDIF() -pybind11_add_module(chiapos ${CMAKE_CURRENT_SOURCE_DIR}/python-bindings/chiapos.cpp src/chacha8.c ${BLAKE3_SRC}) +FetchContent_GetProperties(blake3) +if(NOT blake3_POPULATED) + FetchContent_Populate(blake3) -add_executable(ProofOfSpace - src/cli.cpp - src/chacha8.c - ${BLAKE3_SRC} + # Set BLAKE3 to build as a shared library + set(BUILD_SHARED_LIBS TRUE CACHE BOOL "Build shared libraries" FORCE) + + add_subdirectory(${blake3_SOURCE_DIR}/c ${blake3_BINARY_DIR}) +endif() + +set(BLAKE3_SRC ${blake3_SOURCE_DIR}/c) +set(BLAKE3_INCLUDE_DIR ${blake3_SOURCE_DIR}/c) +target_link_libraries(ProofOfSpace PRIVATE blake3) +include_directories( + ${INCLUDE_DIRECTORIES} + ${BLAKE3_INCLUDE_DIR} ) option(BUILD_PROOF_OF_SPACE_STATICALLY "Build ProofOfSpace target statically" OFF) @@ -173,8 +168,8 @@ FetchContent_MakeAvailable(Catch2) add_executable(RunTests tests/test.cpp src/chacha8.c - 
${BLAKE3_SRC} ) +target_link_libraries(RunTests PRIVATE blake3) find_package(Threads REQUIRED) @@ -231,7 +226,7 @@ if (${CP_LINK_BLADEBIT_HARVESTER}) target_link_directories(chiapos PUBLIC ${CMAKE_SOURCE_DIR}/libs/green_reaper/lib) target_link_directories(ProofOfSpace PUBLIC ${CMAKE_SOURCE_DIR}/libs/green_reaper/lib) target_link_directories(RunTests PUBLIC ${CMAKE_SOURCE_DIR}/libs/green_reaper/lib) - + set_property(TARGET chiapos APPEND PROPERTY BUILD_RPATH "$ORIGIN") set_property(TARGET ProofOfSpace APPEND PROPERTY BUILD_RPATH "$ORIGIN") set_property(TARGET RunTests APPEND PROPERTY BUILD_RPATH "$ORIGIN") diff --git a/src/b3/blake3.c b/src/b3/blake3.c deleted file mode 100644 index 0acefbade..000000000 --- a/src/b3/blake3.c +++ /dev/null @@ -1,598 +0,0 @@ -#include <assert.h> -#include <stdbool.h> -#include <string.h> - -#include "blake3.h" -#include "blake3_impl.h" - -INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], - uint8_t flags) { - memcpy(self->cv, key, BLAKE3_KEY_LEN); - self->chunk_counter = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - self->buf_len = 0; - self->blocks_compressed = 0; - self->flags = flags; -} - -INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], - uint64_t chunk_counter) { - memcpy(self->cv, key, BLAKE3_KEY_LEN); - self->chunk_counter = chunk_counter; - self->blocks_compressed = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - self->buf_len = 0; -} - -INLINE size_t chunk_state_len(const blake3_chunk_state *self) { - return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + - ((size_t)self->buf_len); -} - -INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, - const uint8_t *input, size_t input_len) { - size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); - if (take > input_len) { - take = input_len; - } - uint8_t *dest = self->buf + ((size_t)self->buf_len); - memcpy(dest, input, take); - self->buf_len += (uint8_t)take; - return take; -} - -INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { - if (self->blocks_compressed == 0) { - return CHUNK_START; - } else { - return 0; - } -} - -typedef struct { - uint32_t input_cv[8]; - uint64_t counter; - uint8_t block[BLAKE3_BLOCK_LEN]; - uint8_t block_len; - uint8_t flags; -} output_t; - -INLINE output_t make_output(const uint32_t input_cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - output_t ret; - memcpy(ret.input_cv, input_cv, 32); - memcpy(ret.block, block, BLAKE3_BLOCK_LEN); - ret.block_len = block_len; - ret.counter = counter; - ret.flags = flags; - return ret; -} - -// Chaining values within a given chunk (specifically the compress_in_place -// interface) are represented as words. This avoids unnecessary bytes<->words -// conversion overhead in the portable implementation. However, the hash_many -// interface handles both user input and parent node blocks, so it accepts -// bytes. For that reason, chaining values in the CV stack are represented as -// bytes.
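A minimal standalone sketch of the two representations this comment describes. The helper names here are hypothetical, not part of the BLAKE3 API; the byte form being the little-endian encoding of the words matches the memcpy-based conversion used in output_chaining_value below.

#include <stdint.h>
#include <string.h>

// Words are the working form inside a chunk (compress_in_place);
// bytes are the form stored in the CV stack and fed to hash_many.
static void cv_words_to_bytes(const uint32_t cv_words[8], uint8_t cv_bytes[32]) {
  memcpy(cv_bytes, cv_words, 32);  // on little-endian targets this is the LE encoding
}

static void cv_bytes_to_words(const uint8_t cv_bytes[32], uint32_t cv_words[8]) {
  memcpy(cv_words, cv_bytes, 32);  // inverse round trip on the same target
}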
-INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { - uint32_t cv_words[8]; - memcpy(cv_words, self->input_cv, 32); - blake3_compress_in_place(cv_words, self->block, self->block_len, - self->counter, self->flags); - memcpy(cv, cv_words, 32); -} - -INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, - size_t out_len) { - uint64_t output_block_counter = seek / 64; - size_t offset_within_block = seek % 64; - uint8_t wide_buf[64]; - while (out_len > 0) { - blake3_compress_xof(self->input_cv, self->block, self->block_len, - output_block_counter, self->flags | ROOT, wide_buf); - size_t available_bytes = 64 - offset_within_block; - size_t memcpy_len; - if (out_len > available_bytes) { - memcpy_len = available_bytes; - } else { - memcpy_len = out_len; - } - memcpy(out, wide_buf + offset_within_block, memcpy_len); - out += memcpy_len; - out_len -= memcpy_len; - output_block_counter += 1; - offset_within_block = 0; - } -} - -INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, - size_t input_len) { - if (self->buf_len > 0) { - size_t take = chunk_state_fill_buf(self, input, input_len); - input += take; - input_len -= take; - if (input_len > 0) { - blake3_compress_in_place( - self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, - self->flags | chunk_state_maybe_start_flag(self)); - self->blocks_compressed += 1; - self->buf_len = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - } - } - - while (input_len > BLAKE3_BLOCK_LEN) { - blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, - self->chunk_counter, - self->flags | chunk_state_maybe_start_flag(self)); - self->blocks_compressed += 1; - input += BLAKE3_BLOCK_LEN; - input_len -= BLAKE3_BLOCK_LEN; - } - - size_t take = chunk_state_fill_buf(self, input, input_len); - input += take; - input_len -= take; -} - -INLINE output_t chunk_state_output(const blake3_chunk_state *self) { - uint8_t block_flags = - self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; - return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, - block_flags); -} - -INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], - const uint32_t key[8], uint8_t flags) { - return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); -} - -// Given some input larger than one chunk, return the number of bytes that -// should go in the left subtree. This is the largest power-of-2 number of -// chunks that leaves at least 1 byte for the right subtree. -INLINE size_t left_len(size_t content_len) { - // Subtract 1 to reserve at least one byte for the right side. content_len - // should always be greater than BLAKE3_CHUNK_LEN. - size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; - return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time -// on a single thread. Write out the chunk chaining values and return the -// number of chunks hashed. These chunks are never the root and never empty; -// those cases use a different codepath. 
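As a concrete check of left_len() above, with BLAKE3_CHUNK_LEN = 1024: an input of exactly 4 chunks puts 2 chunks in the left subtree, because the 1-byte subtraction reserves something for the right side. A hypothetical standalone test follows; round_down_to_power_of_2() lives in blake3_impl.h, which is not shown here, so it is restated inline.

#include <assert.h>
#include <stddef.h>

#define CHUNK_LEN 1024

// Largest power of 2 <= x, for x >= 1 (mirrors round_down_to_power_of_2()
// from blake3_impl.h).
static size_t rd_pow2(size_t x) {
  size_t p = 1;
  while (p * 2 <= x) p *= 2;
  return p;
}

// Same computation as left_len() above.
static size_t left_len_example(size_t content_len) {
  size_t full_chunks = (content_len - 1) / CHUNK_LEN;
  return rd_pow2(full_chunks) * CHUNK_LEN;
}

int main(void) {
  assert(left_len_example(1025) == 1024);          // 1024 + 1 bytes: 1 chunk left
  assert(left_len_example(4 * 1024) == 2 * 1024);  // exactly 4 chunks: 2 left, 1 byte reserved right
  assert(left_len_example(5 * 1024) == 4 * 1024);  // 5 chunks: 4 left
  return 0;
}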
-INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, - const uint32_t key[8], - uint64_t chunk_counter, uint8_t flags, - uint8_t *out) { -#if defined(BLAKE3_TESTING) - assert(0 < input_len); - assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); -#endif - - const uint8_t *chunks_array[MAX_SIMD_DEGREE]; - size_t input_position = 0; - size_t chunks_array_len = 0; - while (input_len - input_position >= BLAKE3_CHUNK_LEN) { - chunks_array[chunks_array_len] = &input[input_position]; - input_position += BLAKE3_CHUNK_LEN; - chunks_array_len += 1; - } - - blake3_hash_many(chunks_array, chunks_array_len, - BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, - true, flags, CHUNK_START, CHUNK_END, out); - - // Hash the remaining partial chunk, if there is one. Note that the empty - // chunk (meaning the empty message) is a different codepath. - if (input_len > input_position) { - uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; - blake3_chunk_state chunk_state; - chunk_state_init(&chunk_state, key, flags); - chunk_state.chunk_counter = counter; - chunk_state_update(&chunk_state, &input[input_position], - input_len - input_position); - output_t output = chunk_state_output(&chunk_state); - output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); - return chunks_array_len + 1; - } else { - return chunks_array_len; - } -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time -// on a single thread. Write out the parent chaining values and return the -// number of parents hashed. (If there's an odd input chaining value left over, -// return it as an additional output.) These parents are never the root and -// never empty; those cases use a different codepath. -INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, - size_t num_chaining_values, - const uint32_t key[8], uint8_t flags, - uint8_t *out) { -#if defined(BLAKE3_TESTING) - assert(2 <= num_chaining_values); - assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); -#endif - - const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; - size_t parents_array_len = 0; - while (num_chaining_values - (2 * parents_array_len) >= 2) { - parents_array[parents_array_len] = - &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; - parents_array_len += 1; - } - - blake3_hash_many(parents_array, parents_array_len, 1, key, - 0, // Parents always use counter 0. - false, flags | PARENT, - 0, // Parents have no start flags. - 0, // Parents have no end flags. - out); - - // If there's an odd child left over, it becomes an output. - if (num_chaining_values > 2 * parents_array_len) { - memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], - &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], - BLAKE3_OUT_LEN); - return parents_array_len + 1; - } else { - return parents_array_len; - } -} - -// The wide helper function returns (writes out) an array of chaining values -// and returns the length of that array. The number of chaining values returned -// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, -// if the input is shorter than that many chunks. The reason for maintaining a -// wide array of chaining values going back up the tree is to allow the -// implementation to hash as many parents in parallel as possible. -// -// As a special case when the SIMD degree is 1, this function will still return -// at least 2 outputs. This guarantees that this function doesn't perform the -// root compression.
(If it did, it would use the wrong flags, and also we -// wouldn't be able to implement extendable output.) Note that this function is -// not used when the whole input is only 1 chunk long; that's a different -// codepath. -// -// Why not just have the caller split the input on the first update(), instead -// of implementing this special rule? Because we don't want to limit SIMD or -// multi-threading parallelism for that update(). -static size_t blake3_compress_subtree_wide(const uint8_t *input, - size_t input_len, - const uint32_t key[8], - uint64_t chunk_counter, - uint8_t flags, uint8_t *out) { - // Note that the single chunk case does *not* bump the SIMD degree up to 2 - // when it is 1. If this implementation adds multi-threading in the future, - // this gives us the option of multi-threading even the 2-chunk case, which - // can help performance on smaller platforms. - if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { - return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, - out); - } - - // With more than simd_degree chunks, we need to recurse. Start by dividing - // the input into left and right subtrees. (Note that this is only optimal - // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree - // of 3 or something, we'll need a more complicated strategy.) - size_t left_input_len = left_len(input_len); - size_t right_input_len = input_len - left_input_len; - const uint8_t *right_input = &input[left_input_len]; - uint64_t right_chunk_counter = - chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); - - // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to - // account for the special case of returning 2 outputs when the SIMD degree - // is 1. - uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; - size_t degree = blake3_simd_degree(); - if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { - // The special case: We always use a degree of at least two, to make - // sure there are two outputs. Except, as noted above, at the chunk - // level, where we allow degree=1. (Note that the 1-chunk-input case is - // a different codepath.) - degree = 2; - } - uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; - - // Recurse! If this implementation adds multi-threading support in the - // future, this is where it will go. - size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, - chunk_counter, flags, cv_array); - size_t right_n = blake3_compress_subtree_wide( - right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); - - // The special case again. If simd_degree=1, then we'll have left_n=1 and - // right_n=1. Rather than compressing them into a single output, return - // them directly, to make sure we always have at least two outputs. - if (left_n == 1) { - memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); - return 2; - } - - // Otherwise, do one layer of parent node compression. - size_t num_chaining_values = left_n + right_n; - return compress_parents_parallel(cv_array, num_chaining_values, key, flags, - out); -} - -// Hash a subtree with compress_subtree_wide(), and then condense the resulting -// list of chaining values down to a single parent node. Don't compress that -// last parent node, however. Instead, return its message bytes (the -// concatenated chaining values of its children). This is necessary when the -// first call to update() supplies a complete subtree, because the topmost -// parent node of that subtree could end up being the root.
It's also necessary -// for extended output in the general case. -// -// As with compress_subtree_wide(), this function is not used on inputs of 1 -// chunk or less. That's a different codepath. -INLINE void compress_subtree_to_parent_node( - const uint8_t *input, size_t input_len, const uint32_t key[8], - uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { -#if defined(BLAKE3_TESTING) - assert(input_len > BLAKE3_CHUNK_LEN); -#endif - - uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; - size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, - chunk_counter, flags, cv_array); - - // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, - // compress_subtree_wide() returns more than 2 chaining values. Condense - // them into 2 by forming parent nodes repeatedly. - uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; - while (num_cvs > 2) { - num_cvs = - compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); - memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); - } - memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); -} - -INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], - uint8_t flags) { - memcpy(self->key, key, BLAKE3_KEY_LEN); - chunk_state_init(&self->chunk, key, flags); - self->cv_stack_len = 0; -} - -void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } - -void blake3_hasher_init_keyed(blake3_hasher *self, - const uint8_t key[BLAKE3_KEY_LEN]) { - uint32_t key_words[8]; - load_key_words(key, key_words); - hasher_init_base(self, key_words, KEYED_HASH); -} - -void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { - blake3_hasher context_hasher; - hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); - blake3_hasher_update(&context_hasher, context, strlen(context)); - uint8_t context_key[BLAKE3_KEY_LEN]; - blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); - uint32_t context_key_words[8]; - load_key_words(context_key, context_key_words); - hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); -} - -// As described in hasher_push_cv() below, we do "lazy merging", delaying -// merges until right before the next CV is about to be added. This is -// different from the reference implementation. Another difference is that we -// aren't always merging 1 chunk at a time. Instead, each CV might represent -// any power-of-two number of chunks, as long as the smaller-above-larger stack -// order is maintained. Instead of the "count the trailing 0-bits" algorithm -// described in the spec, we use a "count the total number of 1-bits" variant -// that doesn't require us to retain the subtree size of the CV on top of the -// stack. The principle is the same: each CV that should remain in the stack is -// represented by a 1-bit in the total number of chunks (or bytes) so far. -INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { - size_t post_merge_stack_len = (size_t)popcnt(total_len); - while (self->cv_stack_len > post_merge_stack_len) { - uint8_t *parent_node = - &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; - output_t output = parent_output(parent_node, self->key, self->chunk.flags); - output_chaining_value(&output, parent_node); - self->cv_stack_len -= 1; - } -} - -// In reference_impl.rs, we merge the new CV with existing CVs from the stack -// before pushing it. We can do that because we know more input is coming, so -// we know none of the merges are root. 
-// -// This setting is different. We want to feed as much input as possible to -// compress_subtree_wide(), without setting aside anything for the chunk_state. -// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once -// as a single subtree, if at all possible. -// -// This leads to two problems: -// 1) This 64 KiB input might be the only call that ever gets made to update. -// In this case, the root node of the 64 KiB subtree would be the root node -// of the whole tree, and it would need to be ROOT finalized. We can't -// compress it until we know. -// 2) This 64 KiB input might complete a larger tree, whose root node is -// similarly going to be the root of the whole tree. For example, maybe -// we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the -// node at the root of the 256 KiB subtree until we know how to finalize it. -// -// The second problem is solved with "lazy merging". That is, when we're about -// to add a CV to the stack, we don't merge it with anything first, as the -// reference impl does. Instead we do merges using the *previous* CV that was -// added, which is sitting on top of the stack, and we put the new CV -// (unmerged) on top of the stack afterwards. This guarantees that we never -// merge the root node until finalize(). -// -// Solving the first problem requires an additional tool, -// compress_subtree_to_parent_node(). That function always returns the top -// *two* chaining values of the subtree it's compressing. We then do lazy -// merging with each of them separately, so that the second CV will always -// remain unmerged. (That also helps us support extendable output when we're -// hashing an input all-at-once.) -INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], - uint64_t chunk_counter) { - hasher_merge_cv_stack(self, chunk_counter); - memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, - BLAKE3_OUT_LEN); - self->cv_stack_len += 1; -} - -void blake3_hasher_update(blake3_hasher *self, const void *input, - size_t input_len) { - // Explicitly checking for zero avoids causing UB by passing a null pointer - // to memcpy. This comes up in practice with things like: - // std::vector<uint8_t> v; - // blake3_hasher_update(&hasher, v.data(), v.size()); - if (input_len == 0) { - return; - } - - const uint8_t *input_bytes = (const uint8_t *)input; - - // If we have some partial chunk bytes in the internal chunk_state, we need - // to finish that chunk first. - if (chunk_state_len(&self->chunk) > 0) { - size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); - if (take > input_len) { - take = input_len; - } - chunk_state_update(&self->chunk, input_bytes, take); - input_bytes += take; - input_len -= take; - // If we've filled the current chunk and there's more coming, finalize this - // chunk and proceed. In this case we know it's not the root. - if (input_len > 0) { - output_t output = chunk_state_output(&self->chunk); - uint8_t chunk_cv[32]; - output_chaining_value(&output, chunk_cv); - hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); - chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); - } else { - return; - } - } - - // Now the chunk_state is clear, and we have more input. If there's more than - // a single chunk (so, definitely not the root chunk), hash the largest whole - // subtree we can, with the full benefits of SIMD (and maybe in the future, - // multi-threading) parallelism.
Two restrictions: - // - The subtree has to be a power-of-2 number of chunks. Only subtrees along - // the right edge can be incomplete, and we don't know where the right edge - // is going to be until we get to finalize(). - // - The subtree must evenly divide the total number of chunks up until this - // point (if total is not 0). If the current incomplete subtree is only - // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have - // to complete the current subtree first. - // Because we might need to break up the input to form powers of 2, or to - // evenly divide what we already have, this part runs in a loop. - while (input_len > BLAKE3_CHUNK_LEN) { - size_t subtree_len = round_down_to_power_of_2(input_len); - uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; - // Shrink the subtree_len until it evenly divides the count so far. We know - // that subtree_len itself is a power of 2, so we can use a bitmasking - // trick instead of an actual remainder operation. (Note that if the caller - // consistently passes power-of-2 inputs of the same size, as is hopefully - // typical, this loop condition will always fail, and subtree_len will - // always be the full length of the input.) - // - // An aside: We don't have to shrink subtree_len quite this much. For - // example, if count_so_far is 1, we could pass 2 chunks to - // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still - // get the right answer in the end, and we might get to use 2-way SIMD - // parallelism. The problem with this optimization is that it gets us - // stuck always hashing 2 chunks. The total number of chunks will remain - // odd, and we'll never graduate to higher degrees of parallelism. See - // https://github.com/BLAKE3-team/BLAKE3/issues/69. - while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { - subtree_len /= 2; - } - // The shrunken subtree_len might now be 1 chunk long. If so, hash that one - // chunk by itself. Otherwise, compress the subtree into a pair of CVs. - uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; - if (subtree_len <= BLAKE3_CHUNK_LEN) { - blake3_chunk_state chunk_state; - chunk_state_init(&chunk_state, self->key, self->chunk.flags); - chunk_state.chunk_counter = self->chunk.chunk_counter; - chunk_state_update(&chunk_state, input_bytes, subtree_len); - output_t output = chunk_state_output(&chunk_state); - uint8_t cv[BLAKE3_OUT_LEN]; - output_chaining_value(&output, cv); - hasher_push_cv(self, cv, chunk_state.chunk_counter); - } else { - // This is the high-performance happy path, though getting here depends - // on the caller giving us a long enough input. - uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; - compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, - self->chunk.chunk_counter, - self->chunk.flags, cv_pair); - hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); - hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], - self->chunk.chunk_counter + (subtree_chunks / 2)); - } - self->chunk.chunk_counter += subtree_chunks; - input_bytes += subtree_len; - input_len -= subtree_len; - } - - // If there's any remaining input less than a full chunk, add it to the chunk - // state. In that case, also do a final merge loop to make sure the subtree - // stack doesn't contain any unmerged pairs. The remaining input means we - // know these merges are non-root.
This merge loop isn't strictly necessary - // here, because hasher_push_cv already does its own merge loop, but it - // simplifies blake3_hasher_finalize below. - if (input_len > 0) { - chunk_state_update(&self->chunk, input_bytes, input_len); - hasher_merge_cv_stack(self, self->chunk.chunk_counter); - } -} - -void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, - size_t out_len) { - blake3_hasher_finalize_seek(self, 0, out, out_len); -} - -void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, - uint8_t *out, size_t out_len) { - // Explicitly checking for zero avoids causing UB by passing a null pointer - // to memcpy. This comes up in practice with things like: - // std::vector<uint8_t> v; - // blake3_hasher_finalize(&hasher, v.data(), v.size()); - if (out_len == 0) { - return; - } - - // If the subtree stack is empty, then the current chunk is the root. - if (self->cv_stack_len == 0) { - output_t output = chunk_state_output(&self->chunk); - output_root_bytes(&output, seek, out, out_len); - return; - } - // If there are any bytes in the chunk state, finalize that chunk and do a - // roll-up merge between that chunk hash and every subtree in the stack. In - // this case, the extra merge loop at the end of blake3_hasher_update - // guarantees that none of the subtrees in the stack need to be merged with - // each other first. Otherwise, if there are no bytes in the chunk state, - // then the top of the stack is a chunk hash, and we start the merge from - // that. - output_t output; - size_t cvs_remaining; - if (chunk_state_len(&self->chunk) > 0) { - cvs_remaining = self->cv_stack_len; - output = chunk_state_output(&self->chunk); - } else { - // There are always at least 2 CVs in the stack in this case. - cvs_remaining = self->cv_stack_len - 2; - output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, - self->chunk.flags); - } - while (cvs_remaining > 0) { - cvs_remaining -= 1; - uint8_t parent_block[BLAKE3_BLOCK_LEN]; - memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); - output_chaining_value(&output, &parent_block[32]); - output = parent_output(parent_block, self->key, self->chunk.flags); - } - output_root_bytes(&output, seek, out, out_len); -} diff --git a/src/b3/blake3.h b/src/b3/blake3.h deleted file mode 100644 index 5060e38b7..000000000 --- a/src/b3/blake3.h +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef BLAKE3_H -#define BLAKE3_H - -#include <stddef.h> -#include <stdint.h> - -#ifdef __cplusplus extern "C" { -#endif - -#define BLAKE3_KEY_LEN 32 -#define BLAKE3_OUT_LEN 32 -#define BLAKE3_BLOCK_LEN 64 -#define BLAKE3_CHUNK_LEN 1024 -#define BLAKE3_MAX_DEPTH 54 -#define BLAKE3_MAX_SIMD_DEGREE 16 - -// This struct is a private implementation detail. It has to be here because -// it's part of blake3_hasher below. -typedef struct { - uint32_t cv[8]; - uint64_t chunk_counter; - uint8_t buf[BLAKE3_BLOCK_LEN]; - uint8_t buf_len; - uint8_t blocks_compressed; - uint8_t flags; -} blake3_chunk_state; - -typedef struct { - uint32_t key[8]; - blake3_chunk_state chunk; - uint8_t cv_stack_len; - // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, - // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk - // requires a 4th entry, rather than merging everything down to 1, because we - // don't know whether more input is coming. This is different from how the - // reference implementation does things.
- uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; -} blake3_hasher; - -void blake3_hasher_init(blake3_hasher *self); -void blake3_hasher_init_keyed(blake3_hasher *self, - const uint8_t key[BLAKE3_KEY_LEN]); -void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); -void blake3_hasher_update(blake3_hasher *self, const void *input, - size_t input_len); -void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, - size_t out_len); -void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, - uint8_t *out, size_t out_len); - -#ifdef __cplusplus } -#endif - -#endif /* BLAKE3_H */ diff --git a/src/b3/blake3_avx2.c b/src/b3/blake3_avx2.c deleted file mode 100644 index c5a2ce9e2..000000000 --- a/src/b3/blake3_avx2.c +++ /dev/null @@ -1,325 +0,0 @@ -#include "blake3_impl.h" - -#include <immintrin.h> - -#define DEGREE 8 - -INLINE __m256i loadu(const uint8_t src[32]) { - return _mm256_loadu_si256((const __m256i *)src); -} - -INLINE void storeu(__m256i src, uint8_t dest[32]) { - _mm256_storeu_si256((__m256i *)dest, src); -} - -INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } - -// Note that clang-format doesn't like the name "xor" for some reason. -INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } - -INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } - -INLINE __m256i rot16(__m256i x) { - return _mm256_shuffle_epi8( - x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, - 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); -} - -INLINE __m256i rot12(__m256i x) { - return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); -} - -INLINE __m256i rot8(__m256i x) { - return _mm256_shuffle_epi8( - x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, - 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); -} - -INLINE __m256i rot7(__m256i x) { - return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); -} - -INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - 
v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -INLINE void transpose_vecs(__m256i vecs[DEGREE]) { - // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high - // is 22/33/66/77. - __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); - __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); - __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); - __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); - __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); - __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); - __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); - __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); - - // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is - // 11/33. - __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); - __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); - __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); - __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); - __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); - __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); - __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); - __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); - - // Interleave 128-bit lanes. 
- vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); - vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); - vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); - vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); - vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); - vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); - vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); - vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); -} - -INLINE void transpose_msg_vecs(const uint8_t *const *inputs, - size_t block_offset, __m256i out[16]) { - out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]); - out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]); - out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); - out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]); - out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]); - out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]); - out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]); - out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]); - out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]); - out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]); - out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); - out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]); - out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]); - out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); - out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); - out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); - for (size_t i = 0; i < 8; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs(&out[0]); - transpose_vecs(&out[8]); -} - -INLINE void load_counters(uint64_t counter, bool increment_counter, - __m256i *out_lo, __m256i *out_hi) { - const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter); - const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - const __m256i add1 = _mm256_and_si256(mask, add0); - __m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1); - __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), - _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); - __m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry); - *out_lo = l; - *out_hi = h; -} - -void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m256i h_vecs[8] = { - set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), - set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), - }; - __m256i counter_low_vec, counter_high_vec; - load_counters(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN); - __m256i block_flags_vec = set1(block_flags); - __m256i msg_vecs[16]; - transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m256i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn(v, msg_vecs, 0); 
- round_fn(v, msg_vecs, 1); - round_fn(v, msg_vecs, 2); - round_fn(v, msg_vecs, 3); - round_fn(v, msg_vecs, 4); - round_fn(v, msg_vecs, 5); - round_fn(v, msg_vecs, 6); - h_vecs[0] = xorv(v[0], v[8]); - h_vecs[1] = xorv(v[1], v[9]); - h_vecs[2] = xorv(v[2], v[10]); - h_vecs[3] = xorv(v[3], v[11]); - h_vecs[4] = xorv(v[4], v[12]); - h_vecs[5] = xorv(v[5], v[13]); - h_vecs[6] = xorv(v[6], v[14]); - h_vecs[7] = xorv(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(h_vecs); - storeu(h_vecs[0], &out[0 * sizeof(__m256i)]); - storeu(h_vecs[1], &out[1 * sizeof(__m256i)]); - storeu(h_vecs[2], &out[2 * sizeof(__m256i)]); - storeu(h_vecs[3], &out[3 * sizeof(__m256i)]); - storeu(h_vecs[4], &out[4 * sizeof(__m256i)]); - storeu(h_vecs[5], &out[5 * sizeof(__m256i)]); - storeu(h_vecs[6], &out[6 * sizeof(__m256i)]); - storeu(h_vecs[7], &out[7 * sizeof(__m256i)]); -} - -#if !defined(BLAKE3_NO_SSE41) -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#else -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif - -void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs >= DEGREE) { - blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += DEGREE; - } - inputs += DEGREE; - num_inputs -= DEGREE; - out = &out[DEGREE * BLAKE3_OUT_LEN]; - } -#if !defined(BLAKE3_NO_SSE41) - blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); -#else - blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); -#endif -} diff --git a/src/b3/blake3_avx2_x86-64_unix.S b/src/b3/blake3_avx2_x86-64_unix.S deleted file mode 100644 index d2b14d440..000000000 --- a/src/b3/blake3_avx2_x86-64_unix.S +++ /dev/null @@ -1,1802 +0,0 @@ -#ifdef __x86_64__ -.intel_syntax noprefix -.global _blake3_hash_many_avx2 -.global blake3_hash_many_avx2 -#ifdef __APPLE__ -.text -#else -.section .text -#endif - .p2align 6 -_blake3_hash_many_avx2: -blake3_hash_many_avx2: - push r15 - push r14 - push r13 - push r12 - push rbx - push rbp - mov rbp, rsp - sub rsp, 680 - and rsp, 0xFFFFFFFFFFFFFFC0 - neg r9d - vmovd xmm0, r9d - vpbroadcastd ymm0, xmm0 - vmovdqa ymmword ptr [rsp+0x280], ymm0 - vpand ymm1, ymm0, ymmword ptr [ADD0+rip] - vpand ymm2, ymm0, ymmword ptr [ADD1+rip] - vmovdqa ymmword ptr [rsp+0x220], ymm2 - vmovd xmm2, r8d - vpbroadcastd ymm2, xmm2 - vpaddd ymm2, ymm2, ymm1 - vmovdqa ymmword ptr [rsp+0x240], ymm2 - vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] - vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] - vpcmpgtd ymm2, ymm1, ymm2 - shr r8, 32 - vmovd xmm3, r8d - vpbroadcastd ymm3, xmm3 - vpsubd ymm3, ymm3, ymm2 - vmovdqa ymmword ptr [rsp+0x260], ymm3 - shl rdx, 6 - mov qword ptr [rsp+0x2A0], rdx - cmp rsi, 8 - jc 3f -2: - vpbroadcastd ymm0, dword ptr [rcx] - vpbroadcastd ymm1, dword ptr [rcx+0x4] - vpbroadcastd ymm2, dword ptr [rcx+0x8] - 
vpbroadcastd ymm3, dword ptr [rcx+0xC] - vpbroadcastd ymm4, dword ptr [rcx+0x10] - vpbroadcastd ymm5, dword ptr [rcx+0x14] - vpbroadcastd ymm6, dword ptr [rcx+0x18] - vpbroadcastd ymm7, dword ptr [rcx+0x1C] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x20] - mov r13, qword ptr [rdi+0x28] - mov r14, qword ptr [rdi+0x30] - mov r15, qword ptr [rdi+0x38] - movzx eax, byte ptr [rbp+0x38] - movzx ebx, byte ptr [rbp+0x40] - or eax, ebx - xor edx, edx -.p2align 5 -9: - movzx ebx, byte ptr [rbp+0x48] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x2A0] - cmove eax, ebx - mov dword ptr [rsp+0x200], eax - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x40] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x40] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x20], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x40], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x60], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x30] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x30] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x80], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0xA0], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0xC0], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0xE0], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x20] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x20] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x100], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x120], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x140], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x160], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x10] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x10] - vinsertf128 ymm10, ymm10, xmmword ptr 
[r14+rdx-0x10], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x180], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x1A0], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x1C0], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x1E0], ymm11 - vpbroadcastd ymm15, dword ptr [rsp+0x200] - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - vpaddd ymm0, ymm0, ymmword ptr [rsp] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] - vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] - vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] - vpxor ymm15, ymm3, ymm15 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] - vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] - vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] - vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - 
vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - 
vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - 
vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, 
ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm5 - 
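- # The ymmword ptr [rsp+...] operands are the sixteen transposed message
- # vectors, one 8-lane vector per message word, spilled at 32-byte
- # intervals from [rsp] through [rsp+0x1E0]; the offsets vary from round
- # to round because the BLAKE3 message permutation is unrolled directly
- # into the instruction stream. [rsp+0x200] is a spill slot for ymm8,
- # freeing that register as scratch for the rotates and shuffle masks.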
vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr 
[ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, 
ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 
20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vpxor ymm0, ymm0, ymm8 - vpxor ymm1, ymm1, ymm9 - vpxor ymm2, ymm2, ymm10 - vpxor ymm3, ymm3, ymm11 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, 
ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpxor ymm4, ymm4, ymm12 - vpxor ymm5, ymm5, ymm13 - vpxor ymm6, ymm6, ymm14 - vpxor ymm7, ymm7, ymm15 - movzx eax, byte ptr [rbp+0x38] - jne 9b - mov rbx, qword ptr [rbp+0x50] - vunpcklps ymm8, ymm0, ymm1 - vunpcklps ymm9, ymm2, ymm3 - vunpckhps ymm10, ymm0, ymm1 - vunpcklps ymm11, ymm4, ymm5 - vunpcklps ymm0, ymm6, ymm7 - vshufps ymm12, ymm8, ymm9, 78 - vblendps ymm1, ymm8, ymm12, 0xCC - vshufps ymm8, ymm11, ymm0, 78 - vunpckhps ymm13, ymm2, ymm3 - vblendps ymm2, ymm11, ymm8, 0xCC - vblendps ymm3, ymm12, ymm9, 0xCC - vperm2f128 ymm12, ymm1, ymm2, 0x20 - vmovups ymmword ptr [rbx], ymm12 - vunpckhps ymm14, ymm4, ymm5 - vblendps ymm4, ymm8, ymm0, 0xCC - vunpckhps ymm15, ymm6, ymm7 - vperm2f128 ymm7, ymm3, ymm4, 0x20 - vmovups ymmword ptr [rbx+0x20], ymm7 - vshufps ymm5, ymm10, ymm13, 78 - vblendps ymm6, ymm5, ymm13, 0xCC - vshufps ymm13, ymm14, ymm15, 78 - vblendps ymm10, ymm10, ymm5, 0xCC - vblendps ymm14, ymm14, ymm13, 0xCC - vperm2f128 ymm8, ymm10, ymm14, 0x20 - vmovups ymmword ptr [rbx+0x40], ymm8 - vblendps ymm15, ymm13, ymm15, 0xCC - vperm2f128 ymm13, ymm6, ymm15, 0x20 - vmovups ymmword ptr [rbx+0x60], ymm13 - vperm2f128 ymm9, ymm1, ymm2, 0x31 - vperm2f128 ymm11, ymm3, ymm4, 0x31 - vmovups ymmword ptr [rbx+0x80], ymm9 - vperm2f128 ymm14, ymm10, ymm14, 0x31 - vperm2f128 ymm15, ymm6, ymm15, 0x31 - vmovups ymmword ptr [rbx+0xA0], ymm11 - vmovups ymmword ptr [rbx+0xC0], ymm14 - vmovups ymmword ptr [rbx+0xE0], ymm15 - vmovdqa ymm0, ymmword ptr [rsp+0x220] - vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] - vmovdqa ymmword ptr [rsp+0x240], ymm1 - vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] - vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] - vpcmpgtd ymm2, ymm0, ymm2 - vmovdqa ymm0, ymmword ptr [rsp+0x260] - vpsubd ymm2, ymm0, ymm2 - vmovdqa ymmword ptr [rsp+0x260], ymm2 - add rdi, 64 - add rbx, 256 - mov qword ptr [rbp+0x50], rbx - sub rsi, 8 - cmp rsi, 8 - jnc 2b - test rsi, rsi - jnz 3f -4: - vzeroupper - mov rsp, rbp - pop rbp - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 5 -3: - mov rbx, qword ptr [rbp+0x50] - mov r15, qword ptr [rsp+0x2A0] - movzx r13d, byte ptr [rbp+0x38] - movzx r12d, byte ptr [rbp+0x48] - test rsi, 0x4 - je 3f - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovdqa ymm8, ymm0 - vmovdqa ymm9, ymm1 - vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] - vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] - vpunpckldq ymm14, ymm12, ymm13 - vpunpckhdq ymm15, ymm12, ymm13 - vpermq ymm14, ymm14, 0x50 - vpermq ymm15, ymm15, 0x50 - vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] - vpblendd ymm14, ymm14, ymm12, 0x44 - vpblendd ymm15, ymm15, ymm12, 0x44 - vmovdqa ymmword ptr [rsp], ymm14 - vmovdqa ymmword ptr [rsp+0x20], ymm15 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x200], eax - vmovups ymm2, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm3, ymmword ptr [r8+rdx-0x30] - vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 - vshufps ymm4, ymm2, ymm3, 136 - vshufps ymm5, ymm2, ymm3, 221 - vmovups ymm2, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm3, ymmword ptr [r8+rdx-0x10] - 
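- # Tail path, four inputs at a time: each ymm register carries two inputs
- # side by side, one per 128-bit lane (inputs r8/r9 use ymm0-ymm7, inputs
- # r10/r11 use ymm8-ymm15). vmovups/vinsertf128 pack the same 16-byte
- # block slice from two inputs into one register, and vshufps with
- # immediates 136 and 221 (0b10001000 / 0b11011101) deinterleaves the
- # even and odd message words.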
vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm2, ymm3, 136 - vshufps ymm7, ymm2, ymm3, 221 - vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - vmovups ymm10, ymmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 - vmovups ymm11, ymmword ptr [r10+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 - vshufps ymm12, ymm10, ymm11, 136 - vshufps ymm13, ymm10, ymm11, 221 - vmovups ymm10, ymmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 - vmovups ymm11, ymmword ptr [r10+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 - vshufps ymm14, ymm10, ymm11, 136 - vshufps ymm15, ymm10, ymm11, 221 - vpshufd ymm14, ymm14, 0x93 - vpshufd ymm15, ymm15, 0x93 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - vpbroadcastd ymm2, dword ptr [rsp+0x200] - vmovdqa ymm3, ymmword ptr [rsp] - vmovdqa ymm11, ymmword ptr [rsp+0x20] - vpblendd ymm3, ymm3, ymm2, 0x88 - vpblendd ymm11, ymm11, ymm2, 0x88 - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vmovdqa ymm10, ymm2 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm8, ymm8, ymm12 - vmovdqa ymmword ptr [rsp+0x40], ymm4 - nop - vmovdqa ymmword ptr [rsp+0x60], ymm12 - nop - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 12 - vpslld ymm9, ymm9, 20 - vpor ymm9, ymm9, ymm4 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vmovdqa ymmword ptr [rsp+0x80], ymm5 - vmovdqa ymmword ptr [rsp+0xA0], ymm13 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 7 - vpslld ymm9, ymm9, 25 - vpor ymm9, ymm9, ymm4 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm8, ymm8, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm11, ymm11, 0x4E - vpshufd ymm2, ymm2, 0x39 - vpshufd ymm10, ymm10, 0x39 - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm8, ymm8, ymm14 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 12 - vpslld ymm9, ymm9, 20 - vpor ymm9, ymm9, ymm4 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm8, ymm8, ymm15 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 7 - vpslld ymm9, ymm9, 25 - vpor ymm9, ymm9, ymm4 - 
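- # Compact round loop: "mov al, 7" above counts BLAKE3's seven rounds,
- # with the four state rows for inputs 0/1 in ymm0-ymm3 and for inputs
- # 2/3 in ymm8-ymm11. Between rounds the message registers are permuted
- # in place by the vshufps/vpshufd/vpblendd block after the "dec al"
- # test, rather than being reloaded through a schedule table.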
vpshufd ymm0, ymm0, 0x39 - vpshufd ymm8, ymm8, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm11, ymm11, 0x4E - vpshufd ymm2, ymm2, 0x93 - vpshufd ymm10, ymm10, 0x93 - dec al - je 9f - vmovdqa ymm4, ymmword ptr [rsp+0x40] - vmovdqa ymm5, ymmword ptr [rsp+0x80] - vshufps ymm12, ymm4, ymm5, 214 - vpshufd ymm13, ymm4, 0x0F - vpshufd ymm4, ymm12, 0x39 - vshufps ymm12, ymm6, ymm7, 250 - vpblendd ymm13, ymm13, ymm12, 0xAA - vpunpcklqdq ymm12, ymm7, ymm5 - vpblendd ymm12, ymm12, ymm6, 0x88 - vpshufd ymm12, ymm12, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymmword ptr [rsp+0x40], ymm13 - vmovdqa ymmword ptr [rsp+0x80], ymm12 - vmovdqa ymm12, ymmword ptr [rsp+0x60] - vmovdqa ymm13, ymmword ptr [rsp+0xA0] - vshufps ymm5, ymm12, ymm13, 214 - vpshufd ymm6, ymm12, 0x0F - vpshufd ymm12, ymm5, 0x39 - vshufps ymm5, ymm14, ymm15, 250 - vpblendd ymm6, ymm6, ymm5, 0xAA - vpunpcklqdq ymm5, ymm15, ymm13 - vpblendd ymm5, ymm5, ymm14, 0x88 - vpshufd ymm5, ymm5, 0x78 - vpunpckhdq ymm13, ymm13, ymm15 - vpunpckldq ymm14, ymm14, ymm13 - vpshufd ymm15, ymm14, 0x1E - vmovdqa ymm13, ymm6 - vmovdqa ymm14, ymm5 - vmovdqa ymm5, ymmword ptr [rsp+0x40] - vmovdqa ymm6, ymmword ptr [rsp+0x80] - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - vpxor ymm8, ymm8, ymm10 - vpxor ymm9, ymm9, ymm11 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovdqu xmmword ptr [rbx+0x40], xmm8 - vmovdqu xmmword ptr [rbx+0x50], xmm9 - vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 - vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 - vmovaps xmm8, xmmword ptr [rsp+0x280] - vmovaps xmm0, xmmword ptr [rsp+0x240] - vmovaps xmm1, xmmword ptr [rsp+0x250] - vmovaps xmm2, xmmword ptr [rsp+0x260] - vmovaps xmm3, xmmword ptr [rsp+0x270] - vblendvps xmm0, xmm0, xmm1, xmm8 - vblendvps xmm2, xmm2, xmm3, xmm8 - vmovaps xmmword ptr [rsp+0x240], xmm0 - vmovaps xmmword ptr [rsp+0x260], xmm2 - add rbx, 128 - add rdi, 32 - sub rsi, 4 -3: - test rsi, 0x2 - je 3f - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovd xmm13, dword ptr [rsp+0x240] - vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 - vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovd xmm14, dword ptr [rsp+0x244] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vinserti128 ymm13, ymm13, xmm14, 0x01 - vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] - vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x200], eax - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vpbroadcastd ymm8, dword ptr [rsp+0x200] - vpblendd ymm3, ymm13, ymm8, 0x88 - vmovups ymm8, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 - vshufps ymm4, ymm8, ymm9, 136 - vshufps ymm5, ymm8, ymm9, 221 - vmovups ymm8, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm8, ymm9, 136 - vshufps 
ymm7, ymm8, ymm9, 221 - vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm14 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm8 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm15 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm8 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x39 - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm14 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm8 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm15 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm8 - vpshufd ymm0, ymm0, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x93 - dec al - jz 9f - vshufps ymm8, ymm4, ymm5, 214 - vpshufd ymm9, ymm4, 0x0F - vpshufd ymm4, ymm8, 0x39 - vshufps ymm8, ymm6, ymm7, 250 - vpblendd ymm9, ymm9, ymm8, 0xAA - vpunpcklqdq ymm8, ymm7, ymm5 - vpblendd ymm8, ymm8, ymm6, 0x88 - vpshufd ymm8, ymm8, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymm5, ymm9 - vmovdqa ymm6, ymm8 - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovaps ymm8, ymmword ptr [rsp+0x280] - vmovaps ymm0, ymmword ptr [rsp+0x240] - vmovups ymm1, ymmword ptr [rsp+0x248] - vmovaps ymm2, ymmword ptr [rsp+0x260] - vmovups ymm3, ymmword ptr [rsp+0x268] - vblendvps ymm0, ymm0, ymm1, ymm8 - vblendvps ymm2, ymm2, ymm3, ymm8 - vmovaps ymmword ptr [rsp+0x240], ymm0 - vmovaps ymmword ptr [rsp+0x260], ymm2 - add rbx, 64 - add rdi, 16 - sub rsi, 2 -3: - test rsi, 0x1 - je 4b - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+0x10] - vmovd xmm3, dword ptr [rsp+0x240] - vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 - vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovdqa xmm14, xmmword ptr [ROT16+rip] - vmovdqa xmm15, xmmword ptr [ROT8+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovdqa xmm3, xmm13 - vpinsrd xmm3, xmm3, eax, 3 - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vmovups xmm9, xmmword ptr [r8+rdx-0x30] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vmovups xmm9, xmmword ptr [r8+rdx-0x10] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm14 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 12 - vpslld xmm1, xmm1, 20 - vpor xmm1, xmm1, xmm8 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm15 - vpaddd xmm2, xmm2, xmm3 - 
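- # vpshufd with immediates 0x93, 0x4E and 0x39 rotates the four dwords of
- # a row by one, two and three lane positions, moving the state between
- # column form and diagonal form so the same add/xor/rotate sequence
- # serves both halves of each round.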
vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 7 - vpslld xmm1, xmm1, 25 - vpor xmm1, xmm1, xmm8 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm14 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 12 - vpslld xmm1, xmm1, 20 - vpor xmm1, xmm1, xmm8 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm15 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 7 - vpslld xmm1, xmm1, 25 - vpor xmm1, xmm1, xmm8 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - jmp 4b - - -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif -.p2align 6 -ADD0: - .long 0, 1, 2, 3, 4, 5, 6, 7 -ADD1: - .long 8, 8, 8, 8, 8, 8, 8, 8 -BLAKE3_IV_0: - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A -BLAKE3_BLOCK_LEN: - .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 - .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 -ROT16: - .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 -ROT8: - .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 -CMP_MSB_MASK: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 -BLAKE3_IV: - .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A - -#endif // __x86_64__ diff --git a/src/b3/blake3_avx512.c b/src/b3/blake3_avx512.c deleted file mode 100644 index 77a5c385c..000000000 --- a/src/b3/blake3_avx512.c +++ /dev/null @@ -1,1204 +0,0 @@ -#include "blake3_impl.h" - -#include - -#define _mm_shuffle_ps2(a, b, c) \ - (_mm_castps_si128( \ - _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) - -INLINE __m128i loadu_128(const uint8_t src[16]) { - return _mm_loadu_si128((const __m128i *)src); -} - -INLINE __m256i loadu_256(const uint8_t src[32]) { - return _mm256_loadu_si256((const __m256i *)src); -} - -INLINE __m512i loadu_512(const uint8_t src[64]) { - return _mm512_loadu_si512((const __m512i *)src); -} - -INLINE void storeu_128(__m128i src, uint8_t dest[16]) { - _mm_storeu_si128((__m128i *)dest, src); -} - -INLINE void storeu_256(__m256i src, uint8_t dest[16]) { - _mm256_storeu_si256((__m256i *)dest, src); -} - -INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } - -INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } - -INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); } - -INLINE __m128i 
xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } - -INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } - -INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); } - -INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); } - -INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } - -INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); } - -INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); -} - -INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); } - -INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); } - -INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); } - -INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); } - -INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); } - -INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); } - -INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); } - -INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); } - -INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); } - -INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); } - -INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); } - -INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); } - -/* - * ---------------------------------------------------------------------------- - * compress_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = add_128(add_128(*row0, m), *row1); - *row3 = xor_128(*row3, *row0); - *row3 = rot16_128(*row3); - *row2 = add_128(*row2, *row3); - *row1 = xor_128(*row1, *row2); - *row1 = rot12_128(*row1); -} - -INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = add_128(add_128(*row0, m), *row1); - *row3 = xor_128(*row3, *row0); - *row3 = rot8_128(*row3); - *row2 = add_128(*row2, *row3); - *row1 = xor_128(*row1, *row2); - *row1 = rot7_128(*row1); -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. 
See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); -} - -INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); -} - -INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - rows[0] = loadu_128((uint8_t *)&cv[0]); - rows[1] = loadu_128((uint8_t *)&cv[4]); - rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); - rows[3] = set4(counter_low(counter), counter_high(counter), - (uint32_t)block_len, (uint32_t)flags); - - __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); - __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); - __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); - __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); - - __m128i t0, t1, t2, t3, tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. 
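-  // The fixed permutation is perm = {2, 6, 3, 10, 7, 0, 4, 13, 1, 11,
-  // 12, 5, 9, 14, 15, 8}: each round replaces message word i with word
-  // perm[i] from the round before. The shuffle/blend sequence below
-  // computes that reordering entirely in registers, which is why rounds
-  // 2 through 7 share identical bodies.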
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, 
_MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); -} - -void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu_128(xor_128(rows[0], rows[2]), &out[0]); - storeu_128(xor_128(rows[1], rows[3]), &out[16]); - storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); - storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); -} - -void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); - storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); -} - -/* - * ---------------------------------------------------------------------------- - * hash4_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) { - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = add_128(v[0], v[4]); - v[1] = add_128(v[1], v[5]); - v[2] = add_128(v[2], v[6]); - v[3] = add_128(v[3], v[7]); - v[12] = xor_128(v[12], v[0]); - v[13] = xor_128(v[13], v[1]); - v[14] = xor_128(v[14], v[2]); - v[15] = xor_128(v[15], v[3]); - v[12] = rot16_128(v[12]); - v[13] = rot16_128(v[13]); - v[14] = rot16_128(v[14]); - v[15] = rot16_128(v[15]); - v[8] = add_128(v[8], v[12]); - v[9] = add_128(v[9], v[13]); - v[10] = add_128(v[10], v[14]); - v[11] = add_128(v[11], v[15]); - v[4] = xor_128(v[4], v[8]); - v[5] = xor_128(v[5], v[9]); - v[6] = xor_128(v[6], v[10]); - v[7] = xor_128(v[7], v[11]); - v[4] = 
rot12_128(v[4]); - v[5] = rot12_128(v[5]); - v[6] = rot12_128(v[6]); - v[7] = rot12_128(v[7]); - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = add_128(v[0], v[4]); - v[1] = add_128(v[1], v[5]); - v[2] = add_128(v[2], v[6]); - v[3] = add_128(v[3], v[7]); - v[12] = xor_128(v[12], v[0]); - v[13] = xor_128(v[13], v[1]); - v[14] = xor_128(v[14], v[2]); - v[15] = xor_128(v[15], v[3]); - v[12] = rot8_128(v[12]); - v[13] = rot8_128(v[13]); - v[14] = rot8_128(v[14]); - v[15] = rot8_128(v[15]); - v[8] = add_128(v[8], v[12]); - v[9] = add_128(v[9], v[13]); - v[10] = add_128(v[10], v[14]); - v[11] = add_128(v[11], v[15]); - v[4] = xor_128(v[4], v[8]); - v[5] = xor_128(v[5], v[9]); - v[6] = xor_128(v[6], v[10]); - v[7] = xor_128(v[7], v[11]); - v[4] = rot7_128(v[4]); - v[5] = rot7_128(v[5]); - v[6] = rot7_128(v[6]); - v[7] = rot7_128(v[7]); - - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = add_128(v[0], v[5]); - v[1] = add_128(v[1], v[6]); - v[2] = add_128(v[2], v[7]); - v[3] = add_128(v[3], v[4]); - v[15] = xor_128(v[15], v[0]); - v[12] = xor_128(v[12], v[1]); - v[13] = xor_128(v[13], v[2]); - v[14] = xor_128(v[14], v[3]); - v[15] = rot16_128(v[15]); - v[12] = rot16_128(v[12]); - v[13] = rot16_128(v[13]); - v[14] = rot16_128(v[14]); - v[10] = add_128(v[10], v[15]); - v[11] = add_128(v[11], v[12]); - v[8] = add_128(v[8], v[13]); - v[9] = add_128(v[9], v[14]); - v[5] = xor_128(v[5], v[10]); - v[6] = xor_128(v[6], v[11]); - v[7] = xor_128(v[7], v[8]); - v[4] = xor_128(v[4], v[9]); - v[5] = rot12_128(v[5]); - v[6] = rot12_128(v[6]); - v[7] = rot12_128(v[7]); - v[4] = rot12_128(v[4]); - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = add_128(v[0], v[5]); - v[1] = add_128(v[1], v[6]); - v[2] = add_128(v[2], v[7]); - v[3] = add_128(v[3], v[4]); - v[15] = xor_128(v[15], v[0]); - v[12] = xor_128(v[12], v[1]); - v[13] = xor_128(v[13], v[2]); - v[14] = xor_128(v[14], v[3]); - v[15] = rot8_128(v[15]); - v[12] = rot8_128(v[12]); - v[13] = rot8_128(v[13]); - v[14] = rot8_128(v[14]); - v[10] = add_128(v[10], v[15]); - v[11] = add_128(v[11], v[12]); - v[8] = add_128(v[8], v[13]); - v[9] = add_128(v[9], v[14]); - v[5] = xor_128(v[5], v[10]); - v[6] = xor_128(v[6], v[11]); - v[7] = xor_128(v[7], v[8]); - v[4] = xor_128(v[4], v[9]); - v[5] = rot7_128(v[5]); - v[6] = rot7_128(v[6]); - v[7] = rot7_128(v[7]); - v[4] = rot7_128(v[4]); -} - -INLINE void transpose_vecs_128(__m128i vecs[4]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. 
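-  // transpose_vecs_128 transposes a 4x4 matrix of 32-bit words held in
-  // four vectors. hash4 keeps its state word-sliced (vector i holds word
-  // i of all four inputs), so this conversion between the lane-sliced
-  // layout and the in-memory layout of consecutive words per input is
-  // needed on both the message-load and output-store sides.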
- __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, - size_t block_offset, __m128i out[16]) { - out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]); - out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]); - out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]); - out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]); - out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]); - out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]); - out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]); - out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]); - out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]); - out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]); - out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]); - out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]); - out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]); - out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]); - out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); - out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]); - for (size_t i = 0; i < 4; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs_128(&out[0]); - transpose_vecs_128(&out[4]); - transpose_vecs_128(&out[8]); - transpose_vecs_128(&out[12]); -} - -INLINE void load_counters4(uint64_t counter, bool increment_counter, - __m128i *out_lo, __m128i *out_hi) { - uint64_t mask = (increment_counter ? 
~0 : 0); - __m256i mask_vec = _mm256_set1_epi64x(mask); - __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); - deltas = _mm256_and_si256(mask_vec, deltas); - __m256i counters = - _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas); - *out_lo = _mm256_cvtepi64_epi32(counters); - *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); -} - -void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m128i h_vecs[8] = { - set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), - set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), - }; - __m128i counter_low_vec, counter_high_vec; - load_counters4(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN); - __m128i block_flags_vec = set1_128(block_flags); - __m128i msg_vecs[16]; - transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m128i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn4(v, msg_vecs, 0); - round_fn4(v, msg_vecs, 1); - round_fn4(v, msg_vecs, 2); - round_fn4(v, msg_vecs, 3); - round_fn4(v, msg_vecs, 4); - round_fn4(v, msg_vecs, 5); - round_fn4(v, msg_vecs, 6); - h_vecs[0] = xor_128(v[0], v[8]); - h_vecs[1] = xor_128(v[1], v[9]); - h_vecs[2] = xor_128(v[2], v[10]); - h_vecs[3] = xor_128(v[3], v[11]); - h_vecs[4] = xor_128(v[4], v[12]); - h_vecs[5] = xor_128(v[5], v[13]); - h_vecs[6] = xor_128(v[6], v[14]); - h_vecs[7] = xor_128(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs_128(&h_vecs[0]); - transpose_vecs_128(&h_vecs[4]); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. 
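-  // Interleaving the stores below therefore writes each input's 32-byte
-  // chaining value contiguously: out[0..31] is input 0 (h_vecs[0] then
-  // h_vecs[4]), out[32..63] is input 1, and so on.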
- storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]); - storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]); - storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]); - storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]); - storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]); - storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]); - storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]); - storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]); -} - -/* - * ---------------------------------------------------------------------------- - * hash8_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) { - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = add_256(v[0], v[4]); - v[1] = add_256(v[1], v[5]); - v[2] = add_256(v[2], v[6]); - v[3] = add_256(v[3], v[7]); - v[12] = xor_256(v[12], v[0]); - v[13] = xor_256(v[13], v[1]); - v[14] = xor_256(v[14], v[2]); - v[15] = xor_256(v[15], v[3]); - v[12] = rot16_256(v[12]); - v[13] = rot16_256(v[13]); - v[14] = rot16_256(v[14]); - v[15] = rot16_256(v[15]); - v[8] = add_256(v[8], v[12]); - v[9] = add_256(v[9], v[13]); - v[10] = add_256(v[10], v[14]); - v[11] = add_256(v[11], v[15]); - v[4] = xor_256(v[4], v[8]); - v[5] = xor_256(v[5], v[9]); - v[6] = xor_256(v[6], v[10]); - v[7] = xor_256(v[7], v[11]); - v[4] = rot12_256(v[4]); - v[5] = rot12_256(v[5]); - v[6] = rot12_256(v[6]); - v[7] = rot12_256(v[7]); - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = add_256(v[0], v[4]); - v[1] = add_256(v[1], v[5]); - v[2] = add_256(v[2], v[6]); - v[3] = add_256(v[3], v[7]); - v[12] = xor_256(v[12], v[0]); - v[13] = xor_256(v[13], v[1]); - v[14] = xor_256(v[14], v[2]); - v[15] = xor_256(v[15], v[3]); - v[12] = rot8_256(v[12]); - v[13] = rot8_256(v[13]); - v[14] = rot8_256(v[14]); - v[15] = rot8_256(v[15]); - v[8] = add_256(v[8], v[12]); - v[9] = add_256(v[9], v[13]); - v[10] = add_256(v[10], v[14]); - v[11] = add_256(v[11], v[15]); - v[4] = xor_256(v[4], v[8]); - v[5] = xor_256(v[5], v[9]); - v[6] = xor_256(v[6], v[10]); - v[7] = xor_256(v[7], v[11]); - v[4] = rot7_256(v[4]); - v[5] = rot7_256(v[5]); - v[6] = rot7_256(v[6]); - v[7] = rot7_256(v[7]); - - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = add_256(v[0], v[5]); - v[1] = add_256(v[1], v[6]); - v[2] = add_256(v[2], v[7]); - v[3] = add_256(v[3], v[4]); - v[15] = xor_256(v[15], v[0]); - v[12] = xor_256(v[12], v[1]); - v[13] = xor_256(v[13], v[2]); - v[14] = xor_256(v[14], v[3]); - v[15] = rot16_256(v[15]); - v[12] = rot16_256(v[12]); - v[13] = rot16_256(v[13]); - v[14] = rot16_256(v[14]); - v[10] = add_256(v[10], v[15]); - v[11] = add_256(v[11], v[12]); - v[8] = add_256(v[8], v[13]); - v[9] = add_256(v[9], v[14]); - v[5] = xor_256(v[5], v[10]); - v[6] = xor_256(v[6], v[11]); - v[7] = xor_256(v[7], v[8]); - v[4] = xor_256(v[4], v[9]); - v[5] = rot12_256(v[5]); - v[6] = rot12_256(v[6]); - v[7] = rot12_256(v[7]); - v[4] = rot12_256(v[4]); - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = add_256(v[1], 
m[(size_t)MSG_SCHEDULE[r][11]]);
-  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
-  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
-  v[0] = add_256(v[0], v[5]);
-  v[1] = add_256(v[1], v[6]);
-  v[2] = add_256(v[2], v[7]);
-  v[3] = add_256(v[3], v[4]);
-  v[15] = xor_256(v[15], v[0]);
-  v[12] = xor_256(v[12], v[1]);
-  v[13] = xor_256(v[13], v[2]);
-  v[14] = xor_256(v[14], v[3]);
-  v[15] = rot8_256(v[15]);
-  v[12] = rot8_256(v[12]);
-  v[13] = rot8_256(v[13]);
-  v[14] = rot8_256(v[14]);
-  v[10] = add_256(v[10], v[15]);
-  v[11] = add_256(v[11], v[12]);
-  v[8] = add_256(v[8], v[13]);
-  v[9] = add_256(v[9], v[14]);
-  v[5] = xor_256(v[5], v[10]);
-  v[6] = xor_256(v[6], v[11]);
-  v[7] = xor_256(v[7], v[8]);
-  v[4] = xor_256(v[4], v[9]);
-  v[5] = rot7_256(v[5]);
-  v[6] = rot7_256(v[6]);
-  v[7] = rot7_256(v[7]);
-  v[4] = rot7_256(v[4]);
-}
-
-INLINE void transpose_vecs_256(__m256i vecs[8]) {
-  // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
-  // is 22/33/66/77.
-  __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
-  __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
-  __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
-  __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
-  __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
-  __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
-  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
-  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
-
-  // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
-  // 11/33.
-  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
-  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
-  __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
-  __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
-  __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
-  __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
-  __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
-  __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
-
-  // Interleave 128-bit lanes.
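-  // (_mm256_permute2x128_si256 picks one 128-bit half from each source:
-  //  imm8 0x20 gives [a_lo, b_lo] and 0x31 gives [a_hi, b_hi], pairing each
-  //  abcd_* half with its efgh_* counterpart below.)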
- vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); - vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); - vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); - vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); - vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); - vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); - vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); - vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); -} - -INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, - size_t block_offset, __m256i out[16]) { - out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]); - out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]); - out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]); - out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]); - out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]); - out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]); - out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]); - out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]); - out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]); - out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]); - out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]); - out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]); - out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]); - out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]); - out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]); - out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]); - for (size_t i = 0; i < 8; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs_256(&out[0]); - transpose_vecs_256(&out[8]); -} - -INLINE void load_counters8(uint64_t counter, bool increment_counter, - __m256i *out_lo, __m256i *out_hi) { - uint64_t mask = (increment_counter ? 
~0 : 0); - __m512i mask_vec = _mm512_set1_epi64(mask); - __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); - deltas = _mm512_and_si512(mask_vec, deltas); - __m512i counters = - _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas); - *out_lo = _mm512_cvtepi64_epi32(counters); - *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); -} - -void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m256i h_vecs[8] = { - set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), - set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), - }; - __m256i counter_low_vec, counter_high_vec; - load_counters8(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN); - __m256i block_flags_vec = set1_256(block_flags); - __m256i msg_vecs[16]; - transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m256i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn8(v, msg_vecs, 0); - round_fn8(v, msg_vecs, 1); - round_fn8(v, msg_vecs, 2); - round_fn8(v, msg_vecs, 3); - round_fn8(v, msg_vecs, 4); - round_fn8(v, msg_vecs, 5); - round_fn8(v, msg_vecs, 6); - h_vecs[0] = xor_256(v[0], v[8]); - h_vecs[1] = xor_256(v[1], v[9]); - h_vecs[2] = xor_256(v[2], v[10]); - h_vecs[3] = xor_256(v[3], v[11]); - h_vecs[4] = xor_256(v[4], v[12]); - h_vecs[5] = xor_256(v[5], v[13]); - h_vecs[6] = xor_256(v[6], v[14]); - h_vecs[7] = xor_256(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs_256(h_vecs); - storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]); - storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]); - storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]); - storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]); - storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]); - storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]); - storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]); - storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]); -} - -/* - * ---------------------------------------------------------------------------- - * hash16_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) { - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = add_512(v[0], v[4]); - v[1] = add_512(v[1], v[5]); - v[2] = add_512(v[2], v[6]); - v[3] = add_512(v[3], v[7]); - v[12] = xor_512(v[12], v[0]); - v[13] = xor_512(v[13], v[1]); - v[14] = xor_512(v[14], v[2]); - v[15] = xor_512(v[15], v[3]); - v[12] = rot16_512(v[12]); - v[13] = rot16_512(v[13]); - v[14] = rot16_512(v[14]); - v[15] = rot16_512(v[15]); - v[8] = add_512(v[8], v[12]); - v[9] = add_512(v[9], v[13]); - v[10] = add_512(v[10], v[14]); - v[11] = add_512(v[11], v[15]); - v[4] = xor_512(v[4], v[8]); - v[5] = xor_512(v[5], v[9]); - v[6] = xor_512(v[6], v[10]); - v[7] = xor_512(v[7], 
v[11]); - v[4] = rot12_512(v[4]); - v[5] = rot12_512(v[5]); - v[6] = rot12_512(v[6]); - v[7] = rot12_512(v[7]); - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = add_512(v[0], v[4]); - v[1] = add_512(v[1], v[5]); - v[2] = add_512(v[2], v[6]); - v[3] = add_512(v[3], v[7]); - v[12] = xor_512(v[12], v[0]); - v[13] = xor_512(v[13], v[1]); - v[14] = xor_512(v[14], v[2]); - v[15] = xor_512(v[15], v[3]); - v[12] = rot8_512(v[12]); - v[13] = rot8_512(v[13]); - v[14] = rot8_512(v[14]); - v[15] = rot8_512(v[15]); - v[8] = add_512(v[8], v[12]); - v[9] = add_512(v[9], v[13]); - v[10] = add_512(v[10], v[14]); - v[11] = add_512(v[11], v[15]); - v[4] = xor_512(v[4], v[8]); - v[5] = xor_512(v[5], v[9]); - v[6] = xor_512(v[6], v[10]); - v[7] = xor_512(v[7], v[11]); - v[4] = rot7_512(v[4]); - v[5] = rot7_512(v[5]); - v[6] = rot7_512(v[6]); - v[7] = rot7_512(v[7]); - - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = add_512(v[0], v[5]); - v[1] = add_512(v[1], v[6]); - v[2] = add_512(v[2], v[7]); - v[3] = add_512(v[3], v[4]); - v[15] = xor_512(v[15], v[0]); - v[12] = xor_512(v[12], v[1]); - v[13] = xor_512(v[13], v[2]); - v[14] = xor_512(v[14], v[3]); - v[15] = rot16_512(v[15]); - v[12] = rot16_512(v[12]); - v[13] = rot16_512(v[13]); - v[14] = rot16_512(v[14]); - v[10] = add_512(v[10], v[15]); - v[11] = add_512(v[11], v[12]); - v[8] = add_512(v[8], v[13]); - v[9] = add_512(v[9], v[14]); - v[5] = xor_512(v[5], v[10]); - v[6] = xor_512(v[6], v[11]); - v[7] = xor_512(v[7], v[8]); - v[4] = xor_512(v[4], v[9]); - v[5] = rot12_512(v[5]); - v[6] = rot12_512(v[6]); - v[7] = rot12_512(v[7]); - v[4] = rot12_512(v[4]); - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = add_512(v[0], v[5]); - v[1] = add_512(v[1], v[6]); - v[2] = add_512(v[2], v[7]); - v[3] = add_512(v[3], v[4]); - v[15] = xor_512(v[15], v[0]); - v[12] = xor_512(v[12], v[1]); - v[13] = xor_512(v[13], v[2]); - v[14] = xor_512(v[14], v[3]); - v[15] = rot8_512(v[15]); - v[12] = rot8_512(v[12]); - v[13] = rot8_512(v[13]); - v[14] = rot8_512(v[14]); - v[10] = add_512(v[10], v[15]); - v[11] = add_512(v[11], v[12]); - v[8] = add_512(v[8], v[13]); - v[9] = add_512(v[9], v[14]); - v[5] = xor_512(v[5], v[10]); - v[6] = xor_512(v[6], v[11]); - v[7] = xor_512(v[7], v[8]); - v[4] = xor_512(v[4], v[9]); - v[5] = rot7_512(v[5]); - v[6] = rot7_512(v[6]); - v[7] = rot7_512(v[7]); - v[4] = rot7_512(v[4]); -} - -// 0b10001000, or lanes a0/a2/b0/b2 in little-endian order -#define LO_IMM8 0x88 - -INLINE __m512i unpack_lo_128(__m512i a, __m512i b) { - return _mm512_shuffle_i32x4(a, b, LO_IMM8); -} - -// 0b11011101, or lanes a1/a3/b1/b3 in little-endian order -#define HI_IMM8 0xdd - -INLINE __m512i unpack_hi_128(__m512i a, __m512i b) { - return _mm512_shuffle_i32x4(a, b, HI_IMM8); -} - -INLINE void transpose_vecs_512(__m512i vecs[16]) { - // Interleave 32-bit lanes. The _0 unpack is lanes - // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes - // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15. 
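-  // (The 512-bit unpack instructions interleave within each 128-bit lane
-  //  independently, which is why the 128-bit shuffle stages further down are
-  //  still needed to finish the transpose.)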
-  __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
-  __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
-  __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
-  __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
-  __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
-  __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
-  __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
-  __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
-  __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
-  __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
-  __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
-  __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
-  __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
-  __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
-  __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
-  __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
-
-  // Interleave 64-bit lanes. The _0 unpack is lanes
-  // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
-  // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
-  // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
-  // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15.
-  __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
-  __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
-  __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
-  __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
-  __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
-  __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
-  __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
-  __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
-  __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
-  __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
-  __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
-  __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
-  __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
-  __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
-  __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
-  __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);
-
-  // Interleave 128-bit lanes. The _0 unpack is
-  // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is
-  // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on.
-  __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0);
-  __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1);
-  __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2);
-  __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3);
-  __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0);
-  __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1);
-  __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2);
-  __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3);
-  __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0);
-  __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1);
-  __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2);
-  __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3);
-  __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0);
-  __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1);
-  __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2);
-  __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3);
-
-  // Interleave 128-bit lanes again for the final outputs.
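-  // Net effect, as a scalar sketch over a hypothetical 16x16 matrix w of
-  // 32-bit words laid across vecs:
-  //   for (size_t r = 0; r < 16; r++)
-  //     for (size_t c = 0; c < 16; c++)
-  //       w_out[r][c] = w_in[c][r];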
- vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0); - vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1); - vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2); - vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3); - vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4); - vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5); - vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6); - vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7); - vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0); - vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1); - vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2); - vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3); - vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4); - vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5); - vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6); - vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7); -} - -INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, - size_t block_offset, __m512i out[16]) { - out[0] = loadu_512(&inputs[0][block_offset]); - out[1] = loadu_512(&inputs[1][block_offset]); - out[2] = loadu_512(&inputs[2][block_offset]); - out[3] = loadu_512(&inputs[3][block_offset]); - out[4] = loadu_512(&inputs[4][block_offset]); - out[5] = loadu_512(&inputs[5][block_offset]); - out[6] = loadu_512(&inputs[6][block_offset]); - out[7] = loadu_512(&inputs[7][block_offset]); - out[8] = loadu_512(&inputs[8][block_offset]); - out[9] = loadu_512(&inputs[9][block_offset]); - out[10] = loadu_512(&inputs[10][block_offset]); - out[11] = loadu_512(&inputs[11][block_offset]); - out[12] = loadu_512(&inputs[12][block_offset]); - out[13] = loadu_512(&inputs[13][block_offset]); - out[14] = loadu_512(&inputs[14][block_offset]); - out[15] = loadu_512(&inputs[15][block_offset]); - for (size_t i = 0; i < 16; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs_512(out); -} - -INLINE void load_counters16(uint64_t counter, bool increment_counter, - __m512i *out_lo, __m512i *out_hi) { - const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter); - const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - const __m512i add1 = _mm512_and_si512(mask, add0); - __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1); - __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT); - __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1)); - *out_lo = l; - *out_hi = h; -} - -void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, - uint8_t *out) { - __m512i h_vecs[8] = { - set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), - set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), - }; - __m512i counter_low_vec, counter_high_vec; - load_counters16(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN); - __m512i block_flags_vec = set1_512(block_flags); - __m512i msg_vecs[16]; - transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m512i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), - 
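-        // (rows of the compression state: 8 chaining-value words, 4 IV
-        //  constants, then counter lo/hi, block length, and flags)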
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn16(v, msg_vecs, 0); - round_fn16(v, msg_vecs, 1); - round_fn16(v, msg_vecs, 2); - round_fn16(v, msg_vecs, 3); - round_fn16(v, msg_vecs, 4); - round_fn16(v, msg_vecs, 5); - round_fn16(v, msg_vecs, 6); - h_vecs[0] = xor_512(v[0], v[8]); - h_vecs[1] = xor_512(v[1], v[9]); - h_vecs[2] = xor_512(v[2], v[10]); - h_vecs[3] = xor_512(v[3], v[11]); - h_vecs[4] = xor_512(v[4], v[12]); - h_vecs[5] = xor_512(v[5], v[13]); - h_vecs[6] = xor_512(v[6], v[14]); - h_vecs[7] = xor_512(v[7], v[15]); - - block_flags = flags; - } - - // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8 - // state vectors. Pad the matrix with zeros. After transposition, store the - // lower half of each vector. - __m512i padded[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_512(0), set1_512(0), set1_512(0), set1_512(0), - set1_512(0), set1_512(0), set1_512(0), set1_512(0), - }; - transpose_vecs_512(padded); - _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0])); - _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1])); - _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2])); - _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3])); - _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4])); - _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5])); - _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6])); - _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7])); - _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8])); - _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9])); - _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10])); - _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11])); - _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12])); - _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13])); - _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14])); - _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15])); -} - -/* - * ---------------------------------------------------------------------------- - * hash_many_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, BLAKE3_OUT_LEN); -} - -void blake3_hash_many_avx512(const uint8_t *const 
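-    // Dispatch sketch: inputs are consumed in batches of 16, then 8, then 4,
-    // with any remainder hashed one at a time via hash_one_avx512.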
*inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs >= 16) { - blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += 16; - } - inputs += 16; - num_inputs -= 16; - out = &out[16 * BLAKE3_OUT_LEN]; - } - while (num_inputs >= 8) { - blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += 8; - } - inputs += 8; - num_inputs -= 8; - out = &out[8 * BLAKE3_OUT_LEN]; - } - while (num_inputs >= 4) { - blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += 4; - } - inputs += 4; - num_inputs -= 4; - out = &out[4 * BLAKE3_OUT_LEN]; - } - while (num_inputs > 0) { - hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/src/b3/blake3_avx512_x86-64_unix.S b/src/b3/blake3_avx512_x86-64_unix.S deleted file mode 100644 index 621e1aa6d..000000000 --- a/src/b3/blake3_avx512_x86-64_unix.S +++ /dev/null @@ -1,2572 +0,0 @@ -#ifdef __x86_64__ -.intel_syntax noprefix - -.global _blake3_hash_many_avx512 -.global blake3_hash_many_avx512 -.global blake3_compress_in_place_avx512 -.global _blake3_compress_in_place_avx512 -.global blake3_compress_xof_avx512 -.global _blake3_compress_xof_avx512 - -#ifdef __APPLE__ -.text -#else -.section .text -#endif -.p2align 6 -_blake3_hash_many_avx512: -blake3_hash_many_avx512: - push r15 - push r14 - push r13 - push r12 - push rbx - push rbp - mov rbp, rsp - sub rsp, 144 - and rsp, 0xFFFFFFFFFFFFFFC0 - neg r9 - kmovw k1, r9d - vmovd xmm0, r8d - vpbroadcastd ymm0, xmm0 - shr r8, 32 - vmovd xmm1, r8d - vpbroadcastd ymm1, xmm1 - vmovdqa ymm4, ymm1 - vmovdqa ymm5, ymm1 - vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] - vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] - vpcmpltud k2, ymm2, ymm0 - vpcmpltud k3, ymm3, ymm0 - vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} - vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} - knotw k2, k1 - vmovdqa32 ymm2 {k2}, ymm0 - vmovdqa32 ymm3 {k2}, ymm0 - vmovdqa32 ymm4 {k2}, ymm1 - vmovdqa32 ymm5 {k2}, ymm1 - vmovdqa ymmword ptr [rsp], ymm2 - vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3 - vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 - vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 - shl rdx, 6 - mov qword ptr [rsp+0x80], rdx - cmp rsi, 16 - jc 3f -2: - vpbroadcastd zmm0, dword ptr [rcx] - vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] - vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] - vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] - vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] - vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] - vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] - vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] - movzx eax, byte ptr [rbp+0x38] - movzx ebx, byte ptr [rbp+0x40] - or eax, ebx - xor edx, edx -.p2align 5 -9: - movzx ebx, byte ptr [rbp+0x48] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x80] - cmove eax, ebx - mov dword ptr [rsp+0x88], eax - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x40] - mov r13, qword ptr [rdi+0x48] - mov r14, qword ptr [rdi+0x50] - mov r15, qword ptr 
[rdi+0x58] - vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] - vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 - vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] - vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 - vpunpcklqdq zmm8, zmm16, zmm17 - vpunpckhqdq zmm9, zmm16, zmm17 - vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] - vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 - vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] - vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 - vpunpcklqdq zmm10, zmm18, zmm19 - vpunpckhqdq zmm11, zmm18, zmm19 - mov r8, qword ptr [rdi+0x20] - mov r9, qword ptr [rdi+0x28] - mov r10, qword ptr [rdi+0x30] - mov r11, qword ptr [rdi+0x38] - mov r12, qword ptr [rdi+0x60] - mov r13, qword ptr [rdi+0x68] - mov r14, qword ptr [rdi+0x70] - mov r15, qword ptr [rdi+0x78] - vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] - vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 - vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] - vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 - vpunpcklqdq zmm12, zmm16, zmm17 - vpunpckhqdq zmm13, zmm16, zmm17 - vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] - vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 - vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] - vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 - vpunpcklqdq zmm14, zmm18, zmm19 - vpunpckhqdq zmm15, zmm18, zmm19 - vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] - vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] - vshufps zmm16, zmm8, zmm10, 136 - vshufps zmm17, zmm12, zmm14, 136 - vmovdqa32 zmm20, zmm16 - vpermt2d zmm16, zmm27, zmm17 - vpermt2d zmm20, zmm31, zmm17 - vshufps zmm17, zmm8, zmm10, 221 - vshufps zmm30, zmm12, zmm14, 221 - vmovdqa32 zmm21, zmm17 - vpermt2d zmm17, zmm27, zmm30 - vpermt2d zmm21, zmm31, zmm30 - vshufps zmm18, zmm9, zmm11, 136 - vshufps zmm8, zmm13, zmm15, 136 - vmovdqa32 zmm22, zmm18 - vpermt2d zmm18, zmm27, zmm8 - vpermt2d zmm22, zmm31, zmm8 - vshufps zmm19, zmm9, zmm11, 221 - vshufps zmm8, zmm13, zmm15, 221 - vmovdqa32 zmm23, zmm19 - vpermt2d zmm19, zmm27, zmm8 - vpermt2d zmm23, zmm31, zmm8 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x40] - mov r13, qword ptr [rdi+0x48] - mov r14, qword ptr [rdi+0x50] - mov r15, qword ptr [rdi+0x58] - vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm8, zmm24, zmm25 - vpunpckhqdq zmm9, zmm24, zmm25 - vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm10, zmm24, zmm25 - vpunpckhqdq zmm11, zmm24, zmm25 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - mov r8, qword ptr [rdi+0x20] - mov r9, qword ptr [rdi+0x28] - mov r10, qword ptr [rdi+0x30] - mov r11, qword ptr [rdi+0x38] - mov r12, qword ptr [rdi+0x60] - mov r13, qword ptr [rdi+0x68] - mov r14, qword ptr [rdi+0x70] - mov r15, qword ptr [rdi+0x78] - vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] - vinserti64x4 
zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm12, zmm24, zmm25 - vpunpckhqdq zmm13, zmm24, zmm25 - vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm14, zmm24, zmm25 - vpunpckhqdq zmm15, zmm24, zmm25 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - vshufps zmm24, zmm8, zmm10, 136 - vshufps zmm30, zmm12, zmm14, 136 - vmovdqa32 zmm28, zmm24 - vpermt2d zmm24, zmm27, zmm30 - vpermt2d zmm28, zmm31, zmm30 - vshufps zmm25, zmm8, zmm10, 221 - vshufps zmm30, zmm12, zmm14, 221 - vmovdqa32 zmm29, zmm25 - vpermt2d zmm25, zmm27, zmm30 - vpermt2d zmm29, zmm31, zmm30 - vshufps zmm26, zmm9, zmm11, 136 - vshufps zmm8, zmm13, zmm15, 136 - vmovdqa32 zmm30, zmm26 - vpermt2d zmm26, zmm27, zmm8 - vpermt2d zmm30, zmm31, zmm8 - vshufps zmm8, zmm9, zmm11, 221 - vshufps zmm10, zmm13, zmm15, 221 - vpermi2d zmm27, zmm8, zmm10 - vpermi2d zmm31, zmm8, zmm10 - vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] - vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] - vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] - vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] - vmovdqa32 zmm12, zmmword ptr [rsp] - vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] - vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] - vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] - vpaddd zmm0, zmm0, zmm16 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm20 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm17 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm21 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm24 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm28 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 
16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm25 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm29 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm18 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm23 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm22 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm16 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm17 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm25 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm27 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm30 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, 
zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm19 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm29 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm20 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm18 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm22 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm27 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm21 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm31 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm26 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, 
zmm30 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm23 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm19 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm20 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm21 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm16 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm24 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm28 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm31 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - 
vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm29 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm26 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm23 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm16 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm18 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm17 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm25 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm24 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm30 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm28 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, 
zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm29 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm18 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm19 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm22 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm27 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm17 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm31 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm25 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm30 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm19 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 
16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm26 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm20 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpxord zmm0, zmm0, zmm8 - vpxord zmm1, zmm1, zmm9 - vpxord zmm2, zmm2, zmm10 - vpxord zmm3, zmm3, zmm11 - vpxord zmm4, zmm4, zmm12 - vpxord zmm5, zmm5, zmm13 - vpxord zmm6, zmm6, zmm14 - vpxord zmm7, zmm7, zmm15 - movzx eax, byte ptr [rbp+0x38] - jne 9b - mov rbx, qword ptr [rbp+0x50] - vpunpckldq zmm16, zmm0, zmm1 - vpunpckhdq zmm17, zmm0, zmm1 - vpunpckldq zmm18, zmm2, zmm3 - vpunpckhdq zmm19, zmm2, zmm3 - vpunpckldq zmm20, zmm4, zmm5 - vpunpckhdq zmm21, zmm4, zmm5 - vpunpckldq zmm22, zmm6, zmm7 - vpunpckhdq zmm23, zmm6, zmm7 - vpunpcklqdq zmm0, zmm16, zmm18 - vpunpckhqdq zmm1, zmm16, zmm18 - vpunpcklqdq zmm2, zmm17, zmm19 - vpunpckhqdq zmm3, zmm17, zmm19 - vpunpcklqdq zmm4, zmm20, zmm22 - vpunpckhqdq zmm5, zmm20, zmm22 - vpunpcklqdq zmm6, zmm21, zmm23 - vpunpckhqdq zmm7, zmm21, zmm23 - vshufi32x4 zmm16, zmm0, zmm4, 0x88 - vshufi32x4 zmm17, zmm1, zmm5, 0x88 - vshufi32x4 zmm18, zmm2, zmm6, 0x88 - vshufi32x4 zmm19, zmm3, zmm7, 0x88 - vshufi32x4 zmm20, zmm0, zmm4, 0xDD - vshufi32x4 zmm21, zmm1, zmm5, 0xDD - vshufi32x4 zmm22, zmm2, zmm6, 0xDD - vshufi32x4 zmm23, zmm3, zmm7, 0xDD - vshufi32x4 zmm0, zmm16, zmm17, 0x88 - vshufi32x4 zmm1, zmm18, zmm19, 0x88 - vshufi32x4 zmm2, zmm20, zmm21, 0x88 - vshufi32x4 zmm3, zmm22, zmm23, 0x88 - vshufi32x4 zmm4, zmm16, zmm17, 0xDD - vshufi32x4 zmm5, zmm18, zmm19, 0xDD - vshufi32x4 zmm6, zmm20, zmm21, 0xDD - vshufi32x4 zmm7, zmm22, zmm23, 0xDD - vmovdqu32 zmmword ptr [rbx], zmm0 - vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 - vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 - vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 - vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 - vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 - vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 - vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 - vmovdqa32 zmm0, zmmword ptr [rsp] - vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] - vmovdqa32 zmm2, zmm0 - vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} - vpcmpltud k2, zmm2, zmm0 - vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} - vmovdqa32 zmmword ptr [rsp], zmm2 - vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 - add rdi, 128 - add rbx, 512 - mov qword ptr [rbp+0x50], rbx - sub rsi, 16 - cmp rsi, 16 - jnc 2b - test rsi, rsi - jnz 3f -4: - vzeroupper - mov rsp, rbp - pop rbp - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 6 -3: - test esi, 0x8 - je 3f - vpbroadcastd ymm0, dword ptr [rcx] - vpbroadcastd ymm1, dword ptr [rcx+0x4] - vpbroadcastd ymm2, dword ptr [rcx+0x8] - vpbroadcastd ymm3, 
dword ptr [rcx+0xC] - vpbroadcastd ymm4, dword ptr [rcx+0x10] - vpbroadcastd ymm5, dword ptr [rcx+0x14] - vpbroadcastd ymm6, dword ptr [rcx+0x18] - vpbroadcastd ymm7, dword ptr [rcx+0x1C] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x20] - mov r13, qword ptr [rdi+0x28] - mov r14, qword ptr [rdi+0x30] - mov r15, qword ptr [rdi+0x38] - movzx eax, byte ptr [rbp+0x38] - movzx ebx, byte ptr [rbp+0x40] - or eax, ebx - xor edx, edx -2: - movzx ebx, byte ptr [rbp+0x48] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x80] - cmove eax, ebx - mov dword ptr [rsp+0x88], eax - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x40] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x40] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm16, ymm12, ymm14, 136 - vshufps ymm17, ymm12, ymm14, 221 - vshufps ymm18, ymm13, ymm15, 136 - vshufps ymm19, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x30] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x30] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm20, ymm12, ymm14, 136 - vshufps ymm21, ymm12, ymm14, 221 - vshufps ymm22, ymm13, ymm15, 136 - vshufps ymm23, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x20] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x20] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm24, ymm12, ymm14, 136 - vshufps ymm25, ymm12, ymm14, 221 - vshufps ymm26, ymm13, ymm15, 136 - vshufps ymm27, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x10] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x10] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm28, ymm12, ymm14, 136 - vshufps ymm29, ymm12, ymm14, 221 - vshufps ymm30, ymm13, ymm15, 136 - vshufps ymm31, ymm13, ymm15, 221 - vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] - vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] - vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] - vpbroadcastd ymm11, 
dword ptr [BLAKE3_IV_3+rip] - vmovdqa ymm12, ymmword ptr [rsp] - vmovdqa ymm13, ymmword ptr [rsp+0x40] - vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] - vpbroadcastd ymm15, dword ptr [rsp+0x88] - vpaddd ymm0, ymm0, ymm16 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm20 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm17 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm21 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm24 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm28 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm25 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm29 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm18 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm23 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 
- vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm22 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm16 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm17 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm25 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm27 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm30 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm19 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm29 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm20 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm18 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, 
ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm22 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm27 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm21 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm31 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm26 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm30 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm23 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm19 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm20 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm21 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, 
ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm16 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm24 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm28 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm31 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm29 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm26 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm23 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm16 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord 
ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm18 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm17 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm25 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm24 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm30 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm28 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm29 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm18 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm19 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm22 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, 
ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm27 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm17 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm31 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm25 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm30 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm19 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm26 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm20 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpxor ymm0, ymm0, ymm8 - vpxor ymm1, ymm1, ymm9 - vpxor ymm2, ymm2, ymm10 - vpxor ymm3, ymm3, ymm11 - vpxor ymm4, ymm4, ymm12 - vpxor ymm5, ymm5, ymm13 - vpxor ymm6, ymm6, ymm14 - vpxor ymm7, ymm7, ymm15 - movzx eax, byte ptr [rbp+0x38] - jne 2b - mov rbx, qword ptr [rbp+0x50] - vunpcklps ymm8, ymm0, ymm1 - vunpcklps ymm9, ymm2, ymm3 - vunpckhps ymm10, ymm0, ymm1 - vunpcklps ymm11, ymm4, ymm5 - vunpcklps ymm0, ymm6, ymm7 - vshufps ymm12, ymm8, 
ymm9, 78 - vblendps ymm1, ymm8, ymm12, 0xCC - vshufps ymm8, ymm11, ymm0, 78 - vunpckhps ymm13, ymm2, ymm3 - vblendps ymm2, ymm11, ymm8, 0xCC - vblendps ymm3, ymm12, ymm9, 0xCC - vperm2f128 ymm12, ymm1, ymm2, 0x20 - vmovups ymmword ptr [rbx], ymm12 - vunpckhps ymm14, ymm4, ymm5 - vblendps ymm4, ymm8, ymm0, 0xCC - vunpckhps ymm15, ymm6, ymm7 - vperm2f128 ymm7, ymm3, ymm4, 0x20 - vmovups ymmword ptr [rbx+0x20], ymm7 - vshufps ymm5, ymm10, ymm13, 78 - vblendps ymm6, ymm5, ymm13, 0xCC - vshufps ymm13, ymm14, ymm15, 78 - vblendps ymm10, ymm10, ymm5, 0xCC - vblendps ymm14, ymm14, ymm13, 0xCC - vperm2f128 ymm8, ymm10, ymm14, 0x20 - vmovups ymmword ptr [rbx+0x40], ymm8 - vblendps ymm15, ymm13, ymm15, 0xCC - vperm2f128 ymm13, ymm6, ymm15, 0x20 - vmovups ymmword ptr [rbx+0x60], ymm13 - vperm2f128 ymm9, ymm1, ymm2, 0x31 - vperm2f128 ymm11, ymm3, ymm4, 0x31 - vmovups ymmword ptr [rbx+0x80], ymm9 - vperm2f128 ymm14, ymm10, ymm14, 0x31 - vperm2f128 ymm15, ymm6, ymm15, 0x31 - vmovups ymmword ptr [rbx+0xA0], ymm11 - vmovups ymmword ptr [rbx+0xC0], ymm14 - vmovups ymmword ptr [rbx+0xE0], ymm15 - vmovdqa ymm0, ymmword ptr [rsp] - vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] - vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] - vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] - vmovdqa ymmword ptr [rsp], ymm0 - vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 - add rbx, 256 - mov qword ptr [rbp+0x50], rbx - add rdi, 64 - sub rsi, 8 -3: - mov rbx, qword ptr [rbp+0x50] - mov r15, qword ptr [rsp+0x80] - movzx r13, byte ptr [rbp+0x38] - movzx r12, byte ptr [rbp+0x48] - test esi, 0x4 - je 3f - vbroadcasti32x4 zmm0, xmmword ptr [rcx] - vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] - vmovdqa xmm12, xmmword ptr [rsp] - vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] - vpunpckldq xmm14, xmm12, xmm13 - vpunpckhdq xmm15, xmm12, xmm13 - vpermq ymm14, ymm14, 0xDC - vpermq ymm15, ymm15, 0xDC - vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] - vinserti64x4 zmm13, zmm14, ymm15, 0x01 - mov eax, 17476 - kmovw k2, eax - vpblendmd zmm13 {k2}, zmm13, zmm12 - vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov eax, 43690 - kmovw k3, eax - mov eax, 34952 - kmovw k4, eax - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x88], eax - vmovdqa32 zmm2, zmm15 - vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] - vpblendmd zmm3 {k4}, zmm13, zmm8 - vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] - vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 - vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 - vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 - vmovups zmm9, zmmword ptr [r8+rdx-0x30] - vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 - vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 - vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 - vshufps zmm4, zmm8, zmm9, 136 - vshufps zmm5, zmm8, zmm9, 221 - vmovups zmm8, zmmword ptr [r8+rdx-0x20] - vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 - vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 - vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 - vmovups zmm9, zmmword ptr [r8+rdx-0x10] - vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 - vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 - vinserti32x4 zmm9, zmm9, xmmword ptr 
[r11+rdx-0x1*0x10], 0x03 - vshufps zmm6, zmm8, zmm9, 136 - vshufps zmm7, zmm8, zmm9, 221 - vpshufd zmm6, zmm6, 0x93 - vpshufd zmm7, zmm7, 0x93 - mov al, 7 -9: - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 16 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 12 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 8 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 7 - vpshufd zmm0, zmm0, 0x93 - vpshufd zmm3, zmm3, 0x4E - vpshufd zmm2, zmm2, 0x39 - vpaddd zmm0, zmm0, zmm6 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 16 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 12 - vpaddd zmm0, zmm0, zmm7 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 8 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 7 - vpshufd zmm0, zmm0, 0x39 - vpshufd zmm3, zmm3, 0x4E - vpshufd zmm2, zmm2, 0x93 - dec al - jz 9f - vshufps zmm8, zmm4, zmm5, 214 - vpshufd zmm9, zmm4, 0x0F - vpshufd zmm4, zmm8, 0x39 - vshufps zmm8, zmm6, zmm7, 250 - vpblendmd zmm9 {k3}, zmm9, zmm8 - vpunpcklqdq zmm8, zmm7, zmm5 - vpblendmd zmm8 {k4}, zmm8, zmm6 - vpshufd zmm8, zmm8, 0x78 - vpunpckhdq zmm5, zmm5, zmm7 - vpunpckldq zmm6, zmm6, zmm5 - vpshufd zmm7, zmm6, 0x1E - vmovdqa32 zmm5, zmm9 - vmovdqa32 zmm6, zmm8 - jmp 9b -9: - vpxord zmm0, zmm0, zmm2 - vpxord zmm1, zmm1, zmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 - vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 - vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 - vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 - vmovdqa xmm0, xmmword ptr [rsp] - vmovdqa xmm2, xmmword ptr [rsp+0x40] - vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] - vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] - vmovdqa xmmword ptr [rsp], xmm0 - vmovdqa xmmword ptr [rsp+0x40], xmm2 - add rbx, 128 - add rdi, 32 - sub rsi, 4 -3: - test esi, 0x2 - je 3f - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovd xmm13, dword ptr [rsp] - vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 - vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovd xmm14, dword ptr [rsp+0x4] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vinserti128 ymm13, ymm13, xmm14, 0x01 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x88], eax - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vpbroadcastd ymm8, dword ptr [rsp+0x88] - vpblendd ymm3, ymm13, ymm8, 0x88 - vmovups ymm8, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 - vshufps ymm4, ymm8, ymm9, 136 - vshufps ymm5, ymm8, ymm9, 221 - vmovups ymm8, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm8, ymm9, 136 - vshufps ymm7, ymm8, ymm9, 221 - 
vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 16 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 12 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 8 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 7 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x39 - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 16 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 12 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 8 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 7 - vpshufd ymm0, ymm0, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x93 - dec al - jz 9f - vshufps ymm8, ymm4, ymm5, 214 - vpshufd ymm9, ymm4, 0x0F - vpshufd ymm4, ymm8, 0x39 - vshufps ymm8, ymm6, ymm7, 250 - vpblendd ymm9, ymm9, ymm8, 0xAA - vpunpcklqdq ymm8, ymm7, ymm5 - vpblendd ymm8, ymm8, ymm6, 0x88 - vpshufd ymm8, ymm8, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymm5, ymm9 - vmovdqa ymm6, ymm8 - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovdqa xmm0, xmmword ptr [rsp] - vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] - vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] - vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] - vmovdqa xmmword ptr [rsp], xmm0 - vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 - add rbx, 64 - add rdi, 16 - sub rsi, 2 -3: - test esi, 0x1 - je 4b - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+0x10] - vmovd xmm14, dword ptr [rsp] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - vpinsrd xmm3, xmm14, eax, 3 - vmovdqa xmm2, xmm15 - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vmovups xmm9, xmmword ptr [r8+rdx-0x30] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vmovups xmm9, xmmword ptr [r8+rdx-0x10] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord 
xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - jmp 4b -.p2align 6 -_blake3_compress_in_place_avx512: -blake3_compress_in_place_avx512: - vmovdqu xmm0, xmmword ptr [rdi] - vmovdqu xmm1, xmmword ptr [rdi+0x10] - movzx eax, r8b - movzx edx, dl - shl rax, 32 - add rdx, rax - vmovq xmm3, rcx - vmovq xmm4, rdx - vpunpcklqdq xmm3, xmm3, xmm4 - vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovups xmm8, xmmword ptr [rsi] - vmovups xmm9, xmmword ptr [rsi+0x10] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [rsi+0x20] - vmovups xmm9, xmmword ptr [rsi+0x30] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - vmovdqu xmmword ptr [rdi], xmm0 - vmovdqu xmmword ptr [rdi+0x10], xmm1 - ret - -.p2align 6 -_blake3_compress_xof_avx512: -blake3_compress_xof_avx512: - vmovdqu xmm0, xmmword ptr [rdi] - vmovdqu xmm1, xmmword ptr [rdi+0x10] - movzx eax, r8b - movzx edx, dl - shl rax, 32 - add rdx, rax - vmovq xmm3, rcx - vmovq xmm4, rdx - vpunpcklqdq xmm3, xmm3, xmm4 - vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovups xmm8, xmmword ptr [rsi] - vmovups xmm9, xmmword ptr [rsi+0x10] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [rsi+0x20] - vmovups xmm9, xmmword ptr [rsi+0x30] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord 
xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - vpxor xmm2, xmm2, [rdi] - vpxor xmm3, xmm3, [rdi+0x10] - vmovdqu xmmword ptr [r9], xmm0 - vmovdqu xmmword ptr [r9+0x10], xmm1 - vmovdqu xmmword ptr [r9+0x20], xmm2 - vmovdqu xmmword ptr [r9+0x30], xmm3 - ret - -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif -.p2align 6 -INDEX0: - .long 0, 1, 2, 3, 16, 17, 18, 19 - .long 8, 9, 10, 11, 24, 25, 26, 27 -INDEX1: - .long 4, 5, 6, 7, 20, 21, 22, 23 - .long 12, 13, 14, 15, 28, 29, 30, 31 -ADD0: - .long 0, 1, 2, 3, 4, 5, 6, 7 - .long 8, 9, 10, 11, 12, 13, 14, 15 -ADD1: .long 1 - -ADD16: .long 16 -BLAKE3_BLOCK_LEN: - .long 64 -.p2align 6 -BLAKE3_IV: -BLAKE3_IV_0: - .long 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A - -#endif // __x86_64__ diff --git a/src/b3/blake3_dispatch.c b/src/b3/blake3_dispatch.c deleted file mode 100644 index 684772564..000000000 --- a/src/b3/blake3_dispatch.c +++ /dev/null @@ -1,245 +0,0 @@ -#include -#include -#include - -#include "blake3_impl.h" - -#if defined(IS_X86) -#if defined(_MSC_VER) -#include -#elif defined(__GNUC__) -#include -#else -#error "Unimplemented!" 
-#endif -#endif - -#if defined(IS_X86) -static uint64_t xgetbv() { -#if defined(_MSC_VER) - return _xgetbv(0); -#else - uint32_t eax = 0, edx = 0; - __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); - return ((uint64_t)edx << 32) | eax; -#endif -} - -static void cpuid(uint32_t out[4], uint32_t id) { -#if defined(_MSC_VER) - __cpuid((int *)out, id); -#elif defined(__i386__) || defined(_M_IX86) - __asm__ __volatile__("movl %%ebx, %1\n" - "cpuid\n" - "xchgl %1, %%ebx\n" - : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id)); -#else - __asm__ __volatile__("cpuid\n" - : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id)); -#endif -} - -static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { -#if defined(_MSC_VER) - __cpuidex((int *)out, id, sid); -#elif defined(__i386__) || defined(_M_IX86) - __asm__ __volatile__("movl %%ebx, %1\n" - "cpuid\n" - "xchgl %1, %%ebx\n" - : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id), "c"(sid)); -#else - __asm__ __volatile__("cpuid\n" - : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id), "c"(sid)); -#endif -} - -#endif - -enum cpu_feature { - SSE2 = 1 << 0, - SSSE3 = 1 << 1, - SSE41 = 1 << 2, - AVX = 1 << 3, - AVX2 = 1 << 4, - AVX512F = 1 << 5, - AVX512VL = 1 << 6, - /* ... */ - UNDEFINED = 1 << 30 -}; - -#if !defined(BLAKE3_TESTING) -static /* Allow the variable to be controlled manually for testing */ -#endif - enum cpu_feature g_cpu_features = UNDEFINED; - -#if !defined(BLAKE3_TESTING) -static -#endif - enum cpu_feature - get_cpu_features() { - - if (g_cpu_features != UNDEFINED) { - return g_cpu_features; - } else { -#if defined(IS_X86) - uint32_t regs[4] = {0}; - uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; - (void)edx; - enum cpu_feature features = 0; - cpuid(regs, 0); - const int max_id = *eax; - cpuid(regs, 1); -#if defined(__amd64__) || defined(_M_X64) - features |= SSE2; -#else - if (*edx & (1UL << 26)) - features |= SSE2; -#endif - if (*ecx & (1UL << 0)) - features |= SSSE3; - if (*ecx & (1UL << 19)) - features |= SSE41; - - if (*ecx & (1UL << 27)) { // OSXSAVE - const uint64_t mask = xgetbv(); - if ((mask & 6) == 6) { // SSE and AVX states - if (*ecx & (1UL << 28)) - features |= AVX; - if (max_id >= 7) { - cpuidex(regs, 7, 0); - if (*ebx & (1UL << 5)) - features |= AVX2; - if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm - if (*ebx & (1UL << 31)) - features |= AVX512VL; - if (*ebx & (1UL << 16)) - features |= AVX512F; - } - } - } - } - g_cpu_features = features; - return features; -#else - /* How to detect NEON? 
*/ - return 0; -#endif - } -} - -void blake3_compress_in_place(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); -#if !defined(BLAKE3_NO_AVX512) - if (features & AVX512VL) { - blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); - return; - } -#endif -#endif - blake3_compress_in_place_portable(cv, block, block_len, counter, flags); -} - -void blake3_compress_xof(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags, - uint8_t out[64]) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); -#if !defined(BLAKE3_NO_AVX512) - if (features & AVX512VL) { - blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); - return; - } -#endif -#endif - blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); -} - -void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); -#if !defined(BLAKE3_NO_AVX512) - if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { - blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); - return; - } -#endif -#if !defined(BLAKE3_NO_AVX2) - if (features & AVX2) { - blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); - return; - } -#endif -#endif - -#if defined(BLAKE3_USE_NEON) - blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); - return; -#endif - - blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); -} - -// The dynamically detected SIMD degree of the current platform. 
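// (Illustrative sketch only: callers batch work for blake3_hash_many() using
// this value. A hypothetical caller hashing n single-block chunks with
// incrementing counters; `inputs`, `key_words`, `out`, and `n` are assumed to
// be set up by the caller:
//
//   size_t deg = blake3_simd_degree();
//   for (size_t i = 0; i < n; i += deg) {
//     size_t batch = (n - i < deg) ? (n - i) : deg;
//     blake3_hash_many(&inputs[i], batch, /*blocks=*/1, key_words,
//                      /*counter=*/i, /*increment_counter=*/true,
//                      /*flags=*/0, CHUNK_START, CHUNK_END,
//                      &out[i * BLAKE3_OUT_LEN]);
//   }
// )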
-size_t blake3_simd_degree(void) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); -#if !defined(BLAKE3_NO_AVX512) - if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { - return 16; - } -#endif -#if !defined(BLAKE3_NO_AVX2) - if (features & AVX2) { - return 8; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - return 4; - } -#endif -#endif -#if defined(BLAKE3_USE_NEON) - return 4; -#endif - return 1; -} diff --git a/src/b3/blake3_impl.h b/src/b3/blake3_impl.h deleted file mode 100644 index c384671f0..000000000 --- a/src/b3/blake3_impl.h +++ /dev/null @@ -1,235 +0,0 @@ -#ifndef BLAKE3_IMPL_H -#define BLAKE3_IMPL_H - -#include -#include -#include -#include -#include - -#include "blake3.h" - -// internal flags -enum blake3_flags { - CHUNK_START = 1 << 0, - CHUNK_END = 1 << 1, - PARENT = 1 << 2, - ROOT = 1 << 3, - KEYED_HASH = 1 << 4, - DERIVE_KEY_CONTEXT = 1 << 5, - DERIVE_KEY_MATERIAL = 1 << 6, -}; - -// This C implementation tries to support recent versions of GCC, Clang, and -// MSVC. -#if defined(_MSC_VER) -#define INLINE static __forceinline -#else -#define INLINE static inline __attribute__((always_inline)) -#endif - -#if defined(__x86_64__) || defined(_M_X64) -#define IS_X86 -#define IS_X86_64 -#endif - -#if defined(__i386__) || defined(_M_IX86) -#define IS_X86 -#define IS_X86_32 -#endif - -#if defined(IS_X86) -#if defined(_MSC_VER) -#include -#endif -#include -#endif - -#if defined(IS_X86) -#define MAX_SIMD_DEGREE 16 -#elif defined(BLAKE3_USE_NEON) -#define MAX_SIMD_DEGREE 4 -#else -#define MAX_SIMD_DEGREE 1 -#endif - -// There are some places where we want a static size that's equal to the -// MAX_SIMD_DEGREE, but also at least 2. -#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2) - -static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, - 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, - 0x1F83D9ABUL, 0x5BE0CD19UL}; - -static const uint8_t MSG_SCHEDULE[7][16] = { - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, - {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, - {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, - {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, - {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, - {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, -}; - -/* Find index of the highest set bit */ -/* x is assumed to be nonzero. */ -static unsigned int highest_one(uint64_t x) { -#if defined(__GNUC__) || defined(__clang__) - return 63 ^ __builtin_clzll(x); -#elif defined(_MSC_VER) && defined(IS_X86_64) - unsigned long index; - _BitScanReverse64(&index, x); - return index; -#elif defined(_MSC_VER) && defined(IS_X86_32) - if(x >> 32) { - unsigned long index; - _BitScanReverse(&index, x >> 32); - return 32 + index; - } else { - unsigned long index; - _BitScanReverse(&index, x); - return index; - } -#else - unsigned int c = 0; - if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } - if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } - if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } - if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } - if(x & 0x000000000000000cULL) { x >>= 2; c += 2; } - if(x & 0x0000000000000002ULL) { c += 1; } - return c; -#endif -} - -// Count the number of 1 bits. 
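// (Hypothetical spot-checks for the bit helpers here; every branch of
// highest_one above, and of popcnt below, must agree on these values:
//   highest_one(1)                     == 0
//   highest_one(0x00000000FFFFFFFFULL) == 31
//   highest_one(0x8000000000000000ULL) == 63
//   popcnt(0)                          == 0
//   popcnt(0xF0F0F0F0F0F0F0F0ULL)      == 32.)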
-INLINE unsigned int popcnt(uint64_t x) { -#if defined(__GNUC__) || defined(__clang__) - return __builtin_popcountll(x); -#else - unsigned int count = 0; - while (x != 0) { - count += 1; - x &= x - 1; - } - return count; -#endif -} - -// Largest power of two less than or equal to x. As a special case, returns 1 -// when x is 0. -INLINE uint64_t round_down_to_power_of_2(uint64_t x) { - return 1ULL << highest_one(x | 1); -} - -INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } - -INLINE uint32_t counter_high(uint64_t counter) { - return (uint32_t)(counter >> 32); -} - -INLINE uint32_t load32(const void *src) { - const uint8_t *p = (const uint8_t *)src; - return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | - ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); -} - -INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], - uint32_t key_words[8]) { - key_words[0] = load32(&key[0 * 4]); - key_words[1] = load32(&key[1 * 4]); - key_words[2] = load32(&key[2 * 4]); - key_words[3] = load32(&key[3 * 4]); - key_words[4] = load32(&key[4 * 4]); - key_words[5] = load32(&key[5 * 4]); - key_words[6] = load32(&key[6 * 4]); - key_words[7] = load32(&key[7 * 4]); -} - -void blake3_compress_in_place(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); - -void blake3_compress_xof(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags, - uint8_t out[64]); - -void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -size_t blake3_simd_degree(void); - - -// Declarations for implementation-specific functions. 
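// (Worked examples for the helpers above, illustrative only:
//   round_down_to_power_of_2(0) == 1    -- the x|1 keeps highest_one defined
//   round_down_to_power_of_2(9) == 8
//   counter_low(0x0000000100000002ULL)  == 2
//   counter_high(0x0000000100000002ULL) == 1
//   load32 reads little-endian regardless of host byte order, so the bytes
//   01 00 00 00 load as the value 1.)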
-void blake3_compress_in_place_portable(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); - -void blake3_compress_xof_portable(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); - -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); - -#if defined(IS_X86) -#if !defined(BLAKE3_NO_SSE41) -void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); -void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif -#if !defined(BLAKE3_NO_AVX2) -void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif -#if !defined(BLAKE3_NO_AVX512) -void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); - -void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); - -void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif -#endif - -#if defined(BLAKE3_USE_NEON) -void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif - - -#endif /* BLAKE3_IMPL_H */ diff --git a/src/b3/blake3_portable.c b/src/b3/blake3_portable.c deleted file mode 100644 index 9ee2f4a42..000000000 --- a/src/b3/blake3_portable.c +++ /dev/null @@ -1,168 +0,0 @@ -#include "blake3_impl.h" -#include - -INLINE void store32(void *dst, uint32_t w) { - uint8_t *p = (uint8_t *)dst; - p[0] = (uint8_t)(w >> 0); - p[1] = (uint8_t)(w >> 8); - p[2] = (uint8_t)(w >> 16); - p[3] = (uint8_t)(w >> 24); -} - -INLINE uint32_t rotr32(uint32_t w, uint32_t c) { - return (w >> c) | (w << (32 - c)); -} - -INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, - uint32_t x, uint32_t y) { - state[a] = state[a] + state[b] + x; - state[d] = rotr32(state[d] ^ state[a], 16); - state[c] = state[c] + state[d]; - state[b] = rotr32(state[b] ^ state[c], 12); - state[a] = state[a] + state[b] + y; - state[d] = rotr32(state[d] ^ state[a], 8); - state[c] = state[c] + state[d]; - state[b] = rotr32(state[b] ^ state[c], 7); -} - -INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) { - // Select the message schedule based on the round. - const uint8_t *schedule = MSG_SCHEDULE[round]; - - // Mix the columns. 
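// (The four g() calls below mix the columns of the state viewed as a 4x4
// matrix of words; the following four, labelled "Mix the rows.", in fact mix
// its diagonals: 0-5-10-15, 1-6-11-12, 2-7-8-13, 3-4-9-14.)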
- g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); - g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); - g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); - g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); - - // Mix the rows. - g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); - g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); - g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); - g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); -} - -INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - uint32_t block_words[16]; - block_words[0] = load32(block + 4 * 0); - block_words[1] = load32(block + 4 * 1); - block_words[2] = load32(block + 4 * 2); - block_words[3] = load32(block + 4 * 3); - block_words[4] = load32(block + 4 * 4); - block_words[5] = load32(block + 4 * 5); - block_words[6] = load32(block + 4 * 6); - block_words[7] = load32(block + 4 * 7); - block_words[8] = load32(block + 4 * 8); - block_words[9] = load32(block + 4 * 9); - block_words[10] = load32(block + 4 * 10); - block_words[11] = load32(block + 4 * 11); - block_words[12] = load32(block + 4 * 12); - block_words[13] = load32(block + 4 * 13); - block_words[14] = load32(block + 4 * 14); - block_words[15] = load32(block + 4 * 15); - - state[0] = cv[0]; - state[1] = cv[1]; - state[2] = cv[2]; - state[3] = cv[3]; - state[4] = cv[4]; - state[5] = cv[5]; - state[6] = cv[6]; - state[7] = cv[7]; - state[8] = IV[0]; - state[9] = IV[1]; - state[10] = IV[2]; - state[11] = IV[3]; - state[12] = counter_low(counter); - state[13] = counter_high(counter); - state[14] = (uint32_t)block_len; - state[15] = (uint32_t)flags; - - round_fn(state, &block_words[0], 0); - round_fn(state, &block_words[0], 1); - round_fn(state, &block_words[0], 2); - round_fn(state, &block_words[0], 3); - round_fn(state, &block_words[0], 4); - round_fn(state, &block_words[0], 5); - round_fn(state, &block_words[0], 6); -} - -void blake3_compress_in_place_portable(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - uint32_t state[16]; - compress_pre(state, cv, block, block_len, counter, flags); - cv[0] = state[0] ^ state[8]; - cv[1] = state[1] ^ state[9]; - cv[2] = state[2] ^ state[10]; - cv[3] = state[3] ^ state[11]; - cv[4] = state[4] ^ state[12]; - cv[5] = state[5] ^ state[13]; - cv[6] = state[6] ^ state[14]; - cv[7] = state[7] ^ state[15]; -} - -void blake3_compress_xof_portable(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - uint32_t state[16]; - compress_pre(state, cv, block, block_len, counter, flags); - - store32(&out[0 * 4], state[0] ^ state[8]); - store32(&out[1 * 4], state[1] ^ state[9]); - store32(&out[2 * 4], state[2] ^ state[10]); - store32(&out[3 * 4], state[3] ^ state[11]); - store32(&out[4 * 4], state[4] ^ state[12]); - store32(&out[5 * 4], state[5] ^ state[13]); - store32(&out[6 * 4], state[6] ^ state[14]); - store32(&out[7 * 4], state[7] ^ state[15]); - store32(&out[8 * 4], state[8] ^ cv[0]); - store32(&out[9 * 4], state[9] ^ cv[1]); - store32(&out[10 * 4], state[10] ^ cv[2]); - store32(&out[11 * 4], state[11] ^ cv[3]); - store32(&out[12 * 4], state[12] ^ cv[4]); - store32(&out[13 * 4], state[13] ^ cv[5]); - store32(&out[14 * 4], state[14] ^ cv[6]); - store32(&out[15 * 4], state[15] ^ cv[7]); -} - -INLINE void 
hash_one_portable(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, 32); -} - -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs > 0) { - hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/src/b3/blake3_sse41.c b/src/b3/blake3_sse41.c deleted file mode 100644 index b31122533..000000000 --- a/src/b3/blake3_sse41.c +++ /dev/null @@ -1,559 +0,0 @@ -#include "blake3_impl.h" - -#include - -#define DEGREE 4 - -#define _mm_shuffle_ps2(a, b, c) \ - (_mm_castps_si128( \ - _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) - -INLINE __m128i loadu(const uint8_t src[16]) { - return _mm_loadu_si128((const __m128i *)src); -} - -INLINE void storeu(__m128i src, uint8_t dest[16]) { - _mm_storeu_si128((__m128i *)dest, src); -} - -INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } - -// Note that clang-format doesn't like the name "xor" for some reason. -INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } - -INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } - -INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); -} - -INLINE __m128i rot16(__m128i x) { - return _mm_shuffle_epi8( - x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); -} - -INLINE __m128i rot12(__m128i x) { - return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); -} - -INLINE __m128i rot8(__m128i x) { - return _mm_shuffle_epi8( - x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); -} - -INLINE __m128i rot7(__m128i x) { - return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); -} - -INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = addv(addv(*row0, m), *row1); - *row3 = xorv(*row3, *row0); - *row3 = rot16(*row3); - *row2 = addv(*row2, *row3); - *row1 = xorv(*row1, *row2); - *row1 = rot12(*row1); -} - -INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = addv(addv(*row0, m), *row1); - *row3 = xorv(*row3, *row0); - *row3 = rot8(*row3); - *row2 = addv(*row2, *row3); - *row1 = xorv(*row1, *row2); - *row1 = rot7(*row1); -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. 
See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); -} - -INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); -} - -INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - rows[0] = loadu((uint8_t *)&cv[0]); - rows[1] = loadu((uint8_t *)&cv[4]); - rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); - rows[3] = set4(counter_low(counter), counter_high(counter), - (uint32_t)block_len, (uint32_t)flags); - - __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); - __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); - __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); - __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); - - __m128i t0, t1, t2, t3, tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. 
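/* For reference: the fixed permutation in question is
 *
 *     P = { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 },
 *
 * with slot i of the next round taking the word from slot P[i] of the
 * current one. Because the same P is applied before every round after the
 * first, rounds 2 through 7 below are identical shuffle sequences operating
 * on whatever m0-m3 the previous round left behind. */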
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, 
_MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); -} - -void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); - storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); -} - -void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu(xorv(rows[0], rows[2]), &out[0]); - storeu(xorv(rows[1], rows[3]), &out[16]); - storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); - storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); -} - -INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = addv(v[0], v[4]); - v[1] = 
addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -INLINE void transpose_vecs(__m128i vecs[DEGREE]) { - // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes.
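/* Worked example: with vecs[] holding rows a, b, c, d of 32-bit words
 * (a = a0 a1 a2 a3, and so on), the unpacks above produce
 *
 *     ab_01 = a0 b0 a1 b1      ab_23 = a2 b2 a3 b3
 *     cd_01 = c0 d0 c1 d1      cd_23 = c2 d2 c3 d3
 *
 * and the 64-bit unpacks below pair those up into the transposed rows,
 * e.g. abcd_0 = a0 b0 c0 d0. */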
- __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -INLINE void transpose_msg_vecs(const uint8_t *const *inputs, - size_t block_offset, __m128i out[16]) { - out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); - out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); - out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); - out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); - out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); - out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); - out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); - out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); - out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); - out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); - out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); - out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); - out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); - out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); - out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); - out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); - for (size_t i = 0; i < 4; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs(&out[0]); - transpose_vecs(&out[4]); - transpose_vecs(&out[8]); - transpose_vecs(&out[12]); -} - -INLINE void load_counters(uint64_t counter, bool increment_counter, - __m128i *out_lo, __m128i *out_hi) { - const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); - const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); - const __m128i add1 = _mm_and_si128(mask, add0); - __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); - __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), - _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); - __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); - *out_lo = l; - *out_hi = h; -} - -void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m128i h_vecs[8] = { - set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), - set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), - }; - __m128i counter_low_vec, counter_high_vec; - load_counters(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); - __m128i block_flags_vec = set1(block_flags); - __m128i msg_vecs[16]; - transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m128i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn(v, msg_vecs, 0); - round_fn(v, msg_vecs, 1); - round_fn(v, msg_vecs, 2); - round_fn(v, msg_vecs, 3); - round_fn(v, msg_vecs, 4); - round_fn(v, msg_vecs, 5); - round_fn(v, msg_vecs, 6); - h_vecs[0] = xorv(v[0], v[8]); - 
h_vecs[1] = xorv(v[1], v[9]); - h_vecs[2] = xorv(v[2], v[10]); - h_vecs[3] = xorv(v[3], v[11]); - h_vecs[4] = xorv(v[4], v[12]); - h_vecs[5] = xorv(v[5], v[13]); - h_vecs[6] = xorv(v[6], v[14]); - h_vecs[7] = xorv(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(&h_vecs[0]); - transpose_vecs(&h_vecs[4]); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); - storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); - storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); - storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); - storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); - storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); - storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); - storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); -} - -INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, BLAKE3_OUT_LEN); -} - -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs >= DEGREE) { - blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += DEGREE; - } - inputs += DEGREE; - num_inputs -= DEGREE; - out = &out[DEGREE * BLAKE3_OUT_LEN]; - } - while (num_inputs > 0) { - hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/src/b3/blake3_sse41_x86-64_unix.S b/src/b3/blake3_sse41_x86-64_unix.S deleted file mode 100644 index 024a8290f..000000000 --- a/src/b3/blake3_sse41_x86-64_unix.S +++ /dev/null @@ -1,2014 +0,0 @@ -#ifdef __x86_64__ -.intel_syntax noprefix -.global blake3_hash_many_sse41 -.global _blake3_hash_many_sse41 -.global blake3_compress_in_place_sse41 -.global _blake3_compress_in_place_sse41 -.global blake3_compress_xof_sse41 -.global _blake3_compress_xof_sse41 -#ifdef __APPLE__ -.text -#else -.section .text -#endif - .p2align 6 -_blake3_hash_many_sse41: -blake3_hash_many_sse41: - push r15 - push r14 - push r13 - push r12 - push rbx - push rbp - mov rbp, rsp - sub rsp, 360 - and rsp, 0xFFFFFFFFFFFFFFC0 - neg r9d - movd xmm0, r9d - pshufd xmm0, xmm0, 0x00 - movdqa xmmword ptr [rsp+0x130], xmm0 - movdqa xmm1, xmm0 - pand xmm1, xmmword ptr [ADD0+rip] - pand xmm0, xmmword ptr [ADD1+rip] - movdqa xmmword ptr [rsp+0x150], xmm0 - movd xmm0, r8d - pshufd xmm0, xmm0, 0x00 - paddd xmm0, xmm1 - movdqa xmmword ptr [rsp+0x110], xmm0 - pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] - pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] - pcmpgtd xmm1, xmm0 - shr r8, 32 - movd xmm2, r8d - pshufd xmm2, xmm2, 0x00 - psubd xmm2, xmm1 - movdqa xmmword ptr [rsp+0x120], xmm2 - mov rbx, qword ptr [rbp+0x50] - mov r15, rdx - shl r15, 6 - movzx r13d, byte ptr [rbp+0x38] - movzx r12d, byte ptr 
[rbp+0x48] - cmp rsi, 4 - jc 3f -2: - movdqu xmm3, xmmword ptr [rcx] - pshufd xmm0, xmm3, 0x00 - pshufd xmm1, xmm3, 0x55 - pshufd xmm2, xmm3, 0xAA - pshufd xmm3, xmm3, 0xFF - movdqu xmm7, xmmword ptr [rcx+0x10] - pshufd xmm4, xmm7, 0x00 - pshufd xmm5, xmm7, 0x55 - pshufd xmm6, xmm7, 0xAA - pshufd xmm7, xmm7, 0xFF - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -9: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movdqu xmm8, xmmword ptr [r8+rdx-0x40] - movdqu xmm9, xmmword ptr [r9+rdx-0x40] - movdqu xmm10, xmmword ptr [r10+rdx-0x40] - movdqu xmm11, xmmword ptr [r11+rdx-0x40] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp], xmm8 - movdqa xmmword ptr [rsp+0x10], xmm9 - movdqa xmmword ptr [rsp+0x20], xmm12 - movdqa xmmword ptr [rsp+0x30], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x30] - movdqu xmm9, xmmword ptr [r9+rdx-0x30] - movdqu xmm10, xmmword ptr [r10+rdx-0x30] - movdqu xmm11, xmmword ptr [r11+rdx-0x30] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0x40], xmm8 - movdqa xmmword ptr [rsp+0x50], xmm9 - movdqa xmmword ptr [rsp+0x60], xmm12 - movdqa xmmword ptr [rsp+0x70], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x20] - movdqu xmm9, xmmword ptr [r9+rdx-0x20] - movdqu xmm10, xmmword ptr [r10+rdx-0x20] - movdqu xmm11, xmmword ptr [r11+rdx-0x20] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0x80], xmm8 - movdqa xmmword ptr [rsp+0x90], xmm9 - movdqa xmmword ptr [rsp+0xA0], xmm12 - movdqa xmmword ptr [rsp+0xB0], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x10] - movdqu xmm9, xmmword ptr [r9+rdx-0x10] - movdqu xmm10, xmmword ptr [r10+rdx-0x10] - movdqu xmm11, xmmword ptr [r11+rdx-0x10] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0xC0], xmm8 - movdqa xmmword ptr [rsp+0xD0], xmm9 - movdqa xmmword ptr [rsp+0xE0], xmm12 - movdqa xmmword ptr [rsp+0xF0], xmm13 - movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] - movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] - movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] - movdqa xmm12, xmmword ptr [rsp+0x110] - movdqa xmm13, xmmword ptr [rsp+0x120] - movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] - movd xmm15, eax - pshufd xmm15, xmm15, 0x00 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x40] - paddd xmm3, xmmword 
ptr [rsp+0x60] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x10] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x50] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x80] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0xC0] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x90] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0xD0] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, 
xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x20] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x70] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x60] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x10] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x90] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xB0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0xE0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa 
xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x30] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0xD0] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x40] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x20] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x60] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0xB0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x50] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0xF0] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - 
movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xA0] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0xE0] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x70] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0x30] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x40] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0x50] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x80] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor 
xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xC0] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0xF0] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xD0] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0xA0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x70] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x20] - paddd xmm1, xmmword ptr [rsp+0x30] 
- paddd xmm2, xmmword ptr [rsp+0x10] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x90] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0x80] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xE0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0xC0] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xD0] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0x20] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - 
psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x30] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0x60] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xB0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0x10] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xF0] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0x90] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xE0] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x30] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa 
xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xA0] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x40] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - pxor xmm0, xmm8 - pxor xmm1, xmm9 - pxor xmm2, xmm10 - pxor xmm3, xmm11 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - pxor xmm4, xmm12 - pxor xmm5, xmm13 - pxor xmm6, xmm14 - pxor xmm7, xmm15 - mov eax, r13d - jne 9b - movdqa xmm9, xmm0 - punpckldq xmm0, xmm1 - punpckhdq xmm9, xmm1 - movdqa xmm11, xmm2 - punpckldq xmm2, xmm3 - punpckhdq xmm11, xmm3 - movdqa xmm1, xmm0 - punpcklqdq xmm0, xmm2 - punpckhqdq xmm1, xmm2 - movdqa xmm3, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm3, xmm11 - movdqu xmmword ptr [rbx], xmm0 - movdqu xmmword ptr [rbx+0x20], xmm1 - movdqu xmmword ptr [rbx+0x40], xmm9 - movdqu xmmword ptr [rbx+0x60], xmm3 - movdqa xmm9, xmm4 - punpckldq xmm4, xmm5 - punpckhdq xmm9, xmm5 - movdqa xmm11, xmm6 - punpckldq xmm6, xmm7 - punpckhdq xmm11, xmm7 - movdqa xmm5, xmm4 - punpcklqdq xmm4, xmm6 - punpckhqdq xmm5, xmm6 - movdqa xmm7, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm7, xmm11 - movdqu xmmword ptr [rbx+0x10], xmm4 - movdqu xmmword ptr [rbx+0x30], xmm5 - movdqu xmmword ptr [rbx+0x50], xmm9 - movdqu xmmword ptr [rbx+0x70], xmm7 - movdqa xmm1, xmmword ptr [rsp+0x110] - movdqa xmm0, xmm1 - paddd xmm1, xmmword ptr [rsp+0x150] - movdqa xmmword ptr [rsp+0x110], xmm1 - pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] - pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] - pcmpgtd xmm0, xmm1 - movdqa xmm1, xmmword ptr [rsp+0x120] - psubd xmm1, xmm0 - movdqa xmmword ptr [rsp+0x120], xmm1 - add rbx, 128 - add rdi, 32 - sub rsi, 4 - cmp rsi, 4 - jnc 2b - test rsi, rsi - jnz 3f -4: - mov rsp, rbp - pop rbp - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 5 -3: - test esi, 0x2 - je 3f - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movaps xmm8, xmm0 - movaps xmm9, xmm1 - movd xmm13, dword ptr [rsp+0x110] - pinsrd xmm13, dword ptr [rsp+0x120], 1 - pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmmword ptr [rsp], xmm13 - movd xmm14, dword ptr [rsp+0x114] - pinsrd xmm14, dword ptr [rsp+0x124], 1 - pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmmword ptr [rsp+0x10], xmm14 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movaps xmm10, xmm2 - movups xmm4, xmmword ptr [r8+rdx-0x40] - movups xmm5, 
xmmword ptr [r8+rdx-0x30] - movaps xmm3, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm3, xmm5, 221 - movaps xmm5, xmm3 - movups xmm6, xmmword ptr [r8+rdx-0x20] - movups xmm7, xmmword ptr [r8+rdx-0x10] - movaps xmm3, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm3, xmm7, 221 - pshufd xmm7, xmm3, 0x93 - movups xmm12, xmmword ptr [r9+rdx-0x40] - movups xmm13, xmmword ptr [r9+rdx-0x30] - movaps xmm11, xmm12 - shufps xmm12, xmm13, 136 - shufps xmm11, xmm13, 221 - movaps xmm13, xmm11 - movups xmm14, xmmword ptr [r9+rdx-0x20] - movups xmm15, xmmword ptr [r9+rdx-0x10] - movaps xmm11, xmm14 - shufps xmm14, xmm15, 136 - pshufd xmm14, xmm14, 0x93 - shufps xmm11, xmm15, 221 - pshufd xmm15, xmm11, 0x93 - movaps xmm3, xmmword ptr [rsp] - movaps xmm11, xmmword ptr [rsp+0x10] - pinsrd xmm3, eax, 3 - pinsrd xmm11, eax, 3 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm8, xmm12 - movaps xmmword ptr [rsp+0x20], xmm4 - movaps xmmword ptr [rsp+0x30], xmm12 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movaps xmm12, xmmword ptr [ROT16+rip] - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm5 - paddd xmm8, xmm13 - movaps xmmword ptr [rsp+0x40], xmm5 - movaps xmmword ptr [rsp+0x50], xmm13 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movaps xmm13, xmmword ptr [ROT8+rip] - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 0x93 - pshufd xmm8, xmm8, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm11, xmm11, 0x4E - pshufd xmm2, xmm2, 0x39 - pshufd xmm10, xmm10, 0x39 - paddd xmm0, xmm6 - paddd xmm8, xmm14 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm7 - paddd xmm8, xmm15 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 0x39 - pshufd xmm8, xmm8, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm11, xmm11, 0x4E - pshufd xmm2, xmm2, 0x93 - pshufd xmm10, xmm10, 0x93 - dec al - je 9f - movdqa xmm12, xmmword ptr [rsp+0x20] - movdqa xmm5, xmmword ptr [rsp+0x40] - pshufd xmm13, xmm12, 0x0F - shufps xmm12, xmm5, 214 - pshufd xmm4, xmm12, 0x39 - movdqa xmm12, xmm6 - shufps xmm12, xmm7, 250 - pblendw xmm13, xmm12, 0xCC - movdqa xmm12, xmm7 - punpcklqdq xmm12, xmm5 - pblendw xmm12, xmm6, 0xC0 - pshufd xmm12, xmm12, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmmword ptr [rsp+0x20], xmm13 - movdqa xmmword ptr [rsp+0x40], xmm12 - movdqa xmm5, xmmword ptr [rsp+0x30] - movdqa xmm13, xmmword ptr [rsp+0x50] - pshufd xmm6, xmm5, 0x0F - shufps xmm5, xmm13, 214 - pshufd xmm12, 
xmm5, 0x39 - movdqa xmm5, xmm14 - shufps xmm5, xmm15, 250 - pblendw xmm6, xmm5, 0xCC - movdqa xmm5, xmm15 - punpcklqdq xmm5, xmm13 - pblendw xmm5, xmm14, 0xC0 - pshufd xmm5, xmm5, 0x78 - punpckhdq xmm13, xmm15 - punpckldq xmm14, xmm13 - pshufd xmm15, xmm14, 0x1E - movdqa xmm13, xmm6 - movdqa xmm14, xmm5 - movdqa xmm5, xmmword ptr [rsp+0x20] - movdqa xmm6, xmmword ptr [rsp+0x40] - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm8, xmm10 - pxor xmm9, xmm11 - mov eax, r13d - cmp rdx, r15 - jne 2b - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+0x10], xmm1 - movups xmmword ptr [rbx+0x20], xmm8 - movups xmmword ptr [rbx+0x30], xmm9 - movdqa xmm0, xmmword ptr [rsp+0x130] - movdqa xmm1, xmmword ptr [rsp+0x110] - movdqa xmm2, xmmword ptr [rsp+0x120] - movdqu xmm3, xmmword ptr [rsp+0x118] - movdqu xmm4, xmmword ptr [rsp+0x128] - blendvps xmm1, xmm3, xmm0 - blendvps xmm2, xmm4, xmm0 - movdqa xmmword ptr [rsp+0x110], xmm1 - movdqa xmmword ptr [rsp+0x120], xmm2 - add rdi, 16 - add rbx, 64 - sub rsi, 2 -3: - test esi, 0x1 - je 4b - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movd xmm13, dword ptr [rsp+0x110] - pinsrd xmm13, dword ptr [rsp+0x120], 1 - pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movaps xmm3, xmm13 - pinsrd xmm3, eax, 3 - movups xmm4, xmmword ptr [r8+rdx-0x40] - movups xmm5, xmmword ptr [r8+rdx-0x30] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [r8+rdx-0x20] - movups xmm7, xmmword ptr [r8+rdx-0x10] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+0x10], xmm1 - jmp 4b - -.p2align 6 -blake3_compress_in_place_sse41: -_blake3_compress_in_place_sse41: - movups xmm0, 
xmmword ptr [rdi] - movups xmm1, xmmword ptr [rdi+0x10] - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - shl r8, 32 - add rdx, r8 - movq xmm3, rcx - movq xmm4, rdx - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rsi] - movups xmm5, xmmword ptr [rsi+0x10] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rsi+0x20] - movups xmm7, xmmword ptr [rsi+0x30] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - movups xmmword ptr [rdi], xmm0 - movups xmmword ptr [rdi+0x10], xmm1 - ret - -.p2align 6 -blake3_compress_xof_sse41: -_blake3_compress_xof_sse41: - movups xmm0, xmmword ptr [rdi] - movups xmm1, xmmword ptr [rdi+0x10] - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movzx eax, r8b - movzx edx, dl - shl rax, 32 - add rdx, rax - movq xmm3, rcx - movq xmm4, rdx - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rsi] - movups xmm5, xmmword ptr [rsi+0x10] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rsi+0x20] - movups xmm7, xmmword ptr [rsi+0x30] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd 
xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - movdqu xmm4, xmmword ptr [rdi] - movdqu xmm5, xmmword ptr [rdi+0x10] - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm2, xmm4 - pxor xmm3, xmm5 - movups xmmword ptr [r9], xmm0 - movups xmmword ptr [r9+0x10], xmm1 - movups xmmword ptr [r9+0x20], xmm2 - movups xmmword ptr [r9+0x30], xmm3 - ret - - -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif -.p2align 6 -BLAKE3_IV: - .long 0x6A09E667, 0xBB67AE85 - .long 0x3C6EF372, 0xA54FF53A -ROT16: - .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 -ROT8: - .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 -ADD0: - .long 0, 1, 2, 3 -ADD1: - .long 4, 4, 4, 4 -BLAKE3_IV_0: - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A -BLAKE3_BLOCK_LEN: - .long 64, 64, 64, 64 -CMP_MSB_MASK: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 - -#endif // __x86_64__ diff --git a/src/calculate_bucket.hpp b/src/calculate_bucket.hpp index 8d1cbcdac..11d736928 100644 --- a/src/calculate_bucket.hpp +++ b/src/calculate_bucket.hpp @@ -25,7 +25,7 @@ #include #include -#include "b3/blake3.h" +#include "blake3.h" #include "bits.hpp" #include "chacha8.h" #include "pos_constants.hpp" From 36bfa60c1e455dd1050e9b56931e03110b9b29d1 Mon Sep 17 00:00:00 2001 From: Chris Marslender Date: Fri, 13 Oct 2023 10:28:41 -0500 Subject: [PATCH 02/11] Remove old blake3 files from setup.py --- setup.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/setup.py b/setup.py index 4531a6957..0250eaceb 100644 --- a/setup.py +++ b/setup.py @@ -102,12 +102,6 @@ def __str__(self): "lib/FiniteStateEntropy/lib/hist.c", "python-bindings/chiapos.cpp", "uint128_t/uint128_t.cpp", - "src/b3/blake3.c", - "src/b3/blake3_portable.c", - "src/b3/blake3_dispatch.c", - "src/b3/blake3_avx2.c", - "src/b3/blake3_avx512.c", - "src/b3/blake3_sse41.c", "src/chacha8.c", ], include_dirs=[ From b9c15070be5b6b3cba2552593fedd72eb3fa3dcf Mon Sep 17 00:00:00 2001 From: Chris Marslender Date: Fri, 13 Oct 2023 10:43:04 -0500 Subject: [PATCH 03/11] Update cibuildwheel version --- .github/workflows/build-wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index c4166491b..43cc4f246 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -141,7 +141,7 @@ jobs: CIBW_MANYLINUX_X86_64_IMAGE: ${{ matrix.python.manylinux['intel'] }} CIBW_ARCHS_MACOS: ${{ matrix.os.cibw-archs-macos[matrix.arch.matrix] }} run: - pipx run --spec='cibuildwheel==2.11.2' cibuildwheel --output-dir dist 2>&1 + pipx run --spec='cibuildwheel==2.16.2' cibuildwheel --output-dir dist 2>&1 - name: Upload artifacts uses: actions/upload-artifact@v3 From e860807005c90e0da8adc60cdb068dcd26318c31 Mon Sep 17 00:00:00 
2001 From: Chris Marslender Date: Fri, 13 Oct 2023 12:13:14 -0500 Subject: [PATCH 04/11] Use a longer filename so that std::move on macOS actually moves instead of hitting the small string optimization --- tests/test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test.cpp b/tests/test.cpp index e2d976df4..a64057ed2 100644 --- a/tests/test.cpp +++ b/tests/test.cpp @@ -621,7 +621,7 @@ void TestProofOfSpace( picosha2::hash256(hash_input.begin(), hash_input.end(), hash.begin(), hash.end()); vector qualities = prover.GetQualitiesForChallenge(hash.data()); Verifier verifier = Verifier(); - + for (uint32_t index = 0; index < qualities.size(); index++) { LargeBits proof = prover.GetFullProof(hash.data(), index); proof.ToBytes(proof_data); @@ -1082,7 +1082,7 @@ TEST_CASE("DiskProver") { SECTION("Move constructor") { - std::string filename = "prover_test.plot"; + std::string filename = "prover_test_with_a_long_name_to_avoid_sso.plot"; DiskPlotter plotter = DiskPlotter(); std::vector memo{1, 2, 3}; plotter.CreatePlotDisk( From bf43637c510e5a762a6e49091a0db44eee371355 Mon Sep 17 00:00:00 2001 From: Chris Marslender Date: Fri, 13 Oct 2023 12:18:02 -0500 Subject: [PATCH 05/11] Run C++ tests on macOS --- .github/workflows/build-test-cplusplus.yml | 23 ++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-test-cplusplus.yml b/.github/workflows/build-test-cplusplus.yml index fce3e528b..c0b87a2f3 100644 --- a/.github/workflows/build-test-cplusplus.yml +++ b/.github/workflows/build-test-cplusplus.yml @@ -20,7 +20,7 @@ jobs: runs-on: ubuntu-22.04 steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: cmake, RunTests, and valgrind on ubuntu-20.04 run: | @@ -38,7 +38,7 @@ jobs: runs-on: ubuntu-22.04 steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: cmake, RunTests with address- and undefined sanitizer on Ubuntu run: | @@ -59,7 +59,7 @@ jobs: runs-on: ubuntu-22.04 steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: cmake, RunTests with thread sanitizer on Ubuntu run: | @@ -69,12 +69,27 @@ jobs: cmake --build . -- -j 6 TSAN_OPTIONS="memory_limit_mb=6000" ./RunTests + mac: + name: MacOS + runs-on: macos-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: cmake, RunTests on Mac + run: | + mkdir build + cd build + cmake .. + cmake --build . 
--config Release -j 6 + ./RunTests + windows: name: Windows Latest runs-on: windows-latest steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: cmake, RunTests with Windows run: | From 63a28245aab63e6db0c1e1e4692f51c20b5de595 Mon Sep 17 00:00:00 2001 From: Chris Marslender Date: Fri, 20 Oct 2023 17:15:12 -0500 Subject: [PATCH 06/11] Fix blake3 linking on windows + fix windows tests --- .gitignore | 1 + CMakeLists.txt | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 698734681..956d66bbf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.idea/ ProofOfSpace RunTests HellmanAttacks diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c02d070c..43bef9095 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,14 +138,15 @@ FetchContent_GetProperties(blake3) if(NOT blake3_POPULATED) FetchContent_Populate(blake3) - # Set BLAKE3 to build as a shared library - set(BUILD_SHARED_LIBS TRUE CACHE BOOL "Build shared libraries" FORCE) + # Set BLAKE3 to build as a static library + set(BUILD_SHARED_LIBS FALSE CACHE BOOL "Build static libraries" FORCE) add_subdirectory(${blake3_SOURCE_DIR}/c ${blake3_BINARY_DIR}) endif() set(BLAKE3_SRC ${blake3_SOURCE_DIR}/c) set(BLAKE3_INCLUDE_DIR ${blake3_SOURCE_DIR}/c) +target_link_libraries(chiapos PRIVATE blake3) target_link_libraries(ProofOfSpace PRIVATE blake3) include_directories( ${INCLUDE_DIRECTORIES} @@ -169,7 +170,14 @@ add_executable(RunTests tests/test.cpp src/chacha8.c ) -target_link_libraries(RunTests PRIVATE blake3) + +target_link_libraries(RunTests + PRIVATE + fse + Threads::Threads + Catch2::Catch2 + blake3 +) find_package(Threads REQUIRED) @@ -233,5 +241,5 @@ if (${CP_LINK_BLADEBIT_HARVESTER}) endif() -#enable_testing() -#add_test(NAME RunTests COMMAND RunTests) +enable_testing() +add_test(NAME RunTests COMMAND RunTests) From eca8f917b74f3a246f323cb00b44a7f1e37d45dd Mon Sep 17 00:00:00 2001 From: Chris Marslender Date: Fri, 20 Oct 2023 18:16:09 -0500 Subject: [PATCH 07/11] Use CMake for windows wheels --- setup.py | 166 +++++-------------------------------------------------- 1 file changed, 15 insertions(+), 151 deletions(-) diff --git a/setup.py b/setup.py index 0250eaceb..e371636cb 100644 --- a/setup.py +++ b/setup.py @@ -76,154 +76,18 @@ def build_extension(self, ext): ) -class get_pybind_include(object): - """Helper class to determine the pybind11 include path - - The purpose of this class is to postpone importing pybind11 - until it is actually installed, so that the ``get_include()`` - method can be invoked.""" - - def __init__(self, user=False): - self.user = user - - def __str__(self): - import pybind11 - - return pybind11.get_include(self.user) - - -ext_modules = [ - Extension( - "chiapos", - [ - "lib/FiniteStateEntropy/lib/entropy_common.c", - "lib/FiniteStateEntropy/lib/fse_compress.c", - "lib/FiniteStateEntropy/lib/fse_decompress.c", - "lib/FiniteStateEntropy/lib/hist.c", - "python-bindings/chiapos.cpp", - "uint128_t/uint128_t.cpp", - "src/chacha8.c", - ], - include_dirs=[ - # Path to pybind11 headers - get_pybind_include(), - get_pybind_include(user=True), - "src", - "uint128_t", - ".", - ], - ), -] - - -# As of Python 3.6, CCompiler has a `has_flag` method. -# cf http://bugs.python.org/issue26689 -def has_flag(compiler, flagname): - """Return a boolean indicating whether a flag name is supported on - the specified compiler. 
- """ - import tempfile - - with tempfile.NamedTemporaryFile("w", suffix=".cpp") as f: - f.write("int main (int argc, char **argv) { return 0; }") - try: - compiler.compile([f.name], extra_postargs=[flagname]) - except errors.CompileError: - return False - return True - - -def cpp_flag(compiler): - """Return the -std=c++[11/14/17] compiler flag. - - The newer version is prefered over c++11 (when it is available). - """ - flags = ["-std=c++17", "-std=c++14", "-std=c++11"] - - for flag in flags: - if has_flag(compiler, flag): - return flag - - raise RuntimeError("Unsupported compiler -- at least C++11 support " "is needed!") - - -class BuildExt(build_ext): - """A custom build extension for adding compiler-specific options.""" - - c_opts = { - "msvc": ["/EHsc", "/std:c++17", "/O2"], - "unix": [""], - } - l_opts = { - "msvc": [], - "unix": [""], - } - - if sys.platform == "darwin": - darwin_opts = ["-stdlib=libc++", "-mmacosx-version-min=10.14"] - c_opts["unix"] += darwin_opts - l_opts["unix"] += darwin_opts # type: ignore - - def build_extensions(self): - ct = self.compiler.compiler_type - opts = self.c_opts.get(ct, []) - link_opts = self.l_opts.get(ct, []) - if ct == "unix": - opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version()) - opts.append(cpp_flag(self.compiler)) - if has_flag(self.compiler, "-fvisibility=hidden"): - opts.append("-fvisibility=hidden") - elif ct == "msvc": - opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) - - # Link bladebit_harvester - if os.getenv("CP_USE_GREEN_REAPER") == "1": - opts.append("/DUSE_GREEN_REAPER=1") - opts.append("/DBLADEBIT_HARVESTER_LINKED=1") - opts.append("/Ilibs/green_reaper/include") - link_opts.append("libs/green_reaper/lib/bladebit_harvester.lib") - - for ext in self.extensions: - ext.extra_compile_args = opts - ext.extra_link_args = link_opts - build_ext.build_extensions(self) - - # Copy bladebit_harvester.dll on windows to the target build directory - # in order to package it into the root directory of the wheel - if os.getenv("CP_USE_GREEN_REAPER") == "1" and sys.platform == "win32": - shutil.copy2("libs/green_reaper/lib/bladebit_harvester.dll", self.build_lib + "/bladebit_harvester.dll") - - -if platform.system() == "Windows": - setup( - name="chiapos", - author="Mariano Sorgente", - author_email="mariano@chia.net", - description="Chia proof of space plotting, proving, and verifying (wraps C++)", - license="Apache License", - python_requires=">=3.7", - long_description=open("README.md").read(), - long_description_content_type="text/markdown", - url="https://github.com/Chia-Network/chiapos", - setup_requires=["pybind11>=2.10.0"], - tests_require=["pytest"], - ext_modules=ext_modules, - cmdclass={"build_ext": BuildExt}, - zip_safe=False, - ) -else: - setup( - name="chiapos", - author="Mariano Sorgente", - author_email="mariano@chia.net", - description="Chia proof of space plotting, proving, and verifying (wraps C++)", - license="Apache License", - python_requires=">=3.7", - long_description=open("README.md").read(), - long_description_content_type="text/markdown", - url="https://github.com/Chia-Network/chiapos", - tests_require=["pytest"], - ext_modules=[CMakeExtension("chiapos", ".")], - cmdclass=dict(build_ext=CMakeBuild), - zip_safe=False, - ) +setup( + name="chiapos", + author="Mariano Sorgente", + author_email="mariano@chia.net", + description="Chia proof of space plotting, proving, and verifying (wraps C++)", + license="Apache License", + python_requires=">=3.7", + 
long_description=open("README.md").read(), + long_description_content_type="text/markdown", + url="https://github.com/Chia-Network/chiapos", + tests_require=["pytest"], + ext_modules=[CMakeExtension("chiapos", ".")], + cmdclass=dict(build_ext=CMakeBuild), + zip_safe=False, +) From 544123449bb6e1c270fbf1031ba197df8bba63bb Mon Sep 17 00:00:00 2001 From: Chris Marslender Date: Fri, 20 Oct 2023 18:21:10 -0500 Subject: [PATCH 08/11] cleanup unused imports --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index e371636cb..5f73900c8 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,11 @@ #!/usr/bin/python3 import os import re -import shutil import sys import platform import subprocess -from setuptools import setup, errors, Extension +from setuptools import setup, Extension from setuptools.command.build_ext import build_ext from distutils.version import LooseVersion From 5b828645e34bb6b7d5fc1d1e3b8ebb9cb8c190fd Mon Sep 17 00:00:00 2001 From: Chris Marslender Date: Fri, 20 Oct 2023 20:17:20 -0500 Subject: [PATCH 09/11] Temporarily skip windows tests to debug wheels --- .github/workflows/build-wheels.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 43cc4f246..b82f6e7ba 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -140,6 +140,7 @@ jobs: CIBW_MANYLINUX_AARCH64_IMAGE: ${{ matrix.python.manylinux['arm'] }} CIBW_MANYLINUX_X86_64_IMAGE: ${{ matrix.python.manylinux['intel'] }} CIBW_ARCHS_MACOS: ${{ matrix.os.cibw-archs-macos[matrix.arch.matrix] }} + CIBW_TEST_SKIP: "win*" run: pipx run --spec='cibuildwheel==2.16.2' cibuildwheel --output-dir dist 2>&1 From 06938247806978524d18588a75486b243136ea8a Mon Sep 17 00:00:00 2001 From: Chris Marslender Date: Fri, 20 Oct 2023 20:24:45 -0500 Subject: [PATCH 10/11] Skip all tests for now --- .github/workflows/build-wheels.yml | 1 - pyproject.toml | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index b82f6e7ba..43cc4f246 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -140,7 +140,6 @@ jobs: CIBW_MANYLINUX_AARCH64_IMAGE: ${{ matrix.python.manylinux['arm'] }} CIBW_MANYLINUX_X86_64_IMAGE: ${{ matrix.python.manylinux['intel'] }} CIBW_ARCHS_MACOS: ${{ matrix.os.cibw-archs-macos[matrix.arch.matrix] }} - CIBW_TEST_SKIP: "win*" run: pipx run --spec='cibuildwheel==2.16.2' cibuildwheel --output-dir dist 2>&1 diff --git a/pyproject.toml b/pyproject.toml index 966ac339d..a4aec69f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,8 +6,8 @@ build-backend = "setuptools.build_meta" local_scheme = "no-local-version" [tool.cibuildwheel] -test-requires = "pytest" -test-command = "pytest -v {project}/tests" +#test-requires = "pytest" +#test-command = "pytest -v {project}/tests" skip = "*-manylinux_i686 *-win32 *-musllinux_*" [tool.cibuildwheel.linux] From e66a4a4166ddb004129a27e95e52387ad0ff987d Mon Sep 17 00:00:00 2001 From: Chris Marslender Date: Fri, 20 Oct 2023 21:26:36 -0500 Subject: [PATCH 11/11] Copy bladebit dll on windows when enabled --- CMakeLists.txt | 9 +++++++++ pyproject.toml | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 43bef9095..23e288e70 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -238,6 +238,15 @@ if (${CP_LINK_BLADEBIT_HARVESTER}) set_property(TARGET chiapos APPEND PROPERTY 
BUILD_RPATH "$ORIGIN") set_property(TARGET ProofOfSpace APPEND PROPERTY BUILD_RPATH "$ORIGIN") set_property(TARGET RunTests APPEND PROPERTY BUILD_RPATH "$ORIGIN") + + if (WIN32) + add_custom_command(TARGET chiapos POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${CMAKE_SOURCE_DIR}/libs/green_reaper/lib/bladebit_harvester.dll" + "$<TARGET_FILE_DIR:chiapos>/bladebit_harvester.dll" + ) + message("The bladebit dll was copied to: $<TARGET_FILE_DIR:chiapos>/bladebit_harvester.dll") + endif() endif() diff --git a/pyproject.toml b/pyproject.toml index a4aec69f7..966ac339d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,8 +6,8 @@ build-backend = "setuptools.build_meta" local_scheme = "no-local-version" [tool.cibuildwheel] -#test-requires = "pytest" -#test-command = "pytest -v {project}/tests" +test-requires = "pytest" +test-command = "pytest -v {project}/tests" skip = "*-manylinux_i686 *-win32 *-musllinux_*" [tool.cibuildwheel.linux]
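
A note on the small-string-optimization fix in [PATCH 04/11]: libc++, the standard library on macOS, stores strings of up to 22 bytes inline in the std::string object itself, so "moving" such a string just copies the inline bytes and leaves the source intact, while libstdc++ on Linux stores only up to 15 bytes inline, so the 16-byte name "prover_test.plot" was heap-allocated there and genuinely moved. That is why the DiskProver move-constructor test behaved differently only on macOS. The following standalone sketch (not part of the patch series; the exact inline capacities are implementation-defined) shows how to observe whether a move actually steals the heap buffer:

    #include <iostream>
    #include <string>
    #include <utility>

    // Returns true when the move-constructed string reuses the source's
    // buffer (a genuine pointer steal) rather than copying inline SSO bytes.
    static bool move_steals_buffer(std::string src) {
        const char* before = src.data();
        std::string dst = std::move(src);
        return dst.data() == before;
    }

    int main() {
        std::cout << std::boolalpha;
        // 16 bytes: fits libc++'s inline buffer, so no steal on macOS
        // (libstdc++ heap-allocates it and does steal the buffer).
        std::cout << move_steals_buffer("prover_test.plot") << '\n';
        // Long enough to force a heap allocation on mainstream
        // implementations, so the move transfers buffer ownership.
        std::cout << move_steals_buffer(
                         "prover_test_with_a_long_name_to_avoid_sso.plot")
                  << '\n';
    }

Checking pointer identity rather than the moved-from contents avoids relying on the valid-but-unspecified state a std::string is left in after a move; the patch takes the simpler route of a filename long enough to defeat SSO on every platform.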