From 416a83e04e55c90143df5528d8a0e80522ea06d0 Mon Sep 17 00:00:00 2001
From: Nikolai Ponomarev
Date: Thu, 7 Sep 2023 12:49:04 +0300
Subject: [PATCH 1/3] Make xxhsum know about RISC-V

---
 cli/xsum_arch.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cli/xsum_arch.h b/cli/xsum_arch.h
index f025f345..2aba7d5d 100644
--- a/cli/xsum_arch.h
+++ b/cli/xsum_arch.h
@@ -161,6 +161,8 @@
 # else
 #  define XSUM_ARCH "wasm/asmjs"
 # endif
+#elif defined(__riscv)
+#  define XSUM_ARCH "riscv"
 #else
 #  define XSUM_ARCH "unknown"
 #endif

From 167a006b893e51434399c6c454db3745fd7d12c1 Mon Sep 17 00:00:00 2001
From: Nikolai Ponomarev
Date: Thu, 7 Sep 2023 16:35:54 +0300
Subject: [PATCH 2/3] Add basic RVV support for XXH3 and XXH128

---
 xxhash.h | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/xxhash.h b/xxhash.h
index d11f0f63..721089ca 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -3701,6 +3701,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
 # include
 # elif defined(__SSE2__)
 #  include <emmintrin.h>
+# elif defined(__riscv_vector)
+#  include <riscv_vector.h>
 # endif
 #endif
@@ -3823,6 +3825,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
  */
     XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
     XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */
+    XXH_RVV = 7, /*!< RVV for RISC-V */
 };
 /*!
  * @ingroup tuning
@@ -3845,6 +3848,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 # define XXH_NEON 4
 # define XXH_VSX 5
 # define XXH_SVE 6
+# define XXH_RVV 7
 #endif

 #ifndef XXH_VECTOR /* can be defined on command line */
@@ -3869,6 +3873,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
      || (defined(__s390x__) && defined(__VEC__)) \
      && defined(__GNUC__) /* TODO: IBM XL */
 #   define XXH_VECTOR XXH_VSX
+# elif defined(__riscv_vector)
+#   define XXH_VECTOR XXH_RVV
 # else
 #   define XXH_VECTOR XXH_SCALAR
 # endif
@@ -3906,6 +3912,8 @@
 #   define XXH_ACC_ALIGN 64
 # elif XXH_VECTOR == XXH_SVE /* sve */
 #   define XXH_ACC_ALIGN 64
+# elif XXH_VECTOR == XXH_RVV /* rvv */
+#   define XXH_ACC_ALIGN 64
 # endif
 #endif
@@ -5548,6 +5556,83 @@ XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,

 #endif

+#if (XXH_VECTOR == XXH_RVV)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_rvv( void* XXH_RESTRICT acc,
+                   const void* XXH_RESTRICT input,
+                   const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    /* Try to set the vector length to 512 bits (8 x 64-bit elements);
+     * if that length is unavailable, the maximum available one is used. */
+    size_t vl = vsetvl_e64m1(8);
+
+    uint64_t* const xacc = (uint64_t*) acc;
+    const uint64_t* const xinput = (const uint64_t*) input;
+    const uint64_t* const xsecret = (const uint64_t*) secret;
+    uint64_t swap_mask[8] = {1, 0, 3, 2, 5, 4, 7, 6};
+    vuint64m1_t xswap_mask = vle64_v_u64m1(swap_mask, vl);
+
+    /* vuint64m1_t is sizeless,
+     * but vl can only be 2, 4 or 8 here. */
+    for (size_t i = 0; i < XXH_STRIPE_LEN/(8 * vl); i++) {
+        /* data_vec = input[i]; */
+        vuint64m1_t data_vec = vreinterpret_v_u8m1_u64m1(vle8_v_u8m1((const uint8_t*)(xinput + vl * i), vl * 8));
+        /* key_vec = secret[i]; */
+        vuint64m1_t key_vec = vreinterpret_v_u8m1_u64m1(vle8_v_u8m1((const uint8_t*)(xsecret + vl * i), vl * 8));
+        /* data_key = data_vec ^ key_vec; */
+        vuint64m1_t data_key = vxor_vv_u64m1(data_vec, key_vec, vl);
+        /* data_key_lo = data_key >> 32; */
+        vuint64m1_t data_key_lo = vsrl_vx_u64m1(data_key, 32, vl);
+        /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+        vuint64m1_t product = vmul_vv_u64m1(vand_vx_u64m1(data_key, 0xffffffff, vl), vand_vx_u64m1(data_key_lo, 0xffffffff, vl), vl);
+        /* acc_vec = xacc[i]; */
+        vuint64m1_t acc_vec = vle64_v_u64m1(xacc + vl * i, vl);
+        acc_vec = vadd_vv_u64m1(acc_vec, product, vl);
+        /* swap high and low halves */
+        vuint64m1_t data_swap = vrgather_vv_u64m1(data_vec, xswap_mask, vl);
+        acc_vec = vadd_vv_u64m1(acc_vec, data_swap, vl);
+        vse64_v_u64m1(xacc + vl * i, acc_vec, vl);
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(rvv)
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_rvv(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+
+    /* Try to set the vector length to 512 bits (8 x 64-bit elements);
+     * if that length is unavailable, the maximum available one is used. */
+    size_t vl = vsetvl_e64m1(8);
+    uint64_t* const xacc = (uint64_t*) acc;
+    const uint64_t* const xsecret = (const uint64_t*) secret;
+
+    uint64_t prime[8] = {XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1};
+    vuint64m1_t vprime = vle64_v_u64m1(prime, vl);
+
+    /* vuint64m1_t is sizeless,
+     * but vl can only be 2, 4 or 8 here. */
+    for (size_t i = 0; i < XXH_STRIPE_LEN/(8 * vl); i++) {
+        /* xacc[i] ^= (xacc[i] >> 47) */
+        vuint64m1_t acc_vec = vle64_v_u64m1(xacc + vl * i, vl);
+        vuint64m1_t shifted = vsrl_vx_u64m1(acc_vec, 47, vl);
+        vuint64m1_t data_vec = vxor_vv_u64m1(acc_vec, shifted, vl);
+        /* xacc[i] ^= xsecret[i]; */
+        vuint64m1_t key_vec = vreinterpret_v_u8m1_u64m1(vle8_v_u8m1((const uint8_t*)(xsecret + vl * i), vl * 8));
+        vuint64m1_t data_key = vxor_vv_u64m1(data_vec, key_vec, vl);
+
+        /* xacc[i] *= XXH_PRIME32_1; */
+        vuint64m1_t prod_even = vmul_vv_u64m1(vand_vx_u64m1(data_key, 0xffffffff, vl), vprime, vl);
+        vuint64m1_t prod_odd = vmul_vv_u64m1(vsrl_vx_u64m1(data_key, 32, vl), vprime, vl);
+        vuint64m1_t prod = vadd_vv_u64m1(prod_even, vsll_vx_u64m1(prod_odd, 32, vl), vl);
+        vse64_v_u64m1(xacc + vl * i, prod, vl);
+    }
+}
+
+#endif
+
 /* scalar variants - universal */

 #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
@@ -5778,6 +5863,14 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
 #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

+#elif (XXH_VECTOR == XXH_RVV)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_rvv
+#define XXH3_accumulate XXH3_accumulate_rvv
+#define XXH3_scrambleAcc XXH3_scrambleAcc_rvv
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+
 #else /* scalar */

 #define XXH3_accumulate_512 XXH3_accumulate_512_scalar

From 7e6be851c3960d09ac6e4f479142d3d302b600be Mon Sep 17 00:00:00 2001
From: Nikolai Ponomarev
Date: Sat, 9 Sep 2023 11:23:25 +0300
Subject: [PATCH 3/3] Make RVV code compatible with more compilers

GCC >= 13 and Clang >= 16 use RVV intrinsics with the __riscv_ prefix.
---
 xxhash.h | 55 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/xxhash.h b/xxhash.h
index 721089ca..fb96a37a 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -5558,6 +5558,15 @@ XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,

 #if (XXH_VECTOR == XXH_RVV)

+#if ((defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 13) || \
+     (defined(__clang__) && __clang_major__ < 16))
+  #define RVV_OP(op) op
+#else
+  #define concat2(X, Y) X ## Y
+  #define concat(X, Y) concat2(X, Y)
+  #define RVV_OP(op) concat(__riscv_, op)
+#endif
+
 XXH_FORCE_INLINE void
 XXH3_accumulate_512_rvv( void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
@@ -5566,34 +5575,34 @@ XXH3_accumulate_512_rvv( void* XXH_RESTRICT acc,
     XXH_ASSERT((((size_t)acc) & 63) == 0);
     /* Try to set the vector length to 512 bits (8 x 64-bit elements);
      * if that length is unavailable, the maximum available one is used. */
-    size_t vl = vsetvl_e64m1(8);
+    size_t vl = RVV_OP(vsetvl_e64m1)(8);

     uint64_t* const xacc = (uint64_t*) acc;
     const uint64_t* const xinput = (const uint64_t*) input;
     const uint64_t* const xsecret = (const uint64_t*) secret;
     uint64_t swap_mask[8] = {1, 0, 3, 2, 5, 4, 7, 6};
-    vuint64m1_t xswap_mask = vle64_v_u64m1(swap_mask, vl);
+    vuint64m1_t xswap_mask = RVV_OP(vle64_v_u64m1)(swap_mask, vl);

     /* vuint64m1_t is sizeless,
      * but vl can only be 2, 4 or 8 here. */
     for (size_t i = 0; i < XXH_STRIPE_LEN/(8 * vl); i++) {
         /* data_vec = input[i]; */
-        vuint64m1_t data_vec = vreinterpret_v_u8m1_u64m1(vle8_v_u8m1((const uint8_t*)(xinput + vl * i), vl * 8));
+        vuint64m1_t data_vec = RVV_OP(vreinterpret_v_u8m1_u64m1)(RVV_OP(vle8_v_u8m1)((const uint8_t*)(xinput + vl * i), vl * 8));
         /* key_vec = secret[i]; */
-        vuint64m1_t key_vec = vreinterpret_v_u8m1_u64m1(vle8_v_u8m1((const uint8_t*)(xsecret + vl * i), vl * 8));
+        vuint64m1_t key_vec = RVV_OP(vreinterpret_v_u8m1_u64m1)(RVV_OP(vle8_v_u8m1)((const uint8_t*)(xsecret + vl * i), vl * 8));
         /* data_key = data_vec ^ key_vec; */
-        vuint64m1_t data_key = vxor_vv_u64m1(data_vec, key_vec, vl);
+        vuint64m1_t data_key = RVV_OP(vxor_vv_u64m1)(data_vec, key_vec, vl);
         /* data_key_lo = data_key >> 32; */
-        vuint64m1_t data_key_lo = vsrl_vx_u64m1(data_key, 32, vl);
+        vuint64m1_t data_key_lo = RVV_OP(vsrl_vx_u64m1)(data_key, 32, vl);
         /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
-        vuint64m1_t product = vmul_vv_u64m1(vand_vx_u64m1(data_key, 0xffffffff, vl), vand_vx_u64m1(data_key_lo, 0xffffffff, vl), vl);
+        vuint64m1_t product = RVV_OP(vmul_vv_u64m1)(RVV_OP(vand_vx_u64m1)(data_key, 0xffffffff, vl), RVV_OP(vand_vx_u64m1)(data_key_lo, 0xffffffff, vl), vl);
         /* acc_vec = xacc[i]; */
-        vuint64m1_t acc_vec = vle64_v_u64m1(xacc + vl * i, vl);
-        acc_vec = vadd_vv_u64m1(acc_vec, product, vl);
+        vuint64m1_t acc_vec = RVV_OP(vle64_v_u64m1)(xacc + vl * i, vl);
+        acc_vec = RVV_OP(vadd_vv_u64m1)(acc_vec, product, vl);
         /* swap high and low halves */
-        vuint64m1_t data_swap = vrgather_vv_u64m1(data_vec, xswap_mask, vl);
-        acc_vec = vadd_vv_u64m1(acc_vec, data_swap, vl);
-        vse64_v_u64m1(xacc + vl * i, acc_vec, vl);
+        vuint64m1_t data_swap = RVV_OP(vrgather_vv_u64m1)(data_vec, xswap_mask, vl);
+        acc_vec = RVV_OP(vadd_vv_u64m1)(acc_vec, data_swap, vl);
+        RVV_OP(vse64_v_u64m1)(xacc + vl * i, acc_vec, vl);
     }
 }
 XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(rvv)
@@ -5605,29 +5614,29 @@ XXH3_scrambleAcc_rvv(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
     XXH_ASSERT((((size_t)acc) & 63) == 0);

     /* Try to set the vector length to 512 bits (8 x 64-bit elements);
      * if that length is unavailable, the maximum available one is used. */
-    size_t vl = vsetvl_e64m1(8);
+    size_t vl = RVV_OP(vsetvl_e64m1)(8);
     uint64_t* const xacc = (uint64_t*) acc;
     const uint64_t* const xsecret = (const uint64_t*) secret;

     uint64_t prime[8] = {XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1};
-    vuint64m1_t vprime = vle64_v_u64m1(prime, vl);
+    vuint64m1_t vprime = RVV_OP(vle64_v_u64m1)(prime, vl);

     /* vuint64m1_t is sizeless,
      * but vl can only be 2, 4 or 8 here. */
     for (size_t i = 0; i < XXH_STRIPE_LEN/(8 * vl); i++) {
         /* xacc[i] ^= (xacc[i] >> 47) */
-        vuint64m1_t acc_vec = vle64_v_u64m1(xacc + vl * i, vl);
-        vuint64m1_t shifted = vsrl_vx_u64m1(acc_vec, 47, vl);
-        vuint64m1_t data_vec = vxor_vv_u64m1(acc_vec, shifted, vl);
+        vuint64m1_t acc_vec = RVV_OP(vle64_v_u64m1)(xacc + vl * i, vl);
+        vuint64m1_t shifted = RVV_OP(vsrl_vx_u64m1)(acc_vec, 47, vl);
+        vuint64m1_t data_vec = RVV_OP(vxor_vv_u64m1)(acc_vec, shifted, vl);
         /* xacc[i] ^= xsecret[i]; */
-        vuint64m1_t key_vec = vreinterpret_v_u8m1_u64m1(vle8_v_u8m1((const uint8_t*)(xsecret + vl * i), vl * 8));
-        vuint64m1_t data_key = vxor_vv_u64m1(data_vec, key_vec, vl);
+        vuint64m1_t key_vec = RVV_OP(vreinterpret_v_u8m1_u64m1)(RVV_OP(vle8_v_u8m1)((const uint8_t*)(xsecret + vl * i), vl * 8));
+        vuint64m1_t data_key = RVV_OP(vxor_vv_u64m1)(data_vec, key_vec, vl);

         /* xacc[i] *= XXH_PRIME32_1; */
-        vuint64m1_t prod_even = vmul_vv_u64m1(vand_vx_u64m1(data_key, 0xffffffff, vl), vprime, vl);
-        vuint64m1_t prod_odd = vmul_vv_u64m1(vsrl_vx_u64m1(data_key, 32, vl), vprime, vl);
-        vuint64m1_t prod = vadd_vv_u64m1(prod_even, vsll_vx_u64m1(prod_odd, 32, vl), vl);
-        vse64_v_u64m1(xacc + vl * i, prod, vl);
+        vuint64m1_t prod_even = RVV_OP(vmul_vv_u64m1)(RVV_OP(vand_vx_u64m1)(data_key, 0xffffffff, vl), vprime, vl);
+        vuint64m1_t prod_odd = RVV_OP(vmul_vv_u64m1)(RVV_OP(vsrl_vx_u64m1)(data_key, 32, vl), vprime, vl);
+        vuint64m1_t prod = RVV_OP(vadd_vv_u64m1)(prod_even, RVV_OP(vsll_vx_u64m1)(prod_odd, 32, vl), vl);
+        RVV_OP(vse64_v_u64m1)(xacc + vl * i, prod, vl);
     }
 }
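
As a standalone sanity check of the prefixed intrinsic naming that PATCH 3/3 targets, the sketch below mirrors the XOR-and-multiply lane step of XXH3_accumulate_512_rvv using the __riscv_-prefixed intrinsics of GCC >= 13 / Clang >= 16. It is illustrative only and not part of the patches: the file name, test values, and build commands are assumptions, and it calls the prefixed names directly rather than going through RVV_OP.

/* rvv_step_check.c - illustrative sketch, not part of the patches above.
 * Mirrors the XOR + 32x32->64-bit multiply step of XXH3_accumulate_512_rvv
 * with the __riscv_-prefixed intrinsics (GCC >= 13, Clang >= 16).
 *
 * Assumed build/run commands:
 *   gcc -O2 -march=rv64gcv rvv_step_check.c -o rvv_step_check
 *   qemu-riscv64 ./rvv_step_check
 */
#include <riscv_vector.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t data[8], key[8], out[8];
    size_t i;
    for (i = 0; i < 8; i++) {
        data[i] = 0x0123456789ABCDEFULL * (uint64_t)(i + 1); /* arbitrary test input */
        key[i]  = 0x9E3779B97F4A7C15ULL * (uint64_t)(i + 1); /* arbitrary test "secret" */
    }
    /* Request 8 x 64-bit lanes; the hardware may grant only 2 or 4. */
    size_t vl = __riscv_vsetvl_e64m1(8);
    vuint64m1_t data_vec = __riscv_vle64_v_u64m1(data, vl);
    vuint64m1_t key_vec  = __riscv_vle64_v_u64m1(key, vl);
    /* data_key = data ^ key; product = (data_key & 0xffffffff) * (data_key >> 32) */
    vuint64m1_t data_key = __riscv_vxor_vv_u64m1(data_vec, key_vec, vl);
    vuint64m1_t lo = __riscv_vand_vx_u64m1(data_key, 0xffffffff, vl);
    vuint64m1_t hi = __riscv_vsrl_vx_u64m1(data_key, 32, vl);
    vuint64m1_t product = __riscv_vmul_vv_u64m1(lo, hi, vl);
    __riscv_vse64_v_u64m1(out, product, vl);
    printf("vl = %zu\n", vl);
    for (i = 0; i < vl; i++)
        printf("lane %zu: 0x%016llx\n", i, (unsigned long long)out[i]);
    return 0;
}

On an older RVV-capable toolchain the same program should compile once the __riscv_ prefixes are removed; that one-token difference is exactly what the RVV_OP macro papers over.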