Skip to content

Commit

Permalink
Merge pull request #981 from lrzlin/lsx
Browse files Browse the repository at this point in the history
Add LoongArch SX SIMD extension implementation
  • Loading branch information
Cyan4973 authored Dec 4, 2024
2 parents 298d03e + e7c94ef commit 9b1d788
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ jobs:
{ name: 'RISC-V', xcc_pkg: gcc-riscv64-linux-gnu, xcc: riscv64-linux-gnu-gcc, xemu_pkg: qemu-system-riscv64,xemu: qemu-riscv64-static, os: ubuntu-latest, },
# SPARC64 qemu emulation seems broken on Ubuntu-22
{ name: 'SPARC', xcc_pkg: gcc-sparc64-linux-gnu, xcc: sparc64-linux-gnu-gcc, xemu_pkg: qemu-system-sparc, xemu: qemu-sparc64-static, os: ubuntu-20.04, },
{ name: 'LoongArch', xcc_pkg: gcc-14-loongarch64-linux-gnu, xcc: loongarch64-linux-gnu-gcc-14, xemu_pkg: qemu-system-loongarch64, xemu: qemu-loongarch64-static, os: ubuntu-24.04, },

{ name: 'ARM, gcc-10', xcc_pkg: gcc-10-arm-linux-gnueabi, xcc: arm-linux-gnueabi-gcc-10, xemu_pkg: qemu-system-arm, xemu: qemu-arm-static, os: ubuntu-20.04, },
{ name: 'AARCH64, gcc-10', xcc_pkg: gcc-10-aarch64-linux-gnu, xcc: aarch64-linux-gnu-gcc-10, xemu_pkg: qemu-system-arm, xemu: qemu-aarch64-static, os: ubuntu-20.04, },
Expand Down Expand Up @@ -474,6 +475,11 @@ jobs:
run: |
make clean; LDFLAGS="-static" CC=$XCC RUN_ENV=$XEMU make check
- name: LoongArch (XXH_VECTOR=[ scalar, LSX ])
if: ${{ startsWith(matrix.name, 'LoongArch') }}
run: |
CPPFLAGS="-DXXH_VECTOR=XXH_SCALAR" LDFLAGS="-static" CC=$XCC RUN_ENV=$XEMU make clean check
CPPFLAGS=-DXXH_VECTOR=XXH_LSX CFLAGS="-O3 -march=la464 -mlsx" LDFLAGS="-static" CC=$XCC RUN_ENV=$XEMU make clean check
# macOS

Expand Down
2 changes: 2 additions & 0 deletions cli/xsum_arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@
# else
# define XSUM_ARCH "wasm/asmjs"
# endif
#elif defined(__loongarch_lp64)
# define XSUM_ARCH "loongarch"
#else
# define XSUM_ARCH "unknown"
#endif
Expand Down
79 changes: 79 additions & 0 deletions xxhash.h
Original file line number Diff line number Diff line change
Expand Up @@ -3749,6 +3749,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
# include <immintrin.h>
# elif defined(__SSE2__)
# include <emmintrin.h>
# elif defined(__loongarch_sx)
# include <lsxintrin.h>
# endif
#endif

Expand Down Expand Up @@ -3871,6 +3873,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
*/
XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */
XXH_LSX = 7, /*!< LSX (128-bit SIMD) for LoongArch64 */
};
/*!
* @ingroup tuning
Expand All @@ -3893,6 +3896,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
# define XXH_NEON 4
# define XXH_VSX 5
# define XXH_SVE 6
# define XXH_LSX 7
#endif

#ifndef XXH_VECTOR /* can be defined on command line */
Expand All @@ -3917,6 +3921,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
|| (defined(__s390x__) && defined(__VEC__)) \
&& defined(__GNUC__) /* TODO: IBM XL */
# define XXH_VECTOR XXH_VSX
# elif defined(__loongarch_sx)
# define XXH_VECTOR XXH_LSX
# else
# define XXH_VECTOR XXH_SCALAR
# endif
Expand Down Expand Up @@ -3954,6 +3960,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
# define XXH_ACC_ALIGN 64
# elif XXH_VECTOR == XXH_SVE /* sve */
# define XXH_ACC_ALIGN 64
# elif XXH_VECTOR == XXH_LSX /* lsx */
# define XXH_ACC_ALIGN 64
# endif
#endif

Expand Down Expand Up @@ -5591,6 +5599,71 @@ XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,

#endif

#if (XXH_VECTOR == XXH_LSX)
#define _LSX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

XXH_FORCE_INLINE void
XXH3_accumulate_512_lsx( void* XXH_RESTRICT acc,
const void* XXH_RESTRICT input,
const void* XXH_RESTRICT secret)
{
XXH_ASSERT((((size_t)acc) & 15) == 0);
{
__m128i* const xacc = (__m128i *) acc;
const __m128i* const xinput = (const __m128i *) input;
const __m128i* const xsecret = (const __m128i *) secret;

for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
/* data_vec = xinput[i]; */
__m128i const data_vec = __lsx_vld(xinput + i, 0);
/* key_vec = xsecret[i]; */
__m128i const key_vec = __lsx_vld(xsecret + i, 0);
/* data_key = data_vec ^ key_vec; */
__m128i const data_key = __lsx_vxor_v(data_vec, key_vec);
/* data_key_lo = data_key >> 32; */
__m128i const data_key_lo = __lsx_vsrli_d(data_key, 32);
// __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32);
/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
__m128i const product = __lsx_vmulwev_d_wu(data_key, data_key_lo);
/* xacc[i] += swap(data_vec); */
__m128i const data_swap = __lsx_vshuf4i_w(data_vec, _LSX_SHUFFLE(1, 0, 3, 2));
__m128i const sum = __lsx_vadd_d(xacc[i], data_swap);
/* xacc[i] += product; */
xacc[i] = __lsx_vadd_d(product, sum);
}
}
}
XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lsx)

XXH_FORCE_INLINE void
XXH3_scrambleAcc_lsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
XXH_ASSERT((((size_t)acc) & 15) == 0);
{
__m128i* const xacc = (__m128i*) acc;
const __m128i* const xsecret = (const __m128i *) secret;
const __m128i prime32 = __lsx_vreplgr2vr_w((int)XXH_PRIME32_1);

for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
/* xacc[i] ^= (xacc[i] >> 47) */
__m128i const acc_vec = xacc[i];
__m128i const shifted = __lsx_vsrli_d(acc_vec, 47);
__m128i const data_vec = __lsx_vxor_v(acc_vec, shifted);
/* xacc[i] ^= xsecret[i]; */
__m128i const key_vec = __lsx_vld(xsecret + i, 0);
__m128i const data_key = __lsx_vxor_v(data_vec, key_vec);

/* xacc[i] *= XXH_PRIME32_1; */
__m128i const data_key_hi = __lsx_vsrli_d(data_key, 32);
__m128i const prod_lo = __lsx_vmulwev_d_wu(data_key, prime32);
__m128i const prod_hi = __lsx_vmulwev_d_wu(data_key_hi, prime32);
xacc[i] = __lsx_vadd_d(prod_lo, __lsx_vslli_d(prod_hi, 32));
}
}
}

#endif

/* scalar variants - universal */

#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
Expand Down Expand Up @@ -5821,6 +5894,12 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#elif (XXH_VECTOR == XXH_LSX)
#define XXH3_accumulate_512 XXH3_accumulate_512_lsx
#define XXH3_accumulate XXH3_accumulate_lsx
#define XXH3_scrambleAcc XXH3_scrambleAcc_lsx
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#else /* scalar */

#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
Expand Down

0 comments on commit 9b1d788

Please sign in to comment.