From 6cde31ce899246953e6a0c440ad1a88dce294490 Mon Sep 17 00:00:00 2001 From: Aymen Qader Date: Wed, 7 Feb 2024 06:52:58 +0000 Subject: [PATCH] Improve performance of simde_mm512_add_epi32 (#1126) Improve and simplify implementation of simde_mm512_add_epi32 as follows: (1): Remove the explicit SVE implementation. For SVE vector lengths of VL={128, 256}, this explicit vector length agnostic (VLA) SVE loop performs significantly worse than the Neon equivalent, which can be executed using fewer instructions. This sequence of SVE intrinsics is also malformed according to clang, so it fails to compile altogether. (2): Preferentially use GCC's vector extension if available, instead of repeated calls to simde_mm256_add_epi32. There are a couple of reasons for this: (a) The added indirection results in worse code generation. See the code generation attached to the commit message for an example with GCC 13. (b) GCC's vector extension is an easier optimization target for compilers, allowing them to appropriately output performant code generation depending on their own internal cost & tuning models. See the snippets attached to the commit message for an example of improved code-gen in a vector length specific (VLS) context. This brings the implementation of simde_mm512_add_epi32 back in line with other similar AVX512 intrinsics, such as simde_mm512_sub_epi32 and simde_mm512_mul_ps. Fixes #980. An example of code-gen difference is shown below. Source is a function containing a single call to simde_mm512_add_epi32. 
Compiler: GCC 13.2.0 Compile flags: -O3 -march=armv8-a Before this patch: 0: ld1 {v28.16b-v31.16b}, [x0] 4: sub sp, sp, 0x90 8: ld1 {v24.16b-v27.16b}, [x1] c: add x2, sp, 0x3f 10: and x2, x2, 0xffffffffffffffc0 14: add x0, x2, 0x40 18: add x1, x2, 0x20 1c: add v24.4s, v24.4s, v28.4s 20: add v25.4s, v25.4s, v29.4s 24: add v26.4s, v30.4s, v26.4s 28: add v27.4s, v31.4s, v27.4s 2c: stp q24, q25, [x2, 64] 30: ld1 {v28.16b, v29.16b}, [x0] 34: stp q26, q27, [x2, 64] 38: ld1 {v30.16b, v31.16b}, [x0] 3c: st1 {v28.16b, v29.16b}, [x2] 40: st1 {v30.16b, v31.16b}, [x1] 44: ld1 {v28.16b-v31.16b}, [x2] 48: st1 {v28.16b-v31.16b}, [x8] 4c: add sp, sp, 0x90 50: ret With this patch: 0: ld1 {v28.16b-v31.16b}, [x0] 4: ld1 {v24.16b-v27.16b}, [x1] 8: add v24.4s, v28.4s, v24.4s c: add v25.4s, v29.4s, v25.4s 10: add v26.4s, v30.4s, v26.4s 14: add v27.4s, v31.4s, v27.4s 18: st1 {v24.16b-v27.16b}, [x8] 1c: ret Another example of code-gen difference is shown below, targeting an SVE enabled microarchitecture with a 512-bit vector length in a vector length specific (VLS) context. 
Compiler: GCC 13.2.0 Compile flags: -O3 -march=armv8-a+sve -msve-vector-bits=512 Before this patch: 0: sub sp, sp, 0xf0 4: mov x2, 0x10 // 16 8: ptrue p6.b, vl64 c: add x4, sp, 0x3f 10: mov x3, x2 14: ld1d {z31.d}, p6/z, [x0] 18: and x4, x4, 0xffffffffffffffc0 1c: ld1d {z30.d}, p6/z, [x1] 20: ptrue p7.s, vl16 24: add x6, x4, 0x40 28: add x5, x4, 0x80 2c: st1d {z30.d}, p6, [x4] 30: st1d {z31.d}, p6, [x4, 1, mul vl] 34: nop 38: nop 3c: nop 40: add x0, x4, x2, lsl 2 44: add x1, x6, x2, lsl 2 48: ld1w {z30.s}, p7/z, [x0, -1, mul vl] 4c: ld1w {z31.s}, p7/z, [x1, -1, mul vl] 50: add x0, x5, x2, lsl 2 54: add z30.s, z31.s, z30.s 58: st1w {z30.s}, p7, [x0, -1, mul vl] 5c: whilelo p7.s, x2, x3 60: add x2, x2, 0x10 64: b.ne 40 // b.any 68: ptrue p7.b, vl64 6c: ld1d {z31.d}, p7/z, [x4, 2, mul vl] 70: st1d {z31.d}, p7, [x8] 74: add sp, sp, 0xf0 78: ret With this patch: 0: ptrue p0.b, vl64 4: ld1d {z0.d}, p0/z, [x0] 8: ld1d {z1.d}, p0/z, [x1] c: add z0.s, z0.s, z1.s 10: st1d {z0.d}, p0, [x8] 14: ret --- simde/x86/avx512/add.h | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/simde/x86/avx512/add.h b/simde/x86/avx512/add.h index 2c4c98e6c..d192b2f57 100644 --- a/simde/x86/avx512/add.h +++ b/simde/x86/avx512/add.h @@ -402,23 +402,7 @@ simde_mm512_add_epi32 (simde__m512i a, simde__m512i b) { a_ = simde__m512i_to_private(a), b_ = simde__m512i_to_private(b); - #if defined(SIMDE_ARM_SVE_NATIVE) - const size_t n = sizeof(a_.i32) / sizeof(a_.i32[0]); - size_t i = 0; - svbool_t pg = svwhilelt_b32(i, n); - do { - svint32_t - va = svld1_s32(pg, &(a_.i32[i])), - vb = svld1_s32(pg, &(b_.i32[i])); - svst1_s32(pg, &(r_.i32[i]), svadd_s32_x(pg, va, vb)); - i += svcntw(); - pg = svwhilelt_b32(i, n); - } while (svptest_any(svptrue_b32(), pg)); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_add_epi32(a_.m256i[i], b_.m256i[i]); - } - #elif 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = a_.i32 + b_.i32; #else SIMDE_VECTORIZE