Skip to content

Commit

Permalink
Improve performance of simde_mm512_add_epi32 (#1126)
Browse files Browse the repository at this point in the history
Improve and simplify implementation of simde_mm512_add_epi32 as follows:

(1): Remove the explicit SVE implementation. For SVE vector lengths of
VL={128, 256}, this explicit vector length agnostic (VLA) SVE loop
performs significantly worse than the Neon equivalent, which can be
executed using fewer instructions. This sequence of SVE intrinsics is
also malformed according to clang, so it fails to compile altogether.

(2): Preferentially use GCC's vector extension if available, instead of
repeated calls to simde_mm256_add_epi32. There are a couple of reasons
for this:

(a) The added indirection results in worse code generation. See the code
    generation attached to commit message for an example with GCC 13.

(b) GCC's vector extension is an easier optimization target for
    compilers, allowing them to appropriately output performant code
    generation depending on their own internal cost & tuning models.
    See the snippets attached to commit message for an example of
    improved code-gen in a vector length specific (VLS) context.

This brings the implementation of simde_mm512_add_epi32 back in line
with other similar AVX512 intrinsics, such as simde_mm512_sub_epi32 and
simde_mm512_mul_ps.

Fixes #980.

An example of code-gen difference is shown below. Source is a function
containing a single call to simde_mm512_add_epi32.

Compiler: GCC 13.2.0
Compile flags: -O3 -march=armv8-a

Before this patch:
   0:    ld1    {v28.16b-v31.16b}, [x0]
   4:    sub    sp, sp, 0x90
   8:    ld1    {v24.16b-v27.16b}, [x1]
   c:    add    x2, sp, 0x3f
  10:    and    x2, x2, 0xffffffffffffffc0
  14:    add    x0, x2, 0x40
  18:    add    x1, x2, 0x20
  1c:    add    v24.4s, v24.4s, v28.4s
  20:    add    v25.4s, v25.4s, v29.4s
  24:    add    v26.4s, v30.4s, v26.4s
  28:    add    v27.4s, v31.4s, v27.4s
  2c:    stp    q24, q25, [x2, 64]
  30:    ld1    {v28.16b, v29.16b}, [x0]
  34:    stp    q26, q27, [x2, 64]
  38:    ld1    {v30.16b, v31.16b}, [x0]
  3c:    st1    {v28.16b, v29.16b}, [x2]
  40:    st1    {v30.16b, v31.16b}, [x1]
  44:    ld1    {v28.16b-v31.16b}, [x2]
  48:    st1    {v28.16b-v31.16b}, [x8]
  4c:    add    sp, sp, 0x90
  50:    ret

With this patch:
   0:    ld1    {v28.16b-v31.16b}, [x0]
   4:    ld1    {v24.16b-v27.16b}, [x1]
   8:    add    v24.4s, v28.4s, v24.4s
   c:    add    v25.4s, v29.4s, v25.4s
  10:    add    v26.4s, v30.4s, v26.4s
  14:    add    v27.4s, v31.4s, v27.4s
  18:    st1    {v24.16b-v27.16b}, [x8]
  1c:    ret

Another example of code-gen difference is shown below, targeting an SVE
enabled microarchitecture with a 512-bit vector length in a vector
length specific (VLS) context.

Compiler: GCC 13.2.0
Compile flags: -O3 -march=armv8-a+sve -msve-vector-bits=512

Before this patch:
   0:    sub     sp, sp, 0xf0
   4:    mov     x2, 0x10 // 16
   8:    ptrue   p6.b, vl64
   c:    add     x4, sp, 0x3f
  10:    mov     x3, x2
  14:    ld1d    {z31.d}, p6/z, [x0]
  18:    and     x4, x4, 0xffffffffffffffc0
  1c:    ld1d    {z30.d}, p6/z, [x1]
  20:    ptrue   p7.s, vl16
  24:    add     x6, x4, 0x40
  28:    add     x5, x4, 0x80
  2c:    st1d    {z30.d}, p6, [x4]
  30:    st1d    {z31.d}, p6, [x4, 1, mul vl]
  34:    nop
  38:    nop
  3c:    nop
  40:    add     x0, x4, x2, lsl 2
  44:    add     x1, x6, x2, lsl 2
  48:    ld1w    {z30.s}, p7/z, [x0, -1, mul vl]
  4c:    ld1w    {z31.s}, p7/z, [x1, -1, mul vl]
  50:    add     x0, x5, x2, lsl 2
  54:    add     z30.s, z31.s, z30.s
  58:    st1w    {z30.s}, p7, [x0, -1, mul vl]
  5c:    whilelo p7.s, x2, x3
  60:    add     x2, x2, 0x10
  64:    b.ne    40 // b.any
  68:    ptrue   p7.b, vl64
  6c:    ld1d    {z31.d}, p7/z, [x4, 2, mul vl]
  70:    st1d    {z31.d}, p7, [x8]
  74:    add     sp, sp, 0xf0
  78:    ret

With this patch:
   0:    ptrue   p0.b, vl64
   4:    ld1d    {z0.d}, p0/z, [x0]
   8:    ld1d    {z1.d}, p0/z, [x1]
   c:    add     z0.s, z0.s, z1.s
  10:    st1d    {z0.d}, p0, [x8]
  14:    ret
  • Loading branch information
AymenQ authored Feb 7, 2024
1 parent d31de99 commit 6cde31c
Showing 1 changed file with 1 addition and 17 deletions.
18 changes: 1 addition & 17 deletions simde/x86/avx512/add.h
Original file line number Diff line number Diff line change
Expand Up @@ -402,23 +402,7 @@ simde_mm512_add_epi32 (simde__m512i a, simde__m512i b) {
a_ = simde__m512i_to_private(a),
b_ = simde__m512i_to_private(b);

#if defined(SIMDE_ARM_SVE_NATIVE)
const size_t n = sizeof(a_.i32) / sizeof(a_.i32[0]);
size_t i = 0;
svbool_t pg = svwhilelt_b32(i, n);
do {
svint32_t
va = svld1_s32(pg, &(a_.i32[i])),
vb = svld1_s32(pg, &(b_.i32[i]));
svst1_s32(pg, &(r_.i32[i]), svadd_s32_x(pg, va, vb));
i += svcntw();
pg = svwhilelt_b32(i, n);
} while (svptest_any(svptrue_b32(), pg));
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256)
for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
r_.m256i[i] = simde_mm256_add_epi32(a_.m256i[i], b_.m256i[i]);
}
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.i32 = a_.i32 + b_.i32;
#else
SIMDE_VECTORIZE
Expand Down

0 comments on commit 6cde31c

Please sign in to comment.