From cffae9cb0f68192eaf04cafff30e14b8ee465686 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sun, 30 Jun 2024 13:41:03 -0700 Subject: [PATCH 1/2] AVX128: Minor optimization to 256-bit vpshufb --- .../Source/Interface/Core/OpcodeDispatcher.h | 3 ++- .../Core/OpcodeDispatcher/AVX_128.cpp | 3 ++- .../Core/OpcodeDispatcher/Vector.cpp | 23 +++++++++++-------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 769432fe8e..4953b0e7b1 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -1469,7 +1469,8 @@ class OpDispatchBuilder final : public IREmitter { Ref PSADBWOpImpl(size_t Size, Ref Src1, Ref Src2); - Ref PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2); + Ref GeneratePSHUFBMask(uint8_t SrcSize); + Ref PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2, Ref MaskVector); Ref PSIGNImpl(OpcodeArgs, size_t ElementSize, Ref Src1, Ref Src2); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index cf86460887..e613652c36 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -2012,8 +2012,9 @@ void OpDispatchBuilder::AVX128_VHSUBP(OpcodeArgs) { } void OpDispatchBuilder::AVX128_VPSHUFB(OpcodeArgs) { + auto MaskVector = GeneratePSHUFBMask(OpSize::i128Bit); AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i8Bit, - [this](size_t, Ref Src1, Ref Src2) { return PSHUFBOpImpl(OpSize::i128Bit, Src1, Src2); }); + [this, MaskVector](size_t, Ref Src1, Ref Src2) { return PSHUFBOpImpl(OpSize::i128Bit, Src1, Src2, MaskVector); }); } void OpDispatchBuilder::AVX128_VPSADBW(OpcodeArgs) { diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp index 8e80f1ae63..1c14c2bb1d 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp @@ -989,13 +989,7 @@ template void OpDispatchBuilder::VPUNPCKHOp<2>(OpcodeArgs); template void OpDispatchBuilder::VPUNPCKHOp<4>(OpcodeArgs); template void OpDispatchBuilder::VPUNPCKHOp<8>(OpcodeArgs); -Ref OpDispatchBuilder::PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2) { - const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE; - - // We perform the 256-bit version as two 128-bit operations due to - // the lane splitting behavior, so cap the maximum size at 16. - const auto SanitizedSrcSize = std::min(SrcSize, uint8_t {16}); - +Ref OpDispatchBuilder::GeneratePSHUFBMask(uint8_t SrcSize) { // PSHUFB doesn't 100% match VTBL behaviour // VTBL will set the element zero if the index is greater than // the number of elements in the array @@ -1006,7 +1000,16 @@ Ref OpDispatchBuilder::PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2) { // Bits [6:3] is reserved for 64-bit const uint8_t MaskImm = SrcSize == 8 ? 0b1000'0111 : 0b1000'1111; - Ref MaskVector = _VectorImm(SrcSize, 1, MaskImm); + return _VectorImm(SrcSize, 1, MaskImm); +} + +Ref OpDispatchBuilder::PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2, Ref MaskVector) { + const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE; + + // We perform the 256-bit version as two 128-bit operations due to + // the lane splitting behavior, so cap the maximum size at 16. + const auto SanitizedSrcSize = std::min(SrcSize, uint8_t {16}); + Ref MaskedIndices = _VAnd(SrcSize, SrcSize, Src2, MaskVector); Ref Low = _VTBL1(SanitizedSrcSize, Src1, MaskedIndices); @@ -1024,7 +1027,7 @@ void OpDispatchBuilder::PSHUFBOp(OpcodeArgs) { Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); - Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2); + Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2, GeneratePSHUFBMask(SrcSize)); StoreResult(FPRClass, Op, Result, -1); } @@ -1033,7 +1036,7 @@ void OpDispatchBuilder::VPSHUFBOp(OpcodeArgs) { Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); - Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2); + Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2, GeneratePSHUFBMask(SrcSize)); StoreResult(FPRClass, Op, Result, -1); } From c4604465baa05b7c1a04eb1bddd1f1b1a29b6c71 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sun, 30 Jun 2024 13:41:14 -0700 Subject: [PATCH 2/2] InstcountCI: Update --- .../InstructionCountCI/AVX128/VEX_map2.json | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/unittests/InstructionCountCI/AVX128/VEX_map2.json b/unittests/InstructionCountCI/AVX128/VEX_map2.json index 7fb43a0bec..7f61838e76 100644 --- a/unittests/InstructionCountCI/AVX128/VEX_map2.json +++ b/unittests/InstructionCountCI/AVX128/VEX_map2.json @@ -25,19 +25,18 @@ ] }, "vpshufb ymm0, ymm1, ymm2": { - "ExpectedInstructionCount": 9, + "ExpectedInstructionCount": 8, "Comment": [ "Map 2 0b01 0x00 256-bit" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, #32]", - "ldr q3, [x28, #48]", - "movi v4.16b, #0x8f", - "and v4.16b, v18.16b, v4.16b", - "tbl v16.16b, {v17.16b}, v4.16b", - "movi v4.16b, #0x8f", - "and v3.16b, v3.16b, v4.16b", - "tbl v2.16b, {v2.16b}, v3.16b", + "movi v2.16b, #0x8f", + "ldr q3, [x28, #32]", + "ldr q4, [x28, #48]", + "and v5.16b, v18.16b, v2.16b", + "tbl v16.16b, {v17.16b}, v5.16b", + "and v2.16b, v4.16b, v2.16b", + "tbl v2.16b, {v3.16b}, v2.16b", "str q2, [x28, #16]" ] },