Skip to content

Commit

Permalink
Merge pull request FEX-Emu#3789 from Sonicadvance1/avx128_minor_pshufb_opt
Browse files Browse the repository at this point in the history

AVX128: Minor optimization to 256-bit vpshufb
  • Loading branch information
Sonicadvance1 authored Jun 30, 2024
2 parents 4626145 + c460446 commit 5821054
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 21 deletions.
3 changes: 2 additions & 1 deletion FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -1469,7 +1469,8 @@ class OpDispatchBuilder final : public IREmitter {

Ref PSADBWOpImpl(size_t Size, Ref Src1, Ref Src2);

Ref PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2);
Ref GeneratePSHUFBMask(uint8_t SrcSize);
Ref PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2, Ref MaskVector);

Ref PSIGNImpl(OpcodeArgs, size_t ElementSize, Ref Src1, Ref Src2);

Expand Down
3 changes: 2 additions & 1 deletion FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2012,8 +2012,9 @@ void OpDispatchBuilder::AVX128_VHSUBP(OpcodeArgs) {
}

void OpDispatchBuilder::AVX128_VPSHUFB(OpcodeArgs) {
auto MaskVector = GeneratePSHUFBMask(OpSize::i128Bit);
AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i8Bit,
[this](size_t, Ref Src1, Ref Src2) { return PSHUFBOpImpl(OpSize::i128Bit, Src1, Src2); });
[this, MaskVector](size_t, Ref Src1, Ref Src2) { return PSHUFBOpImpl(OpSize::i128Bit, Src1, Src2, MaskVector); });
}

void OpDispatchBuilder::AVX128_VPSADBW(OpcodeArgs) {
Expand Down
23 changes: 13 additions & 10 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -989,13 +989,7 @@ template void OpDispatchBuilder::VPUNPCKHOp<2>(OpcodeArgs);
template void OpDispatchBuilder::VPUNPCKHOp<4>(OpcodeArgs);
template void OpDispatchBuilder::VPUNPCKHOp<8>(OpcodeArgs);

Ref OpDispatchBuilder::PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2) {
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;

// We perform the 256-bit version as two 128-bit operations due to
// the lane splitting behavior, so cap the maximum size at 16.
const auto SanitizedSrcSize = std::min(SrcSize, uint8_t {16});

Ref OpDispatchBuilder::GeneratePSHUFBMask(uint8_t SrcSize) {
// PSHUFB doesn't 100% match VTBL behaviour
// VTBL will set the element zero if the index is greater than
// the number of elements in the array
Expand All @@ -1006,7 +1000,16 @@ Ref OpDispatchBuilder::PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2) {
// Bits [6:3] is reserved for 64-bit
const uint8_t MaskImm = SrcSize == 8 ? 0b1000'0111 : 0b1000'1111;

Ref MaskVector = _VectorImm(SrcSize, 1, MaskImm);
return _VectorImm(SrcSize, 1, MaskImm);
}

Ref OpDispatchBuilder::PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2, Ref MaskVector) {
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;

// We perform the 256-bit version as two 128-bit operations due to
// the lane splitting behavior, so cap the maximum size at 16.
const auto SanitizedSrcSize = std::min(SrcSize, uint8_t {16});

Ref MaskedIndices = _VAnd(SrcSize, SrcSize, Src2, MaskVector);

Ref Low = _VTBL1(SanitizedSrcSize, Src1, MaskedIndices);
Expand All @@ -1024,7 +1027,7 @@ void OpDispatchBuilder::PSHUFBOp(OpcodeArgs) {
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);

Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2);
Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2, GeneratePSHUFBMask(SrcSize));
StoreResult(FPRClass, Op, Result, -1);
}

Expand All @@ -1033,7 +1036,7 @@ void OpDispatchBuilder::VPSHUFBOp(OpcodeArgs) {
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);

Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2);
Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2, GeneratePSHUFBMask(SrcSize));
StoreResult(FPRClass, Op, Result, -1);
}

Expand Down
17 changes: 8 additions & 9 deletions unittests/InstructionCountCI/AVX128/VEX_map2.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,18 @@
]
},
"vpshufb ymm0, ymm1, ymm2": {
"ExpectedInstructionCount": 9,
"ExpectedInstructionCount": 8,
"Comment": [
"Map 2 0b01 0x00 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #32]",
"ldr q3, [x28, #48]",
"movi v4.16b, #0x8f",
"and v4.16b, v18.16b, v4.16b",
"tbl v16.16b, {v17.16b}, v4.16b",
"movi v4.16b, #0x8f",
"and v3.16b, v3.16b, v4.16b",
"tbl v2.16b, {v2.16b}, v3.16b",
"movi v2.16b, #0x8f",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"and v5.16b, v18.16b, v2.16b",
"tbl v16.16b, {v17.16b}, v5.16b",
"and v2.16b, v4.16b, v2.16b",
"tbl v2.16b, {v3.16b}, v2.16b",
"str q2, [x28, #16]"
]
},
Expand Down

0 comments on commit 5821054

Please sign in to comment.