Skip to content

Commit

Permalink
Merge pull request #4127 from alyssarosenzweig/opt/masking
Browse files Browse the repository at this point in the history
Optimize bsf, bsr, register cmpxchg, pcmpistri
  • Loading branch information
Sonicadvance1 authored Oct 23, 2024
2 parents 767c61c + d2a42c0 commit caaacb6
Show file tree
Hide file tree
Showing 8 changed files with 159 additions and 243 deletions.
14 changes: 3 additions & 11 deletions FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1280,17 +1280,10 @@ DEF_OP(FindLSB) {
const auto Dst = GetReg(Node);
const auto Src = GetReg(Op->Src.ID());

if (IROp->Size != 8) {
ubfx(EmitSize, TMP1, Src, 0, IROp->Size * 8);
cmp(EmitSize, TMP1, 0);
rbit(EmitSize, TMP1, TMP1);
} else {
rbit(EmitSize, TMP1, Src);
cmp(EmitSize, Src, 0);
}

// We assume the source is nonzero, so we can just rbit+clz without worrying
// about upper garbage for smaller types.
rbit(EmitSize, TMP1, Src);
clz(EmitSize, Dst, TMP1);
csinv(EmitSize, Dst, Dst, ARMEmitter::Reg::zr, ARMEmitter::Condition::CC_NE);
}

DEF_OP(FindMSB) {
Expand All @@ -1307,7 +1300,6 @@ DEF_OP(FindMSB) {

if (OpSize == 2) {
lsl(EmitSize, Dst, Src, 16);
orr(EmitSize, Dst, Dst, 0x8000);
clz(EmitSize, Dst, Dst);
} else {
clz(EmitSize, Dst, Src);
Expand Down
61 changes: 23 additions & 38 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3716,41 +3716,37 @@ void OpDispatchBuilder::IDIVOp(OpcodeArgs) {
void OpDispatchBuilder::BSFOp(OpcodeArgs) {
const uint8_t GPRSize = CTX->GetGPRSize();
const uint8_t DstSize = GetDstSize(Op) == 2 ? 2 : GPRSize;
Ref Dest = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, DstSize, Op->Flags);
Ref Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);

InvalidateDeferredFlags();
CachedNZCV = nullptr;
Ref Dest = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, DstSize, Op->Flags, {.AllowUpperGarbage = true});
Ref Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});

// Find the LSB of this source
auto Result = _FindLSB(OpSizeFromSrc(Op), Src);

// OF, SF, AF, PF, CF all undefined
// ZF is set to 1 if the source was zero
SetNZ_ZeroCV(GetSrcSize(Op), Src);
// TODO: Optimize carry zero
SetZ_InvalidateNCV(OpSizeFromSrc(Op), Src);

// If Src was zero then the destination doesn't get modified
// If Src was zero then the destination doesn't get modified.
//
// Although Intel does not guarantee that semantic, AMD does and Intel
// hardware satisfies it. We provide the stronger AMD behaviour as
// applications might rely on that in the wild.
auto SelectOp = NZCVSelect(IR::SizeToOpSize(GPRSize), {COND_EQ}, Dest, Result);
StoreResult_WithOpSize(GPRClass, Op, Op->Dest, SelectOp, DstSize, -1);
}

void OpDispatchBuilder::BSROp(OpcodeArgs) {
const uint8_t GPRSize = CTX->GetGPRSize();
const uint8_t DstSize = GetDstSize(Op) == 2 ? 2 : GPRSize;
Ref Dest = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, DstSize, Op->Flags);
Ref Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);

InvalidateDeferredFlags();
CachedNZCV = nullptr;
Ref Dest = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, DstSize, Op->Flags, {.AllowUpperGarbage = true});
Ref Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});

// Find the MSB of this source
auto Result = _FindMSB(OpSizeFromSrc(Op), Src);

// OF, SF, AF, PF, CF all undefined
// ZF is set to 1 if the source was zero
SetNZ_ZeroCV(GetSrcSize(Op), Src);
// TODO: Optimize carry zero
SetZ_InvalidateNCV(OpSizeFromSrc(Op), Src);

// If Src was zero then the destination doesn't get modified
auto SelectOp = NZCVSelect(IR::SizeToOpSize(GPRSize), {COND_EQ}, Dest, Result);
Expand All @@ -3776,13 +3772,11 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) {
const auto GPRSize = CTX->GetGPRSize();
auto Size = GetSrcSize(Op);

// This is our source register
Ref Src2 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
// 0x80014000
// 0x80064000
// 0x80064000

if (Op->Dest.IsGPR()) {
// This is our source register
Ref Src2 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
Ref Src3 = LoadGPRRegister(X86State::REG_RAX);

// If the destination is also the accumulator, we get some algebraic
// simplifications. Not sure if this is actually hit but it's in
// InstCountCI.
Expand All @@ -3791,26 +3785,16 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) {
Ref Src1 {};
Ref Src1Lower {};

Ref Src3 {};
Ref Src3Lower {};
if (GPRSize == 8 && Size == 4) {
Src1 = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, GPRSize, Op->Flags);
Src1 = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, GPRSize, Op->Flags, {.AllowUpperGarbage = true});
Src1Lower = _Bfe(IR::SizeToOpSize(GPRSize), Size * 8, 0, Src1);
Src3 = LoadGPRRegister(X86State::REG_RAX);
} else {
Src1 = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, Size, Op->Flags);
Src1 = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, Size, Op->Flags, {.AllowUpperGarbage = true});
Src1Lower = Src1;
Src3 = LoadGPRRegister(X86State::REG_RAX);
}

if (Size != GPRSize) {
Src3Lower = _Bfe(IR::SizeToOpSize(GPRSize), Size * 8, 0, Src3);
} else {
Src3Lower = Src3;
}

// Compare RAX with the destination, setting flags accordingly.
CalculateFlags_SUB(GetSrcSize(Op), Src3Lower, Src1Lower);
CalculateFlags_SUB(GetSrcSize(Op), Src3, Src1Lower);
CalculateDeferredFlags();

if (!Trivial) {
Expand All @@ -3837,6 +3821,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) {
StoreResult(GPRClass, Op, DestResult, -1);
}
} else {
Ref Src2 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
HandledLock = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_LOCK;

Ref Src3 {};
Expand All @@ -3858,15 +3843,15 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) {
Ref CASResult = _CAS(IR::SizeToOpSize(Size), Src3Lower, Src2, Src1);
Ref RAXResult = CASResult;

CalculateFlags_SUB(GetSrcSize(Op), Src3Lower, CASResult);
CalculateDeferredFlags();

if (GPRSize == 8 && Size == 4) {
// This allows us to only hit the ZEXT case on failure
RAXResult = _Select(FEXCore::IR::COND_EQ, CASResult, Src3Lower, Src3, CASResult);
RAXResult = _NZCVSelect(IR::i64Bit, {COND_EQ}, Src3, CASResult);
Size = 8;
}

CalculateFlags_SUB(GetSrcSize(Op), Src3Lower, CASResult);
CalculateDeferredFlags();

// RAX gets the result of the CAS op
StoreGPRRegister(X86State::REG_RAX, RAXResult, Size);
}
Expand Down
13 changes: 13 additions & 0 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -1701,6 +1701,19 @@ class OpDispatchBuilder final : public IREmitter {
PossiblySetNZCVBits |= (1u << Bit);
}

// Z-only flag update: when the N/C/V results are not needed, the zero check
// can be done with a plain mask test, with no shifting of the source.
//
// Size: operand size; values below 4 take the masked path, where the mask
//       covers the low (8 * Size) bits.
// Src:  value whose zero-ness is being tested.
void SetZ_InvalidateNCV(IR::OpSize Size, Ref Src) {
  HandleNZCVWrite();
  CFInverted = true;

  if (Size >= 4) {
    // Operand already fills the tested width: test the source against itself.
    _TestNZ(Size, Src, Src);
    return;
  }

  // Narrow operand: only the low (8 * Size) bits may contribute to the test,
  // so test against a mask of exactly those bits at 32-bit width.
  const auto LowBitsMask = (1u << (8 * Size)) - 1;
  _TestNZ(OpSize::i32Bit, Src, _InlineConstant(LowBitsMask));
}

// Ensure the carry invert flag matches the desired form. Used before an
// operation reading carry or at the end of a block.
void RectifyCarryInvert(bool RequiredInvert) {
Expand Down
5 changes: 2 additions & 3 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -1051,18 +1051,17 @@
"GPR = FindLSB OpSize:#Size, GPR:$Src": {
"Desc": ["Find least-significant-bit set",
"Returns the index of the least significant bit set",
"In the case of zero returns ~0U"
"Undefined result if Src is zero."
],
"DestSize": "Size",
"ImplicitFlagClobber": true,
"EmitValidation": [
"Size == FEXCore::IR::OpSize::i8Bit || Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit"
]
},
"GPR = FindMSB OpSize:#Size, GPR:$Src": {
"Desc": ["Find most-significant-bit set",
"Returns the index of the most significant bit set",
"In the case of zero returns ~0U"
"Undefined result if Src is zero."
],
"DestSize": "Size",
"EmitValidation": [
Expand Down
7 changes: 2 additions & 5 deletions unittests/InstructionCountCI/FlagM/HotBlocks.json
Original file line number Diff line number Diff line change
Expand Up @@ -694,7 +694,7 @@
]
},
"pcmpistri xmm0, xmm1, 0_0_00_11_01b": {
"ExpectedInstructionCount": 41,
"ExpectedInstructionCount": 38,
"Comment": [
"A Hat In Time spends at least 5% CPU time in this instruction",
"Comes from vcruntime140.dll wcsstr"
Expand Down Expand Up @@ -731,11 +731,8 @@
"mov w27, #0x0",
"uxth w21, w20",
"mov w22, #0x8",
"lsr w0, w21, #0",
"cmp w0, #0x0 (0)",
"rbit w0, w0",
"rbit w0, w21",
"clz w23, w0",
"csinv w23, w23, wzr, ne",
"cmp x21, #0x0 (0)",
"csel x7, x22, x23, eq",
"mov w26, #0x1",
Expand Down
Loading

0 comments on commit caaacb6

Please sign in to comment.