Skip to content

Commit

Permalink
Merge pull request FEX-Emu#3530 from alyssarosenzweig/opt/cmpxchg-flags2
Browse files Browse the repository at this point in the history
Optimize cmpxchg with flagm
  • Loading branch information
alyssarosenzweig authored Mar 30, 2024
2 parents d25ace4 + 9bca052 commit 2a625a4
Show file tree
Hide file tree
Showing 9 changed files with 139 additions and 74 deletions.
26 changes: 26 additions & 0 deletions FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ tags: backend|arm64
$end_info$
*/

#include "FEXCore/IR/IR.h"
#include "Interface/Context/Context.h"
#include "Interface/Core/ArchHelpers/CodeEmitter/Emitter.h"
#include "Interface/Core/ArchHelpers/CodeEmitter/Registers.h"
Expand Down Expand Up @@ -299,6 +300,31 @@ DEF_OP(SubNZCV) {
}
}

DEF_OP(CmpPairZ) {
  // Compares two register pairs for equality and reports the result in the
  // host Z flag only; N, C and V hold the same values after this op as before.
  //
  // NOTE(review): the InstructionCountCI expectations for cmpxchg16b show this
  // compare being emitted on w registers. Confirm that IROp->Size actually
  // equals IR::i64Bit when the pair elements are 64-bit wide.
  auto CmpOp = IROp->C<IR::IROp_CmpPairZ>();
  const uint8_t IRSize = IROp->Size;

  auto EmitSize = ARMEmitter::Size::i32Bit;
  if (IRSize == IR::i64Bit) {
    EmitSize = ARMEmitter::Size::i64Bit;
  }

  // Stash the incoming NZCV in a scratch register before the compare below
  // overwrites all four flags.
  mrs(TMP1, ARMEmitter::SystemRegister::NZCV);

  // cmp checks the first elements. ccmp only performs the second compare when
  // the first one reported "equal"; otherwise it loads the immediate flag set
  // (StatusFlags::None), which leaves Z clear. Z is therefore set exactly when
  // both elements match.
  const auto [LhsFirst, LhsSecond] = GetRegPair(CmpOp->Src1.ID());
  const auto [RhsFirst, RhsSecond] = GetRegPair(CmpOp->Src2.ID());
  cmp(EmitSize, LhsFirst, RhsFirst);
  ccmp(EmitSize, LhsSecond, RhsSecond, ARMEmitter::StatusFlags::None, ARMEmitter::Condition::CC_EQ);

  // Put N, C and V back while keeping the freshly computed Z.
  if (!CTX->HostFeatures.SupportsFlagM) {
    // Without FlagM: materialize Z as 0/1, splice it into bit 30 of the saved
    // flags word, then write the merged word back to NZCV.
    cset(ARMEmitter::Size::i32Bit, TMP2, ARMEmitter::Condition::CC_EQ);
    bfi(ARMEmitter::Size::i32Bit, TMP1, TMP2, 30 /* lsb: Z */, 1);
    msr(ARMEmitter::SystemRegister::NZCV, TMP1);
  } else {
    // With FlagM: a single rmif copies only the masked flags from the saved
    // value; the 0xb mask selects N, C and V and leaves Z alone.
    rmif(TMP1, 0, 0xb /* NzCV */);
  }
}

DEF_OP(CarryInvert) {
LOGMAN_THROW_A_FMT(CTX->HostFeatures.SupportsFlagM, "Unsupported flagm op");
cfinv();
Expand Down
11 changes: 11 additions & 0 deletions FEXCore/Source/Interface/Core/JIT/Arm64/AtomicOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,19 @@ DEF_OP(CASPair) {
mov(EmitSize, Dst.second, TMP4.R());
}
else {
// Save NZCV so we don't have to mark this op as clobbering NZCV (the
// SupportsAtomics path does not clobber NZCV, and this !SupportsAtomics path
// is so slow it's not worth the complexity of splitting the IR op). We
// clobber NZCV inside the hot loop and we can't replace cmp/ccmp/b.ne with
// something NZCV-preserving without requiring an extra instruction.
mrs(TMP1, ARMEmitter::SystemRegister::NZCV);

ARMEmitter::BackwardLabel LoopTop;
ARMEmitter::SingleUseForwardLabel LoopNotExpected;
ARMEmitter::SingleUseForwardLabel LoopExpected;
Bind(&LoopTop);

// This instruction sequence must be synced with HandleCASPAL_Armv8.
ldaxp(EmitSize, TMP2, TMP3, MemSrc);
cmp(EmitSize, TMP2, Expected.first);
ccmp(EmitSize, TMP3, Expected.second, ARMEmitter::StatusFlags::None, ARMEmitter::Condition::CC_EQ);
Expand All @@ -54,6 +62,9 @@ DEF_OP(CASPair) {
// Might have hit the case where ldaxr was hit but stlxr wasn't
clrex();
Bind(&LoopExpected);

// Restore
msr(ARMEmitter::SystemRegister::NZCV, TMP1);
}
}

Expand Down
12 changes: 2 additions & 10 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4258,16 +4258,8 @@ void OpDispatchBuilder::CMPXCHGPairOp(OpcodeArgs) {
OrderedNode *Result_Lower = _ExtractElementPair(IR::SizeToOpSize(Size), CASResult, 0);
OrderedNode *Result_Upper = _ExtractElementPair(IR::SizeToOpSize(Size), CASResult, 1);

// Set ZF if memory result was expected
auto OneConst = _Constant(1);
auto ZeroConst = _Constant(0);

OrderedNode *ZFResult = _Select(FEXCore::IR::COND_EQ,
CASResult, Expected,
OneConst, ZeroConst);

// Set ZF
SetRFLAG<FEXCore::X86State::RFLAG_ZF_RAW_LOC>(ZFResult);
HandleNZCV_RMW();
_CmpPairZ(IR::SizeToOpSize(Size), CASResult, Expected);
CalculateDeferredFlags();

auto UpdateIfNotZF = [this](auto Reg, auto Value) {
Expand Down
6 changes: 5 additions & 1 deletion FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -642,7 +642,6 @@
],
"HasDest": true,
"DestSize": "Size",
"ImplicitFlagClobber": true,
"NumElements": "2",
"EmitValidation": [
"Size == FEXCore::IR::OpSize::i64Bit || Size == FEXCore::IR::OpSize::i128Bit"
Expand Down Expand Up @@ -1095,6 +1094,11 @@
"Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit"
]
},
"CmpPairZ OpSize:#Size, GPRPair:$Src1, GPRPair:$Src2": {
"Desc": ["Compares register pairs and sets Z accordingly, preserving N/C/V.",
"This accelerates cmpxchg."],
"HasSideEffects": true
},
"SubNZCV OpSize:#Size, GPR:$Src1, GPR:$Src2": {
"Desc": ["Set NZCV for the difference of two GPRs. ",
"Carry flag uses arm64 definition, inverted x86.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,12 @@ DeadFlagCalculationEliminination::Classify(IROp_Header *IROp)
.CanEliminate = true,
};

case OP_CMPPAIRZ:
return {
.Write = FLAG_Z,
.CanEliminate = true,
};

case OP_CARRYINVERT:
return {
.Read = FLAG_C,
Expand Down
32 changes: 32 additions & 0 deletions unittests/InstructionCountCI/FlagM/FlagOpts.json
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,38 @@
"mov x26, x5",
"cmn wzr, w26, lsl #24"
]
},
"Dead cmpxchg flags": {
"ExpectedInstructionCount": 23,
"x86Insts": [
"cmpxchg8b [rbp]",
"test rax, rax"
],
"ExpectedArm64ASM": [
"add x20, x9, #0x0 (0)",
"mov w21, w4",
"mov w22, w6",
"mov w23, w22",
"mov w22, w21",
"mov w21, w7",
"mov w24, w5",
"mov w25, w24",
"mov w24, w21",
"mov w2, w22",
"mov w3, w23",
"caspal w2, w3, w24, w25, [x20]",
"mov w20, w2",
"mov w21, w3",
"mov w24, w20",
"mov w25, w21",
"mrs x0, nzcv",
"cmp w20, w22",
"ccmp w21, w23, #nzcv, eq",
"rmif x0, #0, #NzCV",
"csel x4, x24, x4, ne",
"csel x6, x25, x6, ne",
"ands x26, x4, x4"
]
}
}
}
24 changes: 11 additions & 13 deletions unittests/InstructionCountCI/FlagM/HotBlocks_32Bit.json
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
]
},
"dxvk hotblock from MGRR": {
"ExpectedInstructionCount": 42,
"ExpectedInstructionCount": 40,
"Comment": [
"Hottest block in Metal Gear Rising: Revengeance render thread"
],
Expand Down Expand Up @@ -128,21 +128,19 @@
"mov w23, w6",
"mov w24, w21",
"mov w25, w5",
"mrs x21, nzcv",
"mov w2, w22",
"mov w3, w23",
"caspal w2, w3, w24, w25, [x20]",
"mov w24, w2",
"mov w25, w3",
"mov w20, w24",
"mov w12, w25",
"cmp x24, x22",
"ccmp x25, x23, #nzcv, eq",
"cset x22, eq",
"msr nzcv, x21",
"rmif x22, #62, #nZcv",
"csel x4, x20, x4, ne",
"csel x6, x12, x6, ne"
"mov w20, w2",
"mov w21, w3",
"mov w24, w20",
"mov w25, w21",
"mrs x0, nzcv",
"cmp w20, w22",
"ccmp w21, w23, #nzcv, eq",
"rmif x0, #0, #NzCV",
"csel x4, x24, x4, ne",
"csel x6, x25, x6, ne"
]
},
"Psychonauts matrix swizzle": {
Expand Down
48 changes: 22 additions & 26 deletions unittests/InstructionCountCI/FlagM/SecondaryGroup.json
Original file line number Diff line number Diff line change
Expand Up @@ -644,7 +644,7 @@
]
},
"cmpxchg8b [rbp]": {
"ExpectedInstructionCount": 24,
"ExpectedInstructionCount": 22,
"Comment": "GROUP9 0x0F 0xC7 /1",
"ExpectedArm64ASM": [
"add x20, x9, #0x0 (0)",
Expand All @@ -656,47 +656,43 @@
"mov w24, w5",
"mov w25, w24",
"mov w24, w21",
"mrs x21, nzcv",
"mov w2, w22",
"mov w3, w23",
"caspal w2, w3, w24, w25, [x20]",
"mov w24, w2",
"mov w25, w3",
"mov w20, w24",
"mov w30, w25",
"cmp x24, x22",
"ccmp x25, x23, #nzcv, eq",
"cset x22, eq",
"msr nzcv, x21",
"rmif x22, #62, #nZcv",
"csel x4, x20, x4, ne",
"csel x6, x30, x6, ne"
"mov w20, w2",
"mov w21, w3",
"mov w24, w20",
"mov w25, w21",
"mrs x0, nzcv",
"cmp w20, w22",
"ccmp w21, w23, #nzcv, eq",
"rmif x0, #0, #NzCV",
"csel x4, x24, x4, ne",
"csel x6, x25, x6, ne"
]
},
"cmpxchg16b [rbp]": {
"ExpectedInstructionCount": 20,
"ExpectedInstructionCount": 18,
"Comment": "GROUP9 0x0F 0xC7 /1",
"ExpectedArm64ASM": [
"add x20, x9, #0x0 (0)",
"mov x22, x4",
"mov x23, x6",
"mov x24, x7",
"mov x25, x5",
"mrs x21, nzcv",
"mov x2, x22",
"mov x3, x23",
"caspal x2, x3, x24, x25, [x20]",
"mov x24, x2",
"mov x25, x3",
"mov x20, x24",
"mov x30, x25",
"cmp x24, x22",
"ccmp x25, x23, #nzcv, eq",
"cset x22, eq",
"msr nzcv, x21",
"rmif x22, #62, #nZcv",
"csel x4, x20, x4, ne",
"csel x6, x30, x6, ne"
"mov x20, x2",
"mov x21, x3",
"mov x24, x20",
"mov x25, x21",
"mrs x0, nzcv",
"cmp w20, w22",
"ccmp w21, w23, #nzcv, eq",
"rmif x0, #0, #NzCV",
"csel x4, x24, x4, ne",
"csel x6, x25, x6, ne"
]
},
"rdrand ax": {
Expand Down
48 changes: 24 additions & 24 deletions unittests/InstructionCountCI/SecondaryGroup.json
Original file line number Diff line number Diff line change
Expand Up @@ -788,21 +788,21 @@
"mov w24, w5",
"mov w25, w24",
"mov w24, w21",
"mrs x21, nzcv",
"mov w2, w22",
"mov w3, w23",
"caspal w2, w3, w24, w25, [x20]",
"mov w24, w2",
"mov w25, w3",
"mov w20, w24",
"mov w30, w25",
"cmp x24, x22",
"ccmp x25, x23, #nzcv, eq",
"cset x22, eq",
"bfi w21, w22, #30, #1",
"msr nzcv, x21",
"csel x4, x20, x4, ne",
"csel x6, x30, x6, ne"
"mov w20, w2",
"mov w21, w3",
"mov w24, w20",
"mov w25, w21",
"mrs x0, nzcv",
"cmp w20, w22",
"ccmp w21, w23, #nzcv, eq",
"cset w1, eq",
"bfi w0, w1, #30, #1",
"msr nzcv, x0",
"csel x4, x24, x4, ne",
"csel x6, x25, x6, ne"
]
},
"cmpxchg16b [rbp]": {
Expand All @@ -814,21 +814,21 @@
"mov x23, x6",
"mov x24, x7",
"mov x25, x5",
"mrs x21, nzcv",
"mov x2, x22",
"mov x3, x23",
"caspal x2, x3, x24, x25, [x20]",
"mov x24, x2",
"mov x25, x3",
"mov x20, x24",
"mov x30, x25",
"cmp x24, x22",
"ccmp x25, x23, #nzcv, eq",
"cset x22, eq",
"bfi w21, w22, #30, #1",
"msr nzcv, x21",
"csel x4, x20, x4, ne",
"csel x6, x30, x6, ne"
"mov x20, x2",
"mov x21, x3",
"mov x24, x20",
"mov x25, x21",
"mrs x0, nzcv",
"cmp w20, w22",
"ccmp w21, w23, #nzcv, eq",
"cset w1, eq",
"bfi w0, w1, #30, #1",
"msr nzcv, x0",
"csel x4, x24, x4, ne",
"csel x6, x25, x6, ne"
]
},
"rdrand ax": {
Expand Down

0 comments on commit 2a625a4

Please sign in to comment.