From dccae72aa23133ed5093b3d0334f5390fb5c4fc6 Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Tue, 3 Dec 2024 17:40:25 +0100 Subject: [PATCH] Ensure predicate cache is reset when control flow leaves block Whenever the control float leaves the block, it might clobber the predicate register so we reset the cache whenever that happens. The difficulty here is that the cache is valid only during IR generation so we need to make sure we catch all the cases during this pass where the execution might leave the block. Fixes #4264 --- FEXCore/Scripts/json_ir_generator.py | 12 +- .../Interface/Core/OpcodeDispatcher.cpp | 4 +- .../Source/Interface/Core/OpcodeDispatcher.h | 6 +- .../Interface/Core/OpcodeDispatcher/X87.cpp | 14 +- .../Core/OpcodeDispatcher/X87F64.cpp | 2 +- FEXCore/Source/Interface/IR/IR.json | 180 ++++++++++++------ FEXCore/Source/Interface/IR/IREmitter.cpp | 1 + FEXCore/Source/Interface/IR/IREmitter.h | 34 +++- .../IR/Passes/x87StackOptimizationPass.cpp | 9 +- 9 files changed, 183 insertions(+), 79 deletions(-) diff --git a/FEXCore/Scripts/json_ir_generator.py b/FEXCore/Scripts/json_ir_generator.py index cda8d9c509..152535bc41 100755 --- a/FEXCore/Scripts/json_ir_generator.py +++ b/FEXCore/Scripts/json_ir_generator.py @@ -55,6 +55,7 @@ class OpDefinition: NonSSAArgNum: int DynamicDispatch: bool LoweredX87: bool + MaybeClobbersPredRegs: bool JITDispatch: bool JITDispatchOverride: str TiedSource: int @@ -79,6 +80,7 @@ def __init__(self): self.NonSSAArgNum = 0 self.DynamicDispatch = False self.LoweredX87 = False + self.MaybeClobbersPredRegs = False self.JITDispatch = True self.JITDispatchOverride = None self.TiedSource = -1 @@ -223,7 +225,7 @@ def parse_ops(ops): (OpArg.Type == "GPR" or OpArg.Type == "GPRPair" or OpArg.Type == "FPR" or - OpArg.Type == "PR")): + OpArg.Type == "PRED")): OpDef.EmitValidation.append(f"GetOpRegClass({ArgName}) == InvalidClass || WalkFindRegClass({ArgName}) == {OpArg.Type}Class") OpArg.Name = ArgName @@ -277,6 +279,10 @@ def parse_ops(ops): assert("JITDispatch" not in op_val) OpDef.JITDispatch = False + if "MaybeClobbersPredRegs" in op_val: + OpDef.MaybeClobbersPredRegs = op_val["MaybeClobbersPredRegs"] + # TODO: Does this imply !JITDispatch? + if "TiedSource" in op_val: OpDef.TiedSource = op_val["TiedSource"] @@ -506,6 +512,7 @@ def print_ir_hassideeffects(): ("HasSideEffects", "bool"), ("ImplicitFlagClobber", "bool"), ("LoweredX87", "bool"), + ("MaybeClobbersPredRegs", "bool"), ("TiedSource", "int8_t"), ]: output_file.write( @@ -707,6 +714,9 @@ def print_ir_allocator_helpers(): "\t\tif(MMXState == MMXState_MMX) ChgStateMMX_X87();\n" ) + if op.MaybeClobbersPredRegs: + output_file.write("\t\tResetInitPredicateCache();\n") + output_file.write("\t\tauto _Op = AllocateOp();\n".format(op.Name, op.Name.upper())) if op.SSAArgNum != 0: diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 6dd2d8b108..f729823364 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -4314,7 +4314,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T Ref MemSrc = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { // Using SVE we can load this with a single instruction. - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc); } else { // For X87 extended doubles, Split the load. @@ -4448,7 +4448,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl if (OpSize == OpSize::f80Bit) { Ref MemStoreDst = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); _StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst); } else { // For X87 extended doubles, split before storing diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 9545e87612..fe0c4b7b83 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -119,6 +119,7 @@ class OpDispatchBuilder final : public IREmitter { CachedNZCV = nullptr; CFInverted = CFInvertedABI; FlushRegisterCache(); + ResetInitPredicateCache(); // New block needs to reset segment telemetry. SegmentsNeedReadCheck = ~0U; @@ -718,7 +719,6 @@ class OpDispatchBuilder final : public IREmitter { void FNINIT(OpcodeArgs); void X87ModifySTP(OpcodeArgs, bool Inc); - void X87SinCos(OpcodeArgs); void X87FYL2X(OpcodeArgs, bool IsFYL2XP1); void X87LDENV(OpcodeArgs); void X87FLDCW(OpcodeArgs); @@ -764,9 +764,6 @@ class OpDispatchBuilder final : public IREmitter { void FTSTF64(OpcodeArgs); void FRNDINTF64(OpcodeArgs); void FSQRTF64(OpcodeArgs); - void X87UnaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp); - void X87BinaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp); - void X87SinCosF64(OpcodeArgs); void X87FLDCWF64(OpcodeArgs); void X87TANF64(OpcodeArgs); void X87ATANF64(OpcodeArgs); @@ -1175,6 +1172,7 @@ class OpDispatchBuilder final : public IREmitter { } void FlushRegisterCache(bool SRAOnly = false) { + // At block boundaries, fix up the carry flag. if (!SRAOnly) { RectifyCarryInvert(CFInvertedABI); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp index 1470768f8f..3eed79f50d 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp @@ -130,7 +130,11 @@ void OpDispatchBuilder::FILD(OpcodeArgs) { void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) { Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); - _StoreStackMemory(Mem, OpSize::i128Bit, true, Width); + Ref PredReg = Invalid(); + if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { + PredReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); + } + _StoreStackMemory(PredReg, Mem, OpSize::i128Bit, true, Width); if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); } @@ -267,9 +271,9 @@ void OpDispatchBuilder::FDIV(OpcodeArgs, IR::OpSize Width, bool Integer, bool Re void OpDispatchBuilder::FSUB(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpDispatchBuilder::OpResult ResInST0) { if (Op->Src[0].IsNone()) { - const auto Offset = Op->OP & 7; - const auto St0 = 0; - const auto Result = (ResInST0 == OpResult::RES_STI) ? Offset : St0; + const uint8_t Offset = Op->OP & 7; + const uint8_t St0 = 0; + const uint8_t Result = (ResInST0 == OpResult::RES_STI) ? Offset : St0; if (Reverse ^ (ResInST0 == OpResult::RES_STI)) { _F80SubStack(Result, Offset, St0); @@ -751,13 +755,11 @@ void OpDispatchBuilder::FNINIT(OpcodeArgs) { } void OpDispatchBuilder::X87FFREE(OpcodeArgs) { - _InvalidateStack(Op->OP & 7); } void OpDispatchBuilder::X87EMMS(OpcodeArgs) { // Tags all get set to 0b11 - _InvalidateStack(0xff); } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp index ca4e91f0b5..7313125185 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp @@ -105,7 +105,7 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) { void OpDispatchBuilder::FSTF64(OpcodeArgs, IR::OpSize Width) { Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); - _StoreStackMemory(Mem, OpSize::i64Bit, true, Width); + _StoreStackMemory(Invalid(), Mem, OpSize::i64Bit, true, Width); if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index 1cb3e38690..0d63d614bb 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -211,7 +211,8 @@ }, "ThreadRemoveCodeEntry": { - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "GPR = ProcessorID": { @@ -248,7 +249,8 @@ "Print SSA:$Value": { "HasSideEffects": true, "Desc": ["Debug operation that prints an SSA value to the console", - "May only print 64bits of the value"] + "May only print 64bits of the value"], + "MaybeClobbersPredRegs": true }, "GPR = AllocateGPR i1:$ForPair": { "Desc": ["Silly pseudo-instruction to allocate a register for a future destination", @@ -312,7 +314,8 @@ "HasSideEffects": true, "Desc": ["Dispatches a guest syscall through to the SyscallHandler class" ], - "DestSize": "OpSize::i64Bit" + "DestSize": "OpSize::i64Bit", + "MaybeClobbersPredRegs": true }, "GPR = InlineSyscall GPR:$Arg0, GPR:$Arg1, GPR:$Arg2, GPR:$Arg3, GPR:$Arg4, GPR:$Arg5, i32:$HostSyscallNumber, SyscallFlags:$Flags": { @@ -328,18 +331,21 @@ }, "Thunk GPR:$ArgPtr, SHA256Sum:$ThunkNameHash": { - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "GPR:$EAX, GPR:$EBX, GPR:$ECX, GPR:$EDX = CPUID GPR:$Function, GPR:$Leaf": { "Desc": ["Calls in to the CPUID handler function to return emulated CPUID"], "DestSize": "OpSize::i32Bit", - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "GPR:$EAX, GPR:$EDX = XGetBV GPR:$Function": { "Desc": ["Calls in to the XCR handler function to return emulated XCR"], "DestSize": "OpSize::i32Bit", - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true } }, "Moves": { @@ -578,6 +584,7 @@ "HasSideEffects": true, "ElementSize": "ElementSize" }, + "FPR = LoadMemPredicate OpSize:#RegisterSize, OpSize:#ElementSize, PRED:$Mask, GPR:$Addr": { "Desc": [ "Loads a value to memory using SVE predicate mask." ], "DestSize": "RegisterSize", @@ -1631,7 +1638,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "GPR = LUDiv OpSize:#Size, GPR:$Lower, GPR:$Upper, GPR:$Divisor": { "Desc": ["Integer long unsigned division returning lower bits", @@ -1641,7 +1649,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "GPR = LRem OpSize:#Size, GPR:$Lower, GPR:$Upper, GPR:$Divisor": { "Desc": ["Integer long signed remainder returning lower bits", @@ -1651,7 +1660,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "GPR = LURem OpSize:#Size, GPR:$Lower, GPR:$Upper, GPR:$Divisor": { "Desc": ["Integer long unsigned remainder returning lower bits", @@ -1661,7 +1671,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "Float to GPR": {"Ignore": 1}, @@ -2788,13 +2799,14 @@ "HasSideEffects": true, "X87": true }, - "StoreStackMemory GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": { + "StoreStackMemory PRED:$PredReg, GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": { "Desc": [ "Takes the top value off the x87 stack and stores it to memory.", "SourceSize is 128bit for F80 values, 64-bit for low precision.", "StoreSize is the store size for conversion:", "Float: 80-bit, 64-bit, or 32-bit", - "Int: 64-bit, 32-bit, 16-bit" + "Int: 64-bit, 32-bit, 16-bit", + "If possible, it will use the PredReg for an SVE store." ], "HasSideEffects": true, "X87": true @@ -2834,18 +2846,21 @@ "Adds two stack locations together, storing the result in to the first stack location" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80AddValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Adds a operand value to a stack location. The result stored in to the stack location provided." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Add FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SubStack u8:$DstStack, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ @@ -2853,7 +2868,8 @@ "The result is stored in stack location TOP+$DstStack." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80SubValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ @@ -2861,7 +2877,8 @@ "The result is stored in stack location TOP." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80SubRValue FPR:$X80Src, u8:$SrcStack": { "Desc": [ @@ -2869,7 +2886,8 @@ "The result is stored in stack location TOP." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Sub FPR:$X80Src1, FPR:$X80Src2": { "Desc": [ @@ -2878,25 +2896,29 @@ "`FPR = X80Src2 - X80Src1`" ], "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80MulStack u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ "Multiplies two stack locations together, storing the result in to the first stack location" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80MulValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Multiplies a operand value to a stack location. The result stored in to the stack location provided." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Mul FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80DivStack u8:$DstStack, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ @@ -2905,7 +2927,8 @@ "`FPR|Stack[TOP+DstStack] = Stack[TOP+SrcStack1] / Stack[TOP+SrcStack2]`" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80DivValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ @@ -2914,7 +2937,8 @@ "`FPR|Stack[TOP] = Stack[TOP+SrcStack] / X80Src`" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80DivRValue FPR:$X80Src, u8:$SrcStack": { "Desc": [ @@ -2923,7 +2947,8 @@ "`FPR|Stack[TOP] = X80Src / Stack[TOP+SrcStack]`" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Div FPR:$X80Src1, FPR:$X80Src2": { "Desc": [ @@ -2932,7 +2957,8 @@ "`FPR = X80Src1 / X80Src2`" ], "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80StackXchange u8:$SrcStack": { "Desc": [ @@ -2957,14 +2983,16 @@ ], "HasSideEffects": true, "DestSize": "OpSize::i128Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80PTANStack": { "Desc": [ "Computes the approximate tangent of the source operand in register ST(0), stores the result in ST(0), and pushes a 1.0 onto the FPU register stack." ], "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80ATANStack": { "Desc": [ @@ -2972,51 +3000,63 @@ ], "DestSize": "OpSize::i128Bit", "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80ATAN FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80FPREMStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80FPREM FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80FPREM1Stack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80FPREM1 FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SCALEStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80SCALE FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80CVT OpSize:#Size, FPR:$X80Src": { "DestSize": "Size", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "GPR = F80CVTInt OpSize:#Size, FPR:$X80Src, i1:$Truncate": { "DestSize": "Size", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80CVTTo FPR:$X80Src, OpSize:$SrcSize": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80CVTToInt GPR:$Src, OpSize:$SrcSize": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80RoundStack": { "Desc": [ @@ -3027,62 +3067,76 @@ }, "FPR = F80Round FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80F2XM1Stack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80F2XM1 FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80TAN FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SINStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80SIN FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80COSStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80COS FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SINCOSStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "F80SQRTStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80SQRT FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80XTRACT_EXP FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80XTRACT_SIG FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "GPR = F80StackTest u8:$SrcStack": { "Desc": [ "Does comparison between value in stack at TOP + SrcStack" ], "DestSize": "OpSize::i32Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "GPR = F80CmpStack u8:$SrcStack": { "Desc": [ @@ -3090,7 +3144,8 @@ "Ordering flag result is true if either float input is NaN" ], "DestSize": "OpSize::i32Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "GPR = F80CmpValue FPR:$X80Src": { "Desc": [ @@ -3099,14 +3154,16 @@ ], "DestSize": "OpSize::i32Bit", "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "GPR = F80Cmp FPR:$X80Src1, FPR:$X80Src2": { "Desc": ["Does a scalar unordered compare and stores the flags in to a GPR", "Ordering flag result is true if either float input is NaN" ], "DestSize": "OpSize::i32Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80BCDLoad FPR:$X80Src": { "DestSize": "OpSize::i128Bit", @@ -3124,11 +3181,13 @@ ], "HasSideEffects": true, "DestSize": "OpSize::i128Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80FYL2X FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80VBSLStack OpSize:#RegisterSize, FPR:$VectorMask, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ @@ -3138,7 +3197,8 @@ "Writes the result to the top of the stack." ], "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true } }, "Backend": { diff --git a/FEXCore/Source/Interface/IR/IREmitter.cpp b/FEXCore/Source/Interface/IR/IREmitter.cpp index 0850187b1c..95cb2e73dd 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.cpp +++ b/FEXCore/Source/Interface/IR/IREmitter.cpp @@ -41,6 +41,7 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) { case FPRClass: case GPRFixedClass: case FPRFixedClass: + case PREDClass: case InvalidClass: return Class; default: break; } diff --git a/FEXCore/Source/Interface/IR/IREmitter.h b/FEXCore/Source/Interface/IR/IREmitter.h index 0cfc4027be..c5af4efdd3 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.h +++ b/FEXCore/Source/Interface/IR/IREmitter.h @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MIT #pragma once +#include "CodeEmitter/Emitter.h" #include "Interface/IR/IR.h" #include "Interface/IR/IntrusiveIRList.h" @@ -9,9 +10,9 @@ #include #include +#include #include -#include #include #include @@ -45,6 +46,37 @@ class IREmitter { } void ResetWorkingList(); + // Predicate Cache Implementation + // This lives here rather than OpcodeDispatcher because x87StackOptimization Pass + // also needs it. + struct PredicateKey { + ARMEmitter::PredicatePattern Pattern; + OpSize Size; + bool operator==(const PredicateKey& rhs) const = default; + }; + + struct PredicateKeyHash { + size_t operator()(const PredicateKey& key) const { + return FEXCore::ToUnderlying(key.Pattern) + (FEXCore::ToUnderlying(key.Size) * FEXCore::ToUnderlying(OpSize::iInvalid)); + } + }; + fextl::unordered_map InitPredicateCache; + + Ref InitPredicateCached(OpSize Size, ARMEmitter::PredicatePattern Pattern) { + PredicateKey Key {Pattern, Size}; + auto ValIt = InitPredicateCache.find(Key); + if (ValIt == InitPredicateCache.end()) { + auto Predicate = _InitPredicate(Size, static_cast(FEXCore::ToUnderlying(Pattern))); + InitPredicateCache[Key] = Predicate; + return Predicate; + } + return ValIt->second; + } + + void ResetInitPredicateCache() { + InitPredicateCache.clear(); + } + /** * @name IR allocation routines * diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index 247b54633e..476d9f9eab 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -822,10 +822,11 @@ void X87StackOptimization::Run(IREmitter* Emit) { if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode); } - if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize() - if (Features.SupportsSVE128 || Features.SupportsSVE256) { - auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); - IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode); + if (Op->StoreSize == OpSize::f80Bit) { + Ref PredReg = CurrentIR.GetNode(Op->PredReg); + bool CanUsePredicateStore = (Features.SupportsSVE128 || Features.SupportsSVE256) && PredReg; + if (CanUsePredicateStore) { + IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PredReg, AddrNode); } else { // For X87 extended doubles, split before storing IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);