diff --git a/FEXCore/Scripts/json_ir_generator.py b/FEXCore/Scripts/json_ir_generator.py index cda8d9c509..152535bc41 100755 --- a/FEXCore/Scripts/json_ir_generator.py +++ b/FEXCore/Scripts/json_ir_generator.py @@ -55,6 +55,7 @@ class OpDefinition: NonSSAArgNum: int DynamicDispatch: bool LoweredX87: bool + MaybeClobbersPredRegs: bool JITDispatch: bool JITDispatchOverride: str TiedSource: int @@ -79,6 +80,7 @@ def __init__(self): self.NonSSAArgNum = 0 self.DynamicDispatch = False self.LoweredX87 = False + self.MaybeClobbersPredRegs = False self.JITDispatch = True self.JITDispatchOverride = None self.TiedSource = -1 @@ -223,7 +225,7 @@ def parse_ops(ops): (OpArg.Type == "GPR" or OpArg.Type == "GPRPair" or OpArg.Type == "FPR" or - OpArg.Type == "PR")): + OpArg.Type == "PRED")): OpDef.EmitValidation.append(f"GetOpRegClass({ArgName}) == InvalidClass || WalkFindRegClass({ArgName}) == {OpArg.Type}Class") OpArg.Name = ArgName @@ -277,6 +279,10 @@ def parse_ops(ops): assert("JITDispatch" not in op_val) OpDef.JITDispatch = False + if "MaybeClobbersPredRegs" in op_val: + OpDef.MaybeClobbersPredRegs = op_val["MaybeClobbersPredRegs"] + # TODO: Does this imply !JITDispatch? + if "TiedSource" in op_val: OpDef.TiedSource = op_val["TiedSource"] @@ -506,6 +512,7 @@ def print_ir_hassideeffects(): ("HasSideEffects", "bool"), ("ImplicitFlagClobber", "bool"), ("LoweredX87", "bool"), + ("MaybeClobbersPredRegs", "bool"), ("TiedSource", "int8_t"), ]: output_file.write( @@ -707,6 +714,9 @@ def print_ir_allocator_helpers(): "\t\tif(MMXState == MMXState_MMX) ChgStateMMX_X87();\n" ) + if op.MaybeClobbersPredRegs: + output_file.write("\t\tResetInitPredicateCache();\n") + output_file.write("\t\tauto _Op = AllocateOp();\n".format(op.Name, op.Name.upper())) if op.SSAArgNum != 0: diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 6dd2d8b108..f729823364 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -4314,7 +4314,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T Ref MemSrc = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { // Using SVE we can load this with a single instruction. - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc); } else { // For X87 extended doubles, Split the load. @@ -4448,7 +4448,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl if (OpSize == OpSize::f80Bit) { Ref MemStoreDst = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); _StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst); } else { // For X87 extended doubles, split before storing diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 9545e87612..fe0c4b7b83 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -119,6 +119,7 @@ class OpDispatchBuilder final : public IREmitter { CachedNZCV = nullptr; CFInverted = CFInvertedABI; FlushRegisterCache(); + ResetInitPredicateCache(); // New block needs to reset segment telemetry. SegmentsNeedReadCheck = ~0U; @@ -718,7 +719,6 @@ class OpDispatchBuilder final : public IREmitter { void FNINIT(OpcodeArgs); void X87ModifySTP(OpcodeArgs, bool Inc); - void X87SinCos(OpcodeArgs); void X87FYL2X(OpcodeArgs, bool IsFYL2XP1); void X87LDENV(OpcodeArgs); void X87FLDCW(OpcodeArgs); @@ -764,9 +764,6 @@ class OpDispatchBuilder final : public IREmitter { void FTSTF64(OpcodeArgs); void FRNDINTF64(OpcodeArgs); void FSQRTF64(OpcodeArgs); - void X87UnaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp); - void X87BinaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp); - void X87SinCosF64(OpcodeArgs); void X87FLDCWF64(OpcodeArgs); void X87TANF64(OpcodeArgs); void X87ATANF64(OpcodeArgs); @@ -1175,6 +1172,7 @@ class OpDispatchBuilder final : public IREmitter { } void FlushRegisterCache(bool SRAOnly = false) { + // At block boundaries, fix up the carry flag. if (!SRAOnly) { RectifyCarryInvert(CFInvertedABI); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp index 1470768f8f..3eed79f50d 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp @@ -130,7 +130,11 @@ void OpDispatchBuilder::FILD(OpcodeArgs) { void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) { Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); - _StoreStackMemory(Mem, OpSize::i128Bit, true, Width); + Ref PredReg = Invalid(); + if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { + PredReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); + } + _StoreStackMemory(PredReg, Mem, OpSize::i128Bit, true, Width); if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); } @@ -267,9 +271,9 @@ void OpDispatchBuilder::FDIV(OpcodeArgs, IR::OpSize Width, bool Integer, bool Re void OpDispatchBuilder::FSUB(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpDispatchBuilder::OpResult ResInST0) { if (Op->Src[0].IsNone()) { - const auto Offset = Op->OP & 7; - const auto St0 = 0; - const auto Result = (ResInST0 == OpResult::RES_STI) ? Offset : St0; + const uint8_t Offset = Op->OP & 7; + const uint8_t St0 = 0; + const uint8_t Result = (ResInST0 == OpResult::RES_STI) ? Offset : St0; if (Reverse ^ (ResInST0 == OpResult::RES_STI)) { _F80SubStack(Result, Offset, St0); @@ -751,13 +755,11 @@ void OpDispatchBuilder::FNINIT(OpcodeArgs) { } void OpDispatchBuilder::X87FFREE(OpcodeArgs) { - _InvalidateStack(Op->OP & 7); } void OpDispatchBuilder::X87EMMS(OpcodeArgs) { // Tags all get set to 0b11 - _InvalidateStack(0xff); } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp index ca4e91f0b5..7313125185 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp @@ -105,7 +105,7 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) { void OpDispatchBuilder::FSTF64(OpcodeArgs, IR::OpSize Width) { Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); - _StoreStackMemory(Mem, OpSize::i64Bit, true, Width); + _StoreStackMemory(Invalid(), Mem, OpSize::i64Bit, true, Width); if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index 1cb3e38690..0d63d614bb 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -211,7 +211,8 @@ }, "ThreadRemoveCodeEntry": { - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "GPR = ProcessorID": { @@ -248,7 +249,8 @@ "Print SSA:$Value": { "HasSideEffects": true, "Desc": ["Debug operation that prints an SSA value to the console", - "May only print 64bits of the value"] + "May only print 64bits of the value"], + "MaybeClobbersPredRegs": true }, "GPR = AllocateGPR i1:$ForPair": { "Desc": ["Silly pseudo-instruction to allocate a register for a future destination", @@ -312,7 +314,8 @@ "HasSideEffects": true, "Desc": ["Dispatches a guest syscall through to the SyscallHandler class" ], - "DestSize": "OpSize::i64Bit" + "DestSize": "OpSize::i64Bit", + "MaybeClobbersPredRegs": true }, "GPR = InlineSyscall GPR:$Arg0, GPR:$Arg1, GPR:$Arg2, GPR:$Arg3, GPR:$Arg4, GPR:$Arg5, i32:$HostSyscallNumber, SyscallFlags:$Flags": { @@ -328,18 +331,21 @@ }, "Thunk GPR:$ArgPtr, SHA256Sum:$ThunkNameHash": { - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "GPR:$EAX, GPR:$EBX, GPR:$ECX, GPR:$EDX = CPUID GPR:$Function, GPR:$Leaf": { "Desc": ["Calls in to the CPUID handler function to return emulated CPUID"], "DestSize": "OpSize::i32Bit", - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "GPR:$EAX, GPR:$EDX = XGetBV GPR:$Function": { "Desc": ["Calls in to the XCR handler function to return emulated XCR"], "DestSize": "OpSize::i32Bit", - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true } }, "Moves": { @@ -578,6 +584,7 @@ "HasSideEffects": true, "ElementSize": "ElementSize" }, + "FPR = LoadMemPredicate OpSize:#RegisterSize, OpSize:#ElementSize, PRED:$Mask, GPR:$Addr": { "Desc": [ "Loads a value to memory using SVE predicate mask." ], "DestSize": "RegisterSize", @@ -1631,7 +1638,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "GPR = LUDiv OpSize:#Size, GPR:$Lower, GPR:$Upper, GPR:$Divisor": { "Desc": ["Integer long unsigned division returning lower bits", @@ -1641,7 +1649,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "GPR = LRem OpSize:#Size, GPR:$Lower, GPR:$Upper, GPR:$Divisor": { "Desc": ["Integer long signed remainder returning lower bits", @@ -1651,7 +1660,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "GPR = LURem OpSize:#Size, GPR:$Lower, GPR:$Upper, GPR:$Divisor": { "Desc": ["Integer long unsigned remainder returning lower bits", @@ -1661,7 +1671,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "Float to GPR": {"Ignore": 1}, @@ -2788,13 +2799,14 @@ "HasSideEffects": true, "X87": true }, - "StoreStackMemory GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": { + "StoreStackMemory PRED:$PredReg, GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": { "Desc": [ "Takes the top value off the x87 stack and stores it to memory.", "SourceSize is 128bit for F80 values, 64-bit for low precision.", "StoreSize is the store size for conversion:", "Float: 80-bit, 64-bit, or 32-bit", - "Int: 64-bit, 32-bit, 16-bit" + "Int: 64-bit, 32-bit, 16-bit", + "If possible, it will use the PredReg for an SVE store." ], "HasSideEffects": true, "X87": true @@ -2834,18 +2846,21 @@ "Adds two stack locations together, storing the result in to the first stack location" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80AddValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Adds a operand value to a stack location. The result stored in to the stack location provided." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Add FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SubStack u8:$DstStack, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ @@ -2853,7 +2868,8 @@ "The result is stored in stack location TOP+$DstStack." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80SubValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ @@ -2861,7 +2877,8 @@ "The result is stored in stack location TOP." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80SubRValue FPR:$X80Src, u8:$SrcStack": { "Desc": [ @@ -2869,7 +2886,8 @@ "The result is stored in stack location TOP." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Sub FPR:$X80Src1, FPR:$X80Src2": { "Desc": [ @@ -2878,25 +2896,29 @@ "`FPR = X80Src2 - X80Src1`" ], "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80MulStack u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ "Multiplies two stack locations together, storing the result in to the first stack location" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80MulValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Multiplies a operand value to a stack location. The result stored in to the stack location provided." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Mul FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80DivStack u8:$DstStack, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ @@ -2905,7 +2927,8 @@ "`FPR|Stack[TOP+DstStack] = Stack[TOP+SrcStack1] / Stack[TOP+SrcStack2]`" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80DivValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ @@ -2914,7 +2937,8 @@ "`FPR|Stack[TOP] = Stack[TOP+SrcStack] / X80Src`" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80DivRValue FPR:$X80Src, u8:$SrcStack": { "Desc": [ @@ -2923,7 +2947,8 @@ "`FPR|Stack[TOP] = X80Src / Stack[TOP+SrcStack]`" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Div FPR:$X80Src1, FPR:$X80Src2": { "Desc": [ @@ -2932,7 +2957,8 @@ "`FPR = X80Src1 / X80Src2`" ], "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80StackXchange u8:$SrcStack": { "Desc": [ @@ -2957,14 +2983,16 @@ ], "HasSideEffects": true, "DestSize": "OpSize::i128Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80PTANStack": { "Desc": [ "Computes the approximate tangent of the source operand in register ST(0), stores the result in ST(0), and pushes a 1.0 onto the FPU register stack." ], "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80ATANStack": { "Desc": [ @@ -2972,51 +3000,63 @@ ], "DestSize": "OpSize::i128Bit", "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80ATAN FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80FPREMStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80FPREM FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80FPREM1Stack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80FPREM1 FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SCALEStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80SCALE FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80CVT OpSize:#Size, FPR:$X80Src": { "DestSize": "Size", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "GPR = F80CVTInt OpSize:#Size, FPR:$X80Src, i1:$Truncate": { "DestSize": "Size", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80CVTTo FPR:$X80Src, OpSize:$SrcSize": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80CVTToInt GPR:$Src, OpSize:$SrcSize": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80RoundStack": { "Desc": [ @@ -3027,62 +3067,76 @@ }, "FPR = F80Round FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80F2XM1Stack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80F2XM1 FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80TAN FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SINStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80SIN FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80COSStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80COS FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SINCOSStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "F80SQRTStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80SQRT FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80XTRACT_EXP FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80XTRACT_SIG FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "GPR = F80StackTest u8:$SrcStack": { "Desc": [ "Does comparison between value in stack at TOP + SrcStack" ], "DestSize": "OpSize::i32Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "GPR = F80CmpStack u8:$SrcStack": { "Desc": [ @@ -3090,7 +3144,8 @@ "Ordering flag result is true if either float input is NaN" ], "DestSize": "OpSize::i32Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "GPR = F80CmpValue FPR:$X80Src": { "Desc": [ @@ -3099,14 +3154,16 @@ ], "DestSize": "OpSize::i32Bit", "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "GPR = F80Cmp FPR:$X80Src1, FPR:$X80Src2": { "Desc": ["Does a scalar unordered compare and stores the flags in to a GPR", "Ordering flag result is true if either float input is NaN" ], "DestSize": "OpSize::i32Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80BCDLoad FPR:$X80Src": { "DestSize": "OpSize::i128Bit", @@ -3124,11 +3181,13 @@ ], "HasSideEffects": true, "DestSize": "OpSize::i128Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80FYL2X FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80VBSLStack OpSize:#RegisterSize, FPR:$VectorMask, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ @@ -3138,7 +3197,8 @@ "Writes the result to the top of the stack." ], "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true } }, "Backend": { diff --git a/FEXCore/Source/Interface/IR/IREmitter.cpp b/FEXCore/Source/Interface/IR/IREmitter.cpp index 0850187b1c..95cb2e73dd 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.cpp +++ b/FEXCore/Source/Interface/IR/IREmitter.cpp @@ -41,6 +41,7 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) { case FPRClass: case GPRFixedClass: case FPRFixedClass: + case PREDClass: case InvalidClass: return Class; default: break; } diff --git a/FEXCore/Source/Interface/IR/IREmitter.h b/FEXCore/Source/Interface/IR/IREmitter.h index 0cfc4027be..c5af4efdd3 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.h +++ b/FEXCore/Source/Interface/IR/IREmitter.h @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MIT #pragma once +#include "CodeEmitter/Emitter.h" #include "Interface/IR/IR.h" #include "Interface/IR/IntrusiveIRList.h" @@ -9,9 +10,9 @@ #include #include +#include #include -#include #include #include @@ -45,6 +46,37 @@ class IREmitter { } void ResetWorkingList(); + // Predicate Cache Implementation + // This lives here rather than OpcodeDispatcher because x87StackOptimization Pass + // also needs it. + struct PredicateKey { + ARMEmitter::PredicatePattern Pattern; + OpSize Size; + bool operator==(const PredicateKey& rhs) const = default; + }; + + struct PredicateKeyHash { + size_t operator()(const PredicateKey& key) const { + return FEXCore::ToUnderlying(key.Pattern) + (FEXCore::ToUnderlying(key.Size) * FEXCore::ToUnderlying(OpSize::iInvalid)); + } + }; + fextl::unordered_map InitPredicateCache; + + Ref InitPredicateCached(OpSize Size, ARMEmitter::PredicatePattern Pattern) { + PredicateKey Key {Pattern, Size}; + auto ValIt = InitPredicateCache.find(Key); + if (ValIt == InitPredicateCache.end()) { + auto Predicate = _InitPredicate(Size, static_cast(FEXCore::ToUnderlying(Pattern))); + InitPredicateCache[Key] = Predicate; + return Predicate; + } + return ValIt->second; + } + + void ResetInitPredicateCache() { + InitPredicateCache.clear(); + } + /** * @name IR allocation routines * diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index 247b54633e..476d9f9eab 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -822,10 +822,11 @@ void X87StackOptimization::Run(IREmitter* Emit) { if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode); } - if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize() - if (Features.SupportsSVE128 || Features.SupportsSVE256) { - auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); - IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode); + if (Op->StoreSize == OpSize::f80Bit) { + Ref PredReg = CurrentIR.GetNode(Op->PredReg); + bool CanUsePredicateStore = (Features.SupportsSVE128 || Features.SupportsSVE256) && PredReg; + if (CanUsePredicateStore) { + IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PredReg, AddrNode); } else { // For X87 extended doubles, split before storing IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);