From ddbf71fe60a00aac4edb778a629209b679dac0f3 Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Fri, 10 Jan 2025 18:16:53 +0100 Subject: [PATCH] Ensure predicate cache is reset when control flow leaves block Whenever the control flow leaves the block, it might clobber the predicate register so we reset the cache whenever that happens. The difficulty here is that the cache is valid only during IR generation so we need to make sure we catch all the cases during this pass where the execution might leave the block. Fixes #4264 --- .../Source/Interface/Core/OpcodeDispatcher.h | 13 +++++++--- .../Interface/Core/OpcodeDispatcher/X87.cpp | 25 ++++++++++++++++++- .../Core/OpcodeDispatcher/X87F64.cpp | 2 +- FEXCore/Source/Interface/IR/IR.json | 5 ++-- .../IR/Passes/x87StackOptimizationPass.cpp | 9 ++++--- 5 files changed, 42 insertions(+), 12 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 9545e87612..5acdb9625f 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -129,36 +129,44 @@ class OpDispatchBuilder final : public IREmitter { IRPair Jump() { FlushRegisterCache(); + ResetInitPredicateCache(); return _Jump(); } IRPair Jump(Ref _TargetBlock) { FlushRegisterCache(); + ResetInitPredicateCache(); return _Jump(_TargetBlock); } IRPair CondJump(Ref _Cmp1, Ref _Cmp2, Ref _TrueBlock, Ref _FalseBlock, CondClassType _Cond = {COND_NEQ}, IR::OpSize _CompareSize = OpSize::iInvalid) { FlushRegisterCache(); + ResetInitPredicateCache(); return _CondJump(_Cmp1, _Cmp2, _TrueBlock, _FalseBlock, _Cond, _CompareSize); } IRPair CondJump(Ref ssa0, CondClassType cond = {COND_NEQ}) { FlushRegisterCache(); + ResetInitPredicateCache(); return _CondJump(ssa0, cond); } IRPair CondJump(Ref ssa0, Ref ssa1, Ref ssa2, CondClassType cond = {COND_NEQ}) { FlushRegisterCache(); + ResetInitPredicateCache(); return _CondJump(ssa0, ssa1, ssa2, cond); } IRPair 
CondJumpNZCV(CondClassType Cond) { FlushRegisterCache(); + ResetInitPredicateCache(); return _CondJump(InvalidNode, InvalidNode, InvalidNode, InvalidNode, Cond, OpSize::iInvalid, true); } IRPair CondJumpBit(Ref Src, unsigned Bit, bool Set) { FlushRegisterCache(); + ResetInitPredicateCache(); auto InlineConst = _InlineConstant(Bit); return _CondJump(Src, InlineConst, InvalidNode, InvalidNode, {Set ? COND_TSTNZ : COND_TSTZ}, OpSize::iInvalid, false); } IRPair ExitFunction(Ref NewRIP) { FlushRegisterCache(); + ResetInitPredicateCache(); return _ExitFunction(NewRIP); } IRPair Break(BreakDefinition Reason) { @@ -167,6 +175,7 @@ class OpDispatchBuilder final : public IREmitter { } IRPair Thunk(Ref ArgPtr, SHA256Sum ThunkNameHash) { FlushRegisterCache(); + ResetInitPredicateCache(); return _Thunk(ArgPtr, ThunkNameHash); } @@ -718,7 +727,6 @@ class OpDispatchBuilder final : public IREmitter { void FNINIT(OpcodeArgs); void X87ModifySTP(OpcodeArgs, bool Inc); - void X87SinCos(OpcodeArgs); void X87FYL2X(OpcodeArgs, bool IsFYL2XP1); void X87LDENV(OpcodeArgs); void X87FLDCW(OpcodeArgs); @@ -764,9 +772,6 @@ class OpDispatchBuilder final : public IREmitter { void FTSTF64(OpcodeArgs); void FRNDINTF64(OpcodeArgs); void FSQRTF64(OpcodeArgs); - void X87UnaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp); - void X87BinaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp); - void X87SinCosF64(OpcodeArgs); void X87FLDCWF64(OpcodeArgs); void X87TANF64(OpcodeArgs); void X87ATANF64(OpcodeArgs); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp index 1470768f8f..e25174c07d 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp @@ -130,7 +130,11 @@ void OpDispatchBuilder::FILD(OpcodeArgs) { void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) { Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); - _StoreStackMemory(Mem, 
OpSize::i128Bit, true, Width); + Ref PredReg = Invalid(); + if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { + PredReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); + } + _StoreStackMemory(PredReg, Mem, OpSize::i128Bit, true, Width); if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); } @@ -164,6 +168,9 @@ void OpDispatchBuilder::FADD(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispa if (Op->Src[0].IsNone()) { // Implicit argument case auto Offset = Op->OP & 7; auto St0 = 0; + if (!ReducedPrecisionMode) { + ResetInitPredicateCache(); + } if (ResInST0 == OpResult::RES_STI) { _F80AddStack(Offset, St0); } else { @@ -194,6 +201,9 @@ void OpDispatchBuilder::FMUL(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispa if (Op->Src[0].IsNone()) { // Implicit argument case auto offset = Op->OP & 7; auto st0 = 0; + if (!ReducedPrecisionMode) { + ResetInitPredicateCache(); + } if (ResInST0 == OpResult::RES_STI) { _F80MulStack(offset, st0); } else { @@ -230,6 +240,9 @@ void OpDispatchBuilder::FDIV(OpcodeArgs, IR::OpSize Width, bool Integer, bool Re const auto St0 = 0; const auto Result = (ResInST0 == OpResult::RES_STI) ? Offset : St0; + if (!ReducedPrecisionMode) { + ResetInitPredicateCache(); + } if (Reverse ^ (ResInST0 == OpResult::RES_STI)) { _F80DivStack(Result, Offset, St0); } else { @@ -271,6 +284,9 @@ void OpDispatchBuilder::FSUB(OpcodeArgs, IR::OpSize Width, bool Integer, bool Re const auto St0 = 0; const auto Result = (ResInST0 == OpResult::RES_STI) ? Offset : St0; + if (!ReducedPrecisionMode) { + ResetInitPredicateCache(); + } if (Reverse ^ (ResInST0 == OpResult::RES_STI)) { _F80SubStack(Result, Offset, St0); } else { @@ -589,6 +605,10 @@ void OpDispatchBuilder::FXCH(OpcodeArgs) { } void OpDispatchBuilder::X87FYL2X(OpcodeArgs, bool IsFYL2XP1) { + if (!ReducedPrecisionMode) { + ResetInitPredicateCache(); + } + if (IsFYL2XP1) { // create an add between top of stack and 1. 
Ref One = ReducedPrecisionMode ? _VCastFromGPR(OpSize::i64Bit, OpSize::i64Bit, _Constant(0x3FF0000000000000)) : @@ -671,6 +691,9 @@ void OpDispatchBuilder::FTST(OpcodeArgs) { void OpDispatchBuilder::X87OpHelper(OpcodeArgs, FEXCore::IR::IROps IROp, bool ZeroC2) { DeriveOp(Result, IROp, _F80SCALEStack()); + if (!ReducedPrecisionMode) { + ResetInitPredicateCache(); + } if (ZeroC2) { SetRFLAG(_Constant(0)); } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp index ca4e91f0b5..7313125185 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp @@ -105,7 +105,7 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) { void OpDispatchBuilder::FSTF64(OpcodeArgs, IR::OpSize Width) { Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); - _StoreStackMemory(Mem, OpSize::i64Bit, true, Width); + _StoreStackMemory(Invalid(), Mem, OpSize::i64Bit, true, Width); if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index 1cb3e38690..5414619689 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -2788,13 +2788,14 @@ "HasSideEffects": true, "X87": true }, - "StoreStackMemory GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": { + "StoreStackMemory PRED:$PredReg, GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": { "Desc": [ "Takes the top value off the x87 stack and stores it to memory.", "SourceSize is 128bit for F80 values, 64-bit for low precision.", "StoreSize is the store size for conversion:", "Float: 80-bit, 64-bit, or 32-bit", - "Int: 64-bit, 32-bit, 16-bit" + "Int: 64-bit, 32-bit, 16-bit", + "If possible, it will use the PredReg for an SVE store." 
], "HasSideEffects": true, "X87": true diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index 247b54633e..476d9f9eab 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -822,10 +822,11 @@ void X87StackOptimization::Run(IREmitter* Emit) { if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode); } - if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize() - if (Features.SupportsSVE128 || Features.SupportsSVE256) { - auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); - IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode); + if (Op->StoreSize == OpSize::f80Bit) { + Ref PredReg = CurrentIR.GetNode(Op->PredReg); + bool CanUsePredicateStore = (Features.SupportsSVE128 || Features.SupportsSVE256) && PredReg; + if (CanUsePredicateStore) { + IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PredReg, AddrNode); } else { // For X87 extended doubles, split before storing IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);