From ce49434beda1561fc0e56b8ba26c6f6bffb45d2e Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Wed, 22 Jan 2025 10:04:25 +0100 Subject: [PATCH] Ensure predicate cache is reset when control flow leaves block Whenever the control flow leaves the block, it might clobber the predicate register so we reset the cache whenever that happens. The difficulty here is that the cache is valid only during IR generation so we need to make sure we catch all the cases during this pass where the execution might leave the block. Fixes #4264 --- .../Core/ArchHelpers/Arm64Emitter.cpp | 18 ++++++++++++------ .../Interface/Core/ArchHelpers/Arm64Emitter.h | 9 +++++++++ FEXCore/Source/Interface/Core/JIT/JIT.cpp | 2 ++ .../Source/Interface/Core/JIT/MemoryOps.cpp | 6 +++--- .../Source/Interface/Core/OpcodeDispatcher.cpp | 4 ++-- .../Source/Interface/Core/OpcodeDispatcher.h | 4 ---- FEXCore/Source/Interface/IR/IR.json | 4 ++-- FEXCore/Source/Interface/IR/IREmitter.cpp | 1 + FEXCore/Source/Interface/IR/IREmitter.h | 2 ++ .../IR/Passes/RegisterAllocationPass.cpp | 5 ++++- .../IR/Passes/x87StackOptimizationPass.cpp | 8 ++++---- 11 files changed, 41 insertions(+), 22 deletions(-) diff --git a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp index d54a467e3f..298d80ec10 100644 --- a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp +++ b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp @@ -60,8 +60,8 @@ namespace x64 { // p6 and p7 registers are used as temporaries no not added here for RA // See PREF_TMP_16B and PREF_TMP_32B // p0-p1 are also used in the jit as temps. - // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5. - constexpr std::array PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + // Also p8-p15 cannot be used can only encode p0-p7, p2 is a static register, so we're left with p3-p5. 
+ constexpr std::array PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; constexpr unsigned RAPairs = 6; @@ -82,7 +82,7 @@ namespace x64 { ARMEmitter::VReg::v12, ARMEmitter::VReg::v13, ARMEmitter::VReg::v14, ARMEmitter::VReg::v15, }; #else - constexpr std::array SRA = { + constexpr std::array SRA = { ARMEmitter::Reg::r8, ARMEmitter::Reg::r0, ARMEmitter::Reg::r1, @@ -100,6 +100,8 @@ namespace x64 { ARMEmitter::Reg::r20, ARMEmitter::Reg::r21, ARMEmitter::Reg::r22, + // Predicate register + ARMEmitter::PR::p2, REG_PF, REG_AF, }; @@ -112,8 +114,8 @@ namespace x64 { // p6 and p7 registers are used as temporaries no not added here for RA // See PREF_TMP_16B and PREF_TMP_32B // p0-p1 are also used in the jit as temps. - // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5. - constexpr std::array PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + // Also p8-p15 cannot be used can only encode p0-p7, p2 is a static register, so we're left with p3-p5. + constexpr std::array PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; constexpr unsigned RAPairs = 6; @@ -250,7 +252,7 @@ namespace x32 { // See PREF_TMP_16B and PREF_TMP_32B // p0-p1 are also used in the jit as temps. // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5. 
- constexpr std::array PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + constexpr std::array PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; // All are caller saved constexpr std::array SRAFPR = { @@ -822,6 +824,10 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF } } + if (FillP2) { + ptrue(ARMEmitter::SubRegSize::i16Bit, SVE_OPT_PRED, ARMEmitter::PredicatePattern::SVE_VL5); + } + // PF/AF are special, remove them from the mask uint32_t PFAFMask = ((1u << REG_PF.Idx()) | ((1u << REG_AF.Idx()))); uint32_t PFAFFillMask = GPRFillMask & PFAFMask; diff --git a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h index 44266593da..bfbacca473 100644 --- a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h +++ b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h @@ -48,6 +48,10 @@ constexpr auto REG_AF = ARMEmitter::Reg::r27; // Vector temporaries constexpr auto VTMP1 = ARMEmitter::VReg::v0; constexpr auto VTMP2 = ARMEmitter::VReg::v1; + +// Predicate register for X87 SVE Optimization +constexpr auto SVE_OPT_PRED = ARMEmitter::PReg::p2; + #else constexpr auto TMP1 = ARMEmitter::XReg::x10; constexpr auto TMP2 = ARMEmitter::XReg::x11; @@ -67,6 +71,9 @@ constexpr auto VTMP2 = ARMEmitter::VReg::v17; constexpr auto EC_CALL_CHECKER_PC_REG = ARMEmitter::XReg::x9; constexpr auto EC_ENTRY_CPUAREA_REG = ARMEmitter::XReg::x17; +// Predicate register for X87 SVE Optimization +constexpr auto SVE_OPT_PRED = ARMEmitter::PReg::p2; + // These structures are not included in the standard Windows headers, define the offsets of members we care about for EC here. 
constexpr size_t TEB_CPU_AREA_OFFSET = 0x1788; constexpr size_t TEB_PEB_OFFSET = 0x60; @@ -107,6 +114,8 @@ class Arm64Emitter : public ARMEmitter::Emitter { void LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, uint64_t Constant, bool NOPPad = false); void FillSpecialRegs(ARMEmitter::Register TmpReg, ARMEmitter::Register TmpReg2, bool SetFIZ, bool SetPredRegs); + // Do we need to fill register p2 for the SVE X87 Store Optimization + bool FillP2 = false; // Correlate an ARM register back to an x86 register index. // Returning REG_INVALID if there was no mapping. diff --git a/FEXCore/Source/Interface/Core/JIT/JIT.cpp b/FEXCore/Source/Interface/Core/JIT/JIT.cpp index e9de9e36b8..17496d67ca 100644 --- a/FEXCore/Source/Interface/Core/JIT/JIT.cpp +++ b/FEXCore/Source/Interface/Core/JIT/JIT.cpp @@ -734,6 +734,8 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size this->DebugData = DebugData; this->IR = IR; + FillP2 = false; // Reset for each block + // Fairly excessive buffer range to make sure we don't overflow uint32_t BufferRange = SSACount * 16; if ((GetCursorOffset() + BufferRange) > CurrentCodeBuffer->Size) { diff --git a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp index f56c6bf888..db2a400462 100644 --- a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp @@ -1590,10 +1590,10 @@ DEF_OP(StoreMem) { } } -DEF_OP(InitPredicate) { - const auto Op = IROp->C(); +DEF_OP(LoadSVEOptPredicate) { + FillP2 = true; const auto OpSize = IROp->Size; - ptrue(ConvertSubRegSize16(OpSize), GetPReg(Node), static_cast(Op->Pattern)); + ptrue(ConvertSubRegSize16(OpSize), GetPReg(Node), ARMEmitter::PredicatePattern::SVE_VL5); } DEF_OP(StoreMemPredicate) { diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index db1b5e38d0..5f5639491b 100644 --- 
a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -4314,7 +4314,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T Ref MemSrc = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { // Using SVE we can load this with a single instruction. - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = _LoadSVEOptPredicate(OpSize::i16Bit); return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc); } else { // For X87 extended doubles, Split the load. @@ -4448,7 +4448,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl if (OpSize == OpSize::f80Bit) { Ref MemStoreDst = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = _LoadSVEOptPredicate(OpSize::i16Bit); _StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst); } else { // For X87 extended doubles, split before storing diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 4fd2591cea..5e43b3dd71 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -718,7 +718,6 @@ class OpDispatchBuilder final : public IREmitter { void FNINIT(OpcodeArgs); void X87ModifySTP(OpcodeArgs, bool Inc); - void X87SinCos(OpcodeArgs); void X87FYL2X(OpcodeArgs, bool IsFYL2XP1); void X87LDENV(OpcodeArgs); void X87FLDCW(OpcodeArgs); @@ -764,9 +763,6 @@ class OpDispatchBuilder final : public IREmitter { void FTSTF64(OpcodeArgs); void FRNDINTF64(OpcodeArgs); void FSQRTF64(OpcodeArgs); - void X87UnaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp); - void 
X87BinaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp); - void X87SinCosF64(OpcodeArgs); void X87FLDCWF64(OpcodeArgs); void X87TANF64(OpcodeArgs); void X87ATANF64(OpcodeArgs); diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index c2eff80377..14a978003e 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -567,8 +567,8 @@ ] }, - "PRED = InitPredicate OpSize:#Size, u8:$Pattern": { - "Desc": ["Initialize predicate register from Pattern"], + "PRED = LoadSVEOptPredicate OpSize:#Size": { + "Desc": ["Load the predicate register for the X87 SVE optimization with the necessary pattern (VL5)."], "DestSize": "Size" }, diff --git a/FEXCore/Source/Interface/IR/IREmitter.cpp b/FEXCore/Source/Interface/IR/IREmitter.cpp index 61d7d4bcb6..a65c58ea69 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.cpp +++ b/FEXCore/Source/Interface/IR/IREmitter.cpp @@ -41,6 +41,7 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) { case FPRClass: case GPRFixedClass: case FPRFixedClass: + case PREDClass: case InvalidClass: return Class; default: break; } diff --git a/FEXCore/Source/Interface/IR/IREmitter.h b/FEXCore/Source/Interface/IR/IREmitter.h index 3df70d0d68..fb24e94018 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.h +++ b/FEXCore/Source/Interface/IR/IREmitter.h @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MIT #pragma once +#include "CodeEmitter/Emitter.h" #include "Interface/IR/IR.h" #include "Interface/IR/IntrusiveIRList.h" @@ -9,6 +10,7 @@ #include #include +#include #include #include diff --git a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp index b5945d7599..366c003c5a 100644 --- a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp @@ -235,7 +235,7 @@ class ConstrainedRAPass final : public RegisterAllocationPass { }; Ref 
DecodeSRANode(const IROp_Header* IROp, Ref Node) { - if (IROp->Op == OP_LOADREGISTER || IROp->Op == OP_LOADPF || IROp->Op == OP_LOADAF) { + if (IROp->Op == OP_LOADREGISTER || IROp->Op == OP_LOADPF || IROp->Op == OP_LOADAF || IROp->Op == OP_LOADSVEOPTPREDICATE) { return Node; } else if (IROp->Op == OP_STOREREGISTER) { const IROp_StoreRegister* Op = IROp->C(); @@ -269,6 +269,9 @@ class ConstrainedRAPass final : public RegisterAllocationPass { return PhysicalRegister {GPRFixedClass, FlagOffset}; } else if (IROp->Op == OP_LOADAF || IROp->Op == OP_STOREAF) { return PhysicalRegister {GPRFixedClass, (uint8_t)(FlagOffset + 1)}; + } else if (IROp->Op == OP_LOADSVEOPTPREDICATE) { + // We use p2 for the X87 SVE Store optimization. + return PhysicalRegister {PREDClass, 2}; } LOGMAN_THROW_A_FMT(Class == GPRClass || Class == FPRClass, "SRA classes"); diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index 98c43f1e3b..0e9f5869f3 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -6,7 +6,7 @@ #include "FEXCore/IR/IR.h" #include "FEXCore/Utils/Profiler.h" #include "FEXCore/Core/HostFeatures.h" -#include "CodeEmitter/Emitter.h" +#include "Interface/Core/ArchHelpers/Arm64Emitter.h" #include #include @@ -838,13 +838,13 @@ void X87StackOptimization::Run(IREmitter* Emit) { if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode); } - if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize() + if (Op->StoreSize == OpSize::f80Bit) { if (Features.SupportsSVE128 || Features.SupportsSVE256) { - auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PredReg = IREmit->_LoadSVEOptPredicate(OpSize::i16Bit); if (!IsZero(Offset)) { AddrNode = 
IREmit->_Add(OpSize::i64Bit, AddrNode, Offset); } - IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode); + IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PredReg, AddrNode); } else { // For X87 extended doubles, split before storing IREmit->_StoreMem(FPRClass, OpSize::i64Bit, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);