Merge pull request FEX-Emu#3883 from Sonicadvance1/implement_daz

Arm64: Implements support for DAZ using AFP.FIZ
pmatos · Jul 21, 2024 · f8c6baa · f8c6baa
2 parents 5c9bb65 + 4fffe68
commit f8c6baa
Show file tree

Hide file tree

Showing 16 changed files with 180 additions and 113 deletions.
diff --git a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp
@@ -575,7 +575,7 @@ void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs, uint3
   if (EmitterCTX->HostFeatures.SupportsAFP) {
     // Disable AFP features when spilling registers.
     //
-    // Disable FPCR.NEP and FPCR.AH
+    // Disable FPCR.NEP, FPCR.AH, and FPCR.FIZ
     // NEP(2): Changes ASIMD scalar instructions to insert in to the lower bits of the destination.
     // AH(1):  Changes NaN behaviour in some instructions. Specifically fmin, fmax.
     //         Also interacts with RPRES to change reciprocal/rsqrt precision from 8-bit mantissa to 12-bit.
@@ -585,7 +585,8 @@ void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs, uint3
     mrs(TmpReg, ARMEmitter::SystemRegister::FPCR);
     bic(ARMEmitter::Size::i64Bit, TmpReg, TmpReg,
         (1U << 2) |   // NEP
-          (1U << 1)); // AH
+          (1U << 1) | // AH
+          (1U << 0)); // FIZ
     msr(ARMEmitter::SystemRegister::FPCR, TmpReg);
   }
 #endif
@@ -663,19 +664,32 @@ void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs, uint3
   }
 }
 
-void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRFillMask) {
-  ARMEmitter::Register TmpReg = ARMEmitter::Reg::r0;
-  LOGMAN_THROW_A_FMT(GPRFillMask != 0, "Must fill at least 1 GPR for a temp");
-  [[maybe_unused]] bool FoundRegister {};
-  for (auto Reg : StaticRegisters) {
-    if (((1U << Reg.Idx()) & GPRFillMask)) {
-      TmpReg = Reg;
-      FoundRegister = true;
-      break;
+void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRFillMask, std::optional<ARMEmitter::Register> OptionalReg,
+                                  std::optional<ARMEmitter::Register> OptionalReg2) {
+  auto FindTempReg = [this](uint32_t* GPRFillMask) -> std::optional<ARMEmitter::Register> {
+    for (auto Reg : StaticRegisters) {
+      if (((1U << Reg.Idx()) & *GPRFillMask)) {
+        *GPRFillMask &= ~(1U << Reg.Idx());
+        return std::make_optional(Reg);
+      }
     }
+    return std::nullopt;
+  };
+
+  LOGMAN_THROW_A_FMT(GPRFillMask != 0, "Must fill at least 2 GPRs for a temp");
+  uint32_t TempGPRFillMask = GPRFillMask;
+  if (!OptionalReg.has_value()) {
+    OptionalReg = FindTempReg(&TempGPRFillMask);
   }
 
-  LOGMAN_THROW_A_FMT(FoundRegister, "Didn't have an SRA register to use as a temporary while spilling!");
+  if (!OptionalReg2.has_value()) {
+    OptionalReg2 = FindTempReg(&TempGPRFillMask);
+  }
+  LOGMAN_THROW_A_FMT(OptionalReg.has_value() && OptionalReg2.has_value(), "Didn't have an SRA register to use as a temporary while "
+                                                                          "spilling!");
+
+  auto TmpReg = *OptionalReg;
+  [[maybe_unused]] auto TmpReg2 = *OptionalReg2;
 
 #ifndef VIXL_SIMULATOR
   if (EmitterCTX->HostFeatures.SupportsAFP) {
@@ -692,6 +706,11 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF
     orr(ARMEmitter::Size::i64Bit, TmpReg, TmpReg,
         (1U << 2) |   // NEP
           (1U << 1)); // AH
+
+    // Insert MXCSR.DAZ in to FIZ
+    ldr(TmpReg2.W(), STATE.R(), offsetof(FEXCore::Core::CPUState, mxcsr));
+    bfxil(ARMEmitter::Size::i64Bit, TmpReg, TmpReg2, 6, 1);
+
     msr(ARMEmitter::SystemRegister::FPCR, TmpReg);
   }
 #endif

diff --git a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h
@@ -102,7 +102,9 @@ class Arm64Emitter : public ARMEmitter::Emitter {
   //       and FPRs are being spilled or filled. If only GPRs are spilled/filled, then
   //       TMP4 is left alone.
   void SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs = true, uint32_t GPRSpillMask = ~0U, uint32_t FPRSpillMask = ~0U);
-  void FillStaticRegs(bool FPRs = true, uint32_t GPRFillMask = ~0U, uint32_t FPRFillMask = ~0U);
+  void FillStaticRegs(bool FPRs = true, uint32_t GPRFillMask = ~0U, uint32_t FPRFillMask = ~0U,
+                      std::optional<ARMEmitter::Register> OptionalReg = std::nullopt,
+                      std::optional<ARMEmitter::Register> OptionalReg2 = std::nullopt);
 
   // Register 0-18 + 29 + 30 are caller saved
   static constexpr uint32_t CALLER_GPR_MASK = 0b0110'0000'0000'0111'1111'1111'1111'1111U;

diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/BranchOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/BranchOps.cpp
@@ -184,7 +184,7 @@ DEF_OP(Syscall) {
   if ((Flags & FEXCore::IR::SyscallFlags::NORETURN) != FEXCore::IR::SyscallFlags::NORETURN) {
     // Result is now in x0
     // Fix the stack and any values that were stepped on
-    FillStaticRegs(true, GPRSpillMask, FPRSpillMask);
+    FillStaticRegs(true, GPRSpillMask, FPRSpillMask, ARMEmitter::Reg::r1, ARMEmitter::Reg::r2);
 
     // Now the registers we've spilled are back in their original host registers
     // We can safely claim we are no longer in a syscall
@@ -285,7 +285,7 @@ DEF_OP(InlineSyscall) {
   if ((Op->Flags & FEXCore::IR::SyscallFlags::NORETURN) != FEXCore::IR::SyscallFlags::NORETURN) {
     // Now that we are done in the syscall we need to carefully peel back the state
     // First unspill the registers from before
-    FillStaticRegs(false, SpillMask);
+    FillStaticRegs(false, SpillMask, ~0U, ARMEmitter::Reg::r8, ARMEmitter::Reg::r1);
 
     // Now the registers we've spilled are back in their original host registers
     // We can safely claim we are no longer in a syscall

diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/MiscOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/MiscOps.cpp
@@ -98,6 +98,7 @@ DEF_OP(GetRoundingMode) {
 DEF_OP(SetRoundingMode) {
   auto Op = IROp->C<IR::IROp_SetRoundingMode>();
   auto Src = GetReg(Op->RoundMode.ID());
+  auto MXCSR = GetReg(Op->MXCSR.ID());
 
   // As above, setup the rounding flags in [31:30]
   rbit(ARMEmitter::Size::i32Bit, TMP2, Src);
@@ -116,6 +117,11 @@ DEF_OP(SetRoundingMode) {
   lsr(ARMEmitter::Size::i64Bit, TMP2, Src, 2);
   bfi(ARMEmitter::Size::i64Bit, TMP1, TMP2, 24, 1);
 
+  if (Op->SetDAZ && HostSupportsAFP) {
+    // Extract DAZ from MXCSR and insert it in to FPCR.FIZ
+    bfxil(ARMEmitter::Size::i64Bit, TMP1, MXCSR, 6, 1);
+  }
+
   // Now save the new FPCR
   msr(ARMEmitter::SystemRegister::FPCR, TMP1);
 }
@@ -227,7 +233,7 @@ DEF_OP(ProcessorID) {
 
   // Now that we are done in the syscall we need to carefully peel back the state
   // First unspill the registers from before
-  FillStaticRegs(false, SpillMask);
+  FillStaticRegs(false, SpillMask, ~0U, ARMEmitter::Reg::r8, ARMEmitter::Reg::r2);
 
   // Now the registers we've spilled are back in their original host registers
   // We can safely claim we are no longer in a syscall

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
@@ -2774,10 +2774,11 @@ void OpDispatchBuilder::SaveAVXState(Ref MemBase) {
 }
 
 Ref OpDispatchBuilder::GetMXCSR() {
-  // Default MXCSR Value
-  Ref MXCSR = _Constant(0x1F80);
-  Ref RoundingMode = _GetRoundingMode();
-  return _Bfi(OpSize::i32Bit, 3, 13, MXCSR, RoundingMode);
+  Ref MXCSR = _LoadContext(OpSize::i32Bit, GPRClass, offsetof(FEXCore::Core::CPUState, mxcsr));
+  // Mask out unsupported bits
+  // Keeps FZ, RC, exception masks, and DAZ
+  MXCSR = _And(OpSize::i32Bit, MXCSR, _Constant(0xFFC0));
+  return MXCSR;
 }
 
 void OpDispatchBuilder::FXRStoreOp(OpcodeArgs) {
@@ -2886,9 +2887,13 @@ void OpDispatchBuilder::RestoreSSEState(Ref MemBase) {
 }
 
 void OpDispatchBuilder::RestoreMXCSRState(Ref MXCSR) {
+  // Mask out unsupported bits
+  MXCSR = _And(OpSize::i32Bit, MXCSR, _Constant(0xFFC0));
+
+  _StoreContext(OpSize::i32Bit, GPRClass, MXCSR, offsetof(FEXCore::Core::CPUState, mxcsr));
   // We only support the rounding mode and FTZ bit being set
   Ref RoundingMode = _Bfe(OpSize::i32Bit, 3, 13, MXCSR);
-  _SetRoundingMode(RoundingMode);
+  _SetRoundingMode(RoundingMode, true, MXCSR);
 }
 
 void OpDispatchBuilder::RestoreAVXState(Ref MemBase) {

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
@@ -48,7 +48,7 @@ void OpDispatchBuilder::FNINITF64(OpcodeArgs) {
   auto NewFCW = _Constant(16, 0x037F);
   // Init host rounding mode to zero
   auto Zero = _Constant(0);
-  _SetRoundingMode(Zero);
+  _SetRoundingMode(Zero, false, Zero);
   _StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
 
   // Init FSW to 0
@@ -71,7 +71,7 @@ void OpDispatchBuilder::X87LDENVF64(OpcodeArgs) {
   // ignore the rounding precision, we're always 64-bit in F64.
   // extract rounding mode
   Ref roundingMode = _Bfe(OpSize::i32Bit, 3, 10, NewFCW);
-  _SetRoundingMode(roundingMode);
+  _SetRoundingMode(roundingMode, false, roundingMode);
   _StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
 
   auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1);
@@ -89,7 +89,7 @@ void OpDispatchBuilder::X87FLDCWF64(OpcodeArgs) {
   // ignore the rounding precision, we're always 64-bit in F64.
   // extract rounding mode
   Ref roundingMode = _Bfe(OpSize::i32Bit, 3, 10, NewFCW);
-  _SetRoundingMode(roundingMode);
+  _SetRoundingMode(roundingMode, false, roundingMode);
   _StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
 }
 
@@ -783,7 +783,7 @@ void OpDispatchBuilder::X87FRSTORF64(OpcodeArgs) {
   auto roundMask = _Constant(3);
   roundingMode = _Lshr(OpSize::i32Bit, roundingMode, roundShift);
   roundingMode = _And(OpSize::i32Bit, roundingMode, roundMask);
-  _SetRoundingMode(roundingMode);
+  _SetRoundingMode(roundingMode, false, roundingMode);
   _StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
   _StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
 

diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json
@@ -226,7 +226,7 @@
         "DestSize": "4"
       },
 
-      "SetRoundingMode GPR:$RoundMode": {
+      "SetRoundingMode GPR:$RoundMode, i1:$SetDAZ, GPR:$MXCSR": {
         "Desc": ["Sets the current rounding mode options for the thread"
                 ],
         "HasSideEffects": true

diff --git a/FEXCore/include/FEXCore/Core/CoreState.h b/FEXCore/include/FEXCore/Core/CoreState.h
@@ -104,7 +104,7 @@ struct CPUState {
   // Raw segment register indexes
   uint16_t es_idx {}, cs_idx {}, ss_idx {}, ds_idx {};
   uint16_t gs_idx {}, fs_idx {};
-  uint16_t _pad2[2];
+  uint32_t mxcsr {};
 
   // Segment registers holding base addresses
   uint32_t es_cached {}, cs_cached {}, ss_cached {}, ds_cached {};
@@ -162,6 +162,10 @@ struct CPUState {
     // we encode DF as 1/-1 within the JIT, so we have to write 0x1 here to
     // zero DF.
     flags[X86State::RFLAG_DF_RAW_LOC] = 0x1;
+
+    // Default mxcsr value
+    // All exception masks enabled.
+    mxcsr = 0x1F80;
   }
 };
 static_assert(std::is_trivially_copyable_v<CPUState>, "Needs to be trivial");

diff --git a/Scripts/json_config_parse.py b/Scripts/json_config_parse.py
@@ -73,8 +73,8 @@ class HostFeatures(Flag) :
     FEATURE_BMI2   = (1 << 7)
     FEATURE_CLWB   = (1 << 8)
     FEATURE_LINUX  = (1 << 9)
-    FEATURE_AVX2   = (1 << 10)
-    FEATURE_AES256 = (1 << 11)
+    FEATURE_AES256 = (1 << 10)
+    FEATURE_AFP    = (1 << 11)
 
 RegStringLookup = {
     "NONE":  Regs.REG_NONE,
@@ -147,8 +147,8 @@ class HostFeatures(Flag) :
     "BMI2"   : HostFeatures.FEATURE_BMI2,
     "CLWB"   : HostFeatures.FEATURE_CLWB,
     "LINUX"  : HostFeatures.FEATURE_LINUX,
-    "AVX2"   : HostFeatures.FEATURE_AVX2,
     "AES256" : HostFeatures.FEATURE_AES256,
+    "AFP"    : HostFeatures.FEATURE_AFP,
 }
 
 def parse_hexstring(s):

diff --git a/Source/Tools/CommonTools/HarnessHelpers.h b/Source/Tools/CommonTools/HarnessHelpers.h
@@ -308,6 +308,7 @@ class ConfigLoader final {
     FEATURE_CLWB = (1 << 8),
     FEATURE_LINUX = (1 << 9),
     FEATURE_AES256 = (1 << 10),
+    FEATURE_AFP = (1 << 11),
   };
 
   bool Requires3DNow() const {
@@ -343,6 +344,9 @@ class ConfigLoader final {
   bool RequiresAES256() const {
     return BaseConfig.OptionHostFeatures & HostFeatures::FEATURE_AES256;
   }
+  bool RequiresAFP() const {
+    return BaseConfig.OptionHostFeatures & HostFeatures::FEATURE_AFP;
+  }
 
 private:
   FEX_CONFIG_OPT(ConfigDumpGPRs, DUMPGPRS);
@@ -512,6 +516,9 @@ class HarnessCodeLoader final : public FEX::CodeLoader {
   bool RequiresAES256() const {
     return Config.RequiresAES256();
   }
+  bool RequiresAFP() const {
+    return Config.RequiresAFP();
+  }
 
 private:
   constexpr static uint64_t STACK_OFFSET = 0xc000'0000;

diff --git a/Source/Tools/TestHarnessRunner/TestHarnessRunner.cpp b/Source/Tools/TestHarnessRunner/TestHarnessRunner.cpp
@@ -267,7 +267,8 @@ int main(int argc, char** argv, char** const envp) {
                          (!SupportsAVX && Loader.RequiresAVX()) || (!HostFeatures.SupportsRAND && Loader.RequiresRAND()) ||
                          (!HostFeatures.SupportsSHA && Loader.RequiresSHA()) || (!HostFeatures.SupportsCLZERO && Loader.RequiresCLZERO()) ||
                          (!HostFeatures.SupportsBMI1 && Loader.RequiresBMI1()) || (!HostFeatures.SupportsBMI2 && Loader.RequiresBMI2()) ||
-                         (!HostFeatures.SupportsCLWB && Loader.RequiresCLWB()) || (!HostFeatures.SupportsAES256 && Loader.RequiresAES256());
+                         (!HostFeatures.SupportsCLWB && Loader.RequiresCLWB()) ||
+                         (!HostFeatures.SupportsAES256 && Loader.RequiresAES256()) || (!HostFeatures.SupportsAFP && Loader.RequiresAFP());
 
 #ifdef _WIN32
   TestUnsupported |= Loader.RequiresLinux();

diff --git a/unittests/ASM/DAZTest.asm b/unittests/ASM/DAZTest.asm
@@ -0,0 +1,37 @@
+%ifdef CONFIG
+{
+  "HostFeatures": ["AFP"],
+  "RegData": {
+    "XMM0": ["0x0108000040e00000", "0xd1d2d3d4d5d6d7d8", "0", "0"],
+    "XMM1": ["0x00cfffff40e00000", "0xd1d2d3d4d5d6d7d8", "0", "0"]
+  }
+}
+%endif
+
+vmovaps ymm1, [rel .data_three]
+vmovaps ymm2, [rel .data_four]
+
+; Do an add without DAZ
+vaddps xmm0, xmm1, xmm2
+
+; Set DAZ
+stmxcsr [rel .data_mxcsr]
+or dword [rel .data_mxcsr], (1 << 6)
+ldmxcsr [rel .data_mxcsr]
+
+; Do an add with DAZ
+vaddps xmm1, xmm1, xmm2
+
+hlt
+align 32
+
+.data_three:
+dd 3.0, 0x00cfffff
+dq 0xa1a2a3a4a5a6a7a8, 0xb1b2b3b4b5b6b7b8, 0xc1c2c3c4c5c6c7c8
+
+.data_four:
+dd 4.0, 0x00400000
+dq 0xd1d2d3d4d5d6d7d8, 0xe1e2e3e4e5e6e7e8, 0xf1f2f3f4f5f6f7f8
+
+.data_mxcsr:
+dd 0
diff --git a/unittests/ASM/VEX/vldmxcsr.asm b/unittests/ASM/VEX/vldmxcsr.asm
@@ -2,7 +2,7 @@
 {
   "HostFeatures": ["AVX"],
   "RegData": {
-    "RAX": "0xFF80"
+    "RAX": "0xFFC0"
   },
   "MemoryRegions": {
     "0x100000000": "4096"