Skip to content

Commit

Permalink
Merge pull request FEX-Emu#3883 from Sonicadvance1/implement_daz
Browse files Browse the repository at this point in the history
Arm64: Implements support for DAZ using AFP.FIZ
  • Loading branch information
Sonicadvance1 authored Jul 21, 2024
2 parents 5c9bb65 + 4fffe68 commit f8c6baa
Show file tree
Hide file tree
Showing 16 changed files with 180 additions and 113 deletions.
43 changes: 31 additions & 12 deletions FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs, uint3
if (EmitterCTX->HostFeatures.SupportsAFP) {
// Disable AFP features when spilling registers.
//
// Disable FPCR.NEP and FPCR.AH
// Disable FPCR.NEP and FPCR.AH and FPCR.FIZ
// NEP(2): Changes ASIMD scalar instructions to insert in to the lower bits of the destination.
// AH(1): Changes NaN behaviour in some instructions. Specifically fmin, fmax.
// Also interacts with RPRES to change reciprocal/rsqrt precision from 8-bit mantissa to 12-bit.
Expand All @@ -585,7 +585,8 @@ void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs, uint3
mrs(TmpReg, ARMEmitter::SystemRegister::FPCR);
bic(ARMEmitter::Size::i64Bit, TmpReg, TmpReg,
(1U << 2) | // NEP
(1U << 1)); // AH
(1U << 1) | // AH
(1U << 0)); // FIZ
msr(ARMEmitter::SystemRegister::FPCR, TmpReg);
}
#endif
Expand Down Expand Up @@ -663,19 +664,32 @@ void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs, uint3
}
}

void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRFillMask) {
ARMEmitter::Register TmpReg = ARMEmitter::Reg::r0;
LOGMAN_THROW_A_FMT(GPRFillMask != 0, "Must fill at least 1 GPR for a temp");
[[maybe_unused]] bool FoundRegister {};
for (auto Reg : StaticRegisters) {
if (((1U << Reg.Idx()) & GPRFillMask)) {
TmpReg = Reg;
FoundRegister = true;
break;
void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRFillMask, std::optional<ARMEmitter::Register> OptionalReg,
std::optional<ARMEmitter::Register> OptionalReg2) {
auto FindTempReg = [this](uint32_t* GPRFillMask) -> std::optional<ARMEmitter::Register> {
for (auto Reg : StaticRegisters) {
if (((1U << Reg.Idx()) & *GPRFillMask)) {
*GPRFillMask &= ~(1U << Reg.Idx());
return std::make_optional(Reg);
}
}
return std::nullopt;
};

LOGMAN_THROW_A_FMT(GPRFillMask != 0, "Must fill at least 2 GPRs for a temp");
uint32_t TempGPRFillMask = GPRFillMask;
if (!OptionalReg.has_value()) {
OptionalReg = FindTempReg(&TempGPRFillMask);
}

LOGMAN_THROW_A_FMT(FoundRegister, "Didn't have an SRA register to use as a temporary while spilling!");
if (!OptionalReg2.has_value()) {
OptionalReg2 = FindTempReg(&TempGPRFillMask);
}
LOGMAN_THROW_A_FMT(OptionalReg.has_value() && OptionalReg2.has_value(), "Didn't have an SRA register to use as a temporary while "
"spilling!");

auto TmpReg = *OptionalReg;
[[maybe_unused]] auto TmpReg2 = *OptionalReg2;

#ifndef VIXL_SIMULATOR
if (EmitterCTX->HostFeatures.SupportsAFP) {
Expand All @@ -692,6 +706,11 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF
orr(ARMEmitter::Size::i64Bit, TmpReg, TmpReg,
(1U << 2) | // NEP
(1U << 1)); // AH

// Insert MXCSR.DAZ (bit 6) into FPCR.FIZ (bit 0)
ldr(TmpReg2.W(), STATE.R(), offsetof(FEXCore::Core::CPUState, mxcsr));
bfxil(ARMEmitter::Size::i64Bit, TmpReg, TmpReg2, 6, 1);

msr(ARMEmitter::SystemRegister::FPCR, TmpReg);
}
#endif
Expand Down
4 changes: 3 additions & 1 deletion FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,9 @@ class Arm64Emitter : public ARMEmitter::Emitter {
// and FPRs are being spilled or filled. If only GPRs are spilled/filled, then
// TMP4 is left alone.
void SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs = true, uint32_t GPRSpillMask = ~0U, uint32_t FPRSpillMask = ~0U);
void FillStaticRegs(bool FPRs = true, uint32_t GPRFillMask = ~0U, uint32_t FPRFillMask = ~0U);
void FillStaticRegs(bool FPRs = true, uint32_t GPRFillMask = ~0U, uint32_t FPRFillMask = ~0U,
std::optional<ARMEmitter::Register> OptionalReg = std::nullopt,
std::optional<ARMEmitter::Register> OptionalReg2 = std::nullopt);

// Register 0-18 + 29 + 30 are caller saved
static constexpr uint32_t CALLER_GPR_MASK = 0b0110'0000'0000'0111'1111'1111'1111'1111U;
Expand Down
4 changes: 2 additions & 2 deletions FEXCore/Source/Interface/Core/JIT/Arm64/BranchOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ DEF_OP(Syscall) {
if ((Flags & FEXCore::IR::SyscallFlags::NORETURN) != FEXCore::IR::SyscallFlags::NORETURN) {
// Result is now in x0
// Fix the stack and any values that were stepped on
FillStaticRegs(true, GPRSpillMask, FPRSpillMask);
FillStaticRegs(true, GPRSpillMask, FPRSpillMask, ARMEmitter::Reg::r1, ARMEmitter::Reg::r2);

// Now the registers we've spilled are back in their original host registers
// We can safely claim we are no longer in a syscall
Expand Down Expand Up @@ -285,7 +285,7 @@ DEF_OP(InlineSyscall) {
if ((Op->Flags & FEXCore::IR::SyscallFlags::NORETURN) != FEXCore::IR::SyscallFlags::NORETURN) {
// Now that we are done in the syscall we need to carefully peel back the state
// First unspill the registers from before
FillStaticRegs(false, SpillMask);
FillStaticRegs(false, SpillMask, ~0U, ARMEmitter::Reg::r8, ARMEmitter::Reg::r1);

// Now the registers we've spilled are back in their original host registers
// We can safely claim we are no longer in a syscall
Expand Down
8 changes: 7 additions & 1 deletion FEXCore/Source/Interface/Core/JIT/Arm64/MiscOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ DEF_OP(GetRoundingMode) {
DEF_OP(SetRoundingMode) {
auto Op = IROp->C<IR::IROp_SetRoundingMode>();
auto Src = GetReg(Op->RoundMode.ID());
auto MXCSR = GetReg(Op->MXCSR.ID());

// As above, setup the rounding flags in [31:30]
rbit(ARMEmitter::Size::i32Bit, TMP2, Src);
Expand All @@ -116,6 +117,11 @@ DEF_OP(SetRoundingMode) {
lsr(ARMEmitter::Size::i64Bit, TMP2, Src, 2);
bfi(ARMEmitter::Size::i64Bit, TMP1, TMP2, 24, 1);

if (Op->SetDAZ && HostSupportsAFP) {
// Extract DAZ from MXCSR and insert it into FPCR.FIZ
bfxil(ARMEmitter::Size::i64Bit, TMP1, MXCSR, 6, 1);
}

// Now save the new FPCR
msr(ARMEmitter::SystemRegister::FPCR, TMP1);
}
Expand Down Expand Up @@ -227,7 +233,7 @@ DEF_OP(ProcessorID) {

// Now that we are done in the syscall we need to carefully peel back the state
// First unspill the registers from before
FillStaticRegs(false, SpillMask);
FillStaticRegs(false, SpillMask, ~0U, ARMEmitter::Reg::r8, ARMEmitter::Reg::r2);

// Now the registers we've spilled are back in their original host registers
// We can safely claim we are no longer in a syscall
Expand Down
15 changes: 10 additions & 5 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2774,10 +2774,11 @@ void OpDispatchBuilder::SaveAVXState(Ref MemBase) {
}

Ref OpDispatchBuilder::GetMXCSR() {
// Default MXCSR Value
Ref MXCSR = _Constant(0x1F80);
Ref RoundingMode = _GetRoundingMode();
return _Bfi(OpSize::i32Bit, 3, 13, MXCSR, RoundingMode);
Ref MXCSR = _LoadContext(OpSize::i32Bit, GPRClass, offsetof(FEXCore::Core::CPUState, mxcsr));
// Mask out unsupported bits
// Keeps FZ, RC, exception masks, and DAZ
MXCSR = _And(OpSize::i32Bit, MXCSR, _Constant(0xFFC0));
return MXCSR;
}

void OpDispatchBuilder::FXRStoreOp(OpcodeArgs) {
Expand Down Expand Up @@ -2886,9 +2887,13 @@ void OpDispatchBuilder::RestoreSSEState(Ref MemBase) {
}

void OpDispatchBuilder::RestoreMXCSRState(Ref MXCSR) {
// Mask out unsupported bits
MXCSR = _And(OpSize::i32Bit, MXCSR, _Constant(0xFFC0));

_StoreContext(OpSize::i32Bit, GPRClass, MXCSR, offsetof(FEXCore::Core::CPUState, mxcsr));
// Only the rounding mode and FTZ bits are extracted here; DAZ is forwarded through the MXCSR operand.
Ref RoundingMode = _Bfe(OpSize::i32Bit, 3, 13, MXCSR);
_SetRoundingMode(RoundingMode);
_SetRoundingMode(RoundingMode, true, MXCSR);
}

void OpDispatchBuilder::RestoreAVXState(Ref MemBase) {
Expand Down
8 changes: 4 additions & 4 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ void OpDispatchBuilder::FNINITF64(OpcodeArgs) {
auto NewFCW = _Constant(16, 0x037F);
// Init host rounding mode to zero
auto Zero = _Constant(0);
_SetRoundingMode(Zero);
_SetRoundingMode(Zero, false, Zero);
_StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));

// Init FSW to 0
Expand All @@ -71,7 +71,7 @@ void OpDispatchBuilder::X87LDENVF64(OpcodeArgs) {
// ignore the rounding precision, we're always 64-bit in F64.
// extract rounding mode
Ref roundingMode = _Bfe(OpSize::i32Bit, 3, 10, NewFCW);
_SetRoundingMode(roundingMode);
_SetRoundingMode(roundingMode, false, roundingMode);
_StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));

auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1);
Expand All @@ -89,7 +89,7 @@ void OpDispatchBuilder::X87FLDCWF64(OpcodeArgs) {
// ignore the rounding precision, we're always 64-bit in F64.
// extract rounding mode
Ref roundingMode = _Bfe(OpSize::i32Bit, 3, 10, NewFCW);
_SetRoundingMode(roundingMode);
_SetRoundingMode(roundingMode, false, roundingMode);
_StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
}

Expand Down Expand Up @@ -783,7 +783,7 @@ void OpDispatchBuilder::X87FRSTORF64(OpcodeArgs) {
auto roundMask = _Constant(3);
roundingMode = _Lshr(OpSize::i32Bit, roundingMode, roundShift);
roundingMode = _And(OpSize::i32Bit, roundingMode, roundMask);
_SetRoundingMode(roundingMode);
_SetRoundingMode(roundingMode, false, roundingMode);
_StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
_StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));

Expand Down
2 changes: 1 addition & 1 deletion FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@
"DestSize": "4"
},

"SetRoundingMode GPR:$RoundMode": {
"SetRoundingMode GPR:$RoundMode, i1:$SetDAZ, GPR:$MXCSR": {
"Desc": ["Sets the current rounding mode options for the thread"
],
"HasSideEffects": true
Expand Down
6 changes: 5 additions & 1 deletion FEXCore/include/FEXCore/Core/CoreState.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ struct CPUState {
// Raw segment register indexes
uint16_t es_idx {}, cs_idx {}, ss_idx {}, ds_idx {};
uint16_t gs_idx {}, fs_idx {};
uint16_t _pad2[2];
uint32_t mxcsr {};

// Segment registers holding base addresses
uint32_t es_cached {}, cs_cached {}, ss_cached {}, ds_cached {};
Expand Down Expand Up @@ -162,6 +162,10 @@ struct CPUState {
// we encode DF as 1/-1 within the JIT, so we have to write 0x1 here to
// zero DF.
flags[X86State::RFLAG_DF_RAW_LOC] = 0x1;

// Default mxcsr value
// All exception masks enabled.
mxcsr = 0x1F80;
}
};
static_assert(std::is_trivially_copyable_v<CPUState>, "Needs to be trivial");
Expand Down
6 changes: 3 additions & 3 deletions Scripts/json_config_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ class HostFeatures(Flag) :
FEATURE_BMI2 = (1 << 7)
FEATURE_CLWB = (1 << 8)
FEATURE_LINUX = (1 << 9)
FEATURE_AVX2 = (1 << 10)
FEATURE_AES256 = (1 << 11)
FEATURE_AES256 = (1 << 10)
FEATURE_AFP = (1 << 11)

RegStringLookup = {
"NONE": Regs.REG_NONE,
Expand Down Expand Up @@ -147,8 +147,8 @@ class HostFeatures(Flag) :
"BMI2" : HostFeatures.FEATURE_BMI2,
"CLWB" : HostFeatures.FEATURE_CLWB,
"LINUX" : HostFeatures.FEATURE_LINUX,
"AVX2" : HostFeatures.FEATURE_AVX2,
"AES256" : HostFeatures.FEATURE_AES256,
"AFP" : HostFeatures.FEATURE_AFP,
}

def parse_hexstring(s):
Expand Down
7 changes: 7 additions & 0 deletions Source/Tools/CommonTools/HarnessHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ class ConfigLoader final {
FEATURE_CLWB = (1 << 8),
FEATURE_LINUX = (1 << 9),
FEATURE_AES256 = (1 << 10),
FEATURE_AFP = (1 << 11),
};

bool Requires3DNow() const {
Expand Down Expand Up @@ -343,6 +344,9 @@ class ConfigLoader final {
// True when the parsed test config has the AES256 host-feature bit set.
bool RequiresAES256() const {
  const auto Masked = BaseConfig.OptionHostFeatures & HostFeatures::FEATURE_AES256;
  return Masked;
}
// True when the parsed test config has the AFP host-feature bit set.
bool RequiresAFP() const {
  const auto Masked = BaseConfig.OptionHostFeatures & HostFeatures::FEATURE_AFP;
  return Masked;
}

private:
FEX_CONFIG_OPT(ConfigDumpGPRs, DUMPGPRS);
Expand Down Expand Up @@ -512,6 +516,9 @@ class HarnessCodeLoader final : public FEX::CodeLoader {
// Forwards the AES256 requirement query to the loaded test config.
bool RequiresAES256() const {
  const bool Needed = Config.RequiresAES256();
  return Needed;
}
// Forwards the AFP requirement query to the loaded test config.
bool RequiresAFP() const {
  const bool Needed = Config.RequiresAFP();
  return Needed;
}

private:
constexpr static uint64_t STACK_OFFSET = 0xc000'0000;
Expand Down
3 changes: 2 additions & 1 deletion Source/Tools/TestHarnessRunner/TestHarnessRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,8 @@ int main(int argc, char** argv, char** const envp) {
(!SupportsAVX && Loader.RequiresAVX()) || (!HostFeatures.SupportsRAND && Loader.RequiresRAND()) ||
(!HostFeatures.SupportsSHA && Loader.RequiresSHA()) || (!HostFeatures.SupportsCLZERO && Loader.RequiresCLZERO()) ||
(!HostFeatures.SupportsBMI1 && Loader.RequiresBMI1()) || (!HostFeatures.SupportsBMI2 && Loader.RequiresBMI2()) ||
(!HostFeatures.SupportsCLWB && Loader.RequiresCLWB()) || (!HostFeatures.SupportsAES256 && Loader.RequiresAES256());
(!HostFeatures.SupportsCLWB && Loader.RequiresCLWB()) ||
(!HostFeatures.SupportsAES256 && Loader.RequiresAES256()) || (!HostFeatures.SupportsAFP && Loader.RequiresAFP());

#ifdef _WIN32
TestUnsupported |= Loader.RequiresLinux();
Expand Down
37 changes: 37 additions & 0 deletions unittests/ASM/DAZTest.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
%ifdef CONFIG
{
"HostFeatures": ["AFP"],
"RegData": {
"XMM0": ["0x0108000040e00000", "0xd1d2d3d4d5d6d7d8", "0", "0"],
"XMM1": ["0x00cfffff40e00000", "0xd1d2d3d4d5d6d7d8", "0", "0"]
}
}
%endif

; Checks MXCSR.DAZ (bit 6) behaviour for a packed single-precision add.
; Lane 0 adds two ordinary values (3.0 + 4.0). Lane 1 adds 0x00cfffff to
; 0x00400000; the latter has a zero exponent field, i.e. it is a denormal input.
vmovaps ymm1, [rel .data_three]
vmovaps ymm2, [rel .data_four]

; Do an add without DAZ
; The denormal in lane 1 of xmm2 contributes to the sum; the result is kept in xmm0.
vaddps xmm0, xmm1, xmm2

; Set DAZ
; Round-trip MXCSR through memory so that only bit 6 is changed.
stmxcsr [rel .data_mxcsr]
or dword [rel .data_mxcsr], (1 << 6)
ldmxcsr [rel .data_mxcsr]

; Do an add with DAZ
; The denormal operand is now expected to be treated as zero, so lane 1 of the
; result should equal the xmm1 input (0x00cfffff).
vaddps xmm1, xmm1, xmm2

hlt
align 32

; First operand: lane 0 = 3.0, lane 1 = 0x00cfffff, then 24 bytes of pattern
; data to fill out the 32-byte load.
.data_three:
dd 3.0, 0x00cfffff
dq 0xa1a2a3a4a5a6a7a8, 0xb1b2b3b4b5b6b7b8, 0xc1c2c3c4c5c6c7c8

; Second operand: lane 0 = 4.0, lane 1 = 0x00400000 (denormal), then 24 bytes
; of pattern data to fill out the 32-byte load.
.data_four:
dd 4.0, 0x00400000
dq 0xd1d2d3d4d5d6d7d8, 0xe1e2e3e4e5e6e7e8, 0xf1f2f3f4f5f6f7f8

; Scratch dword used by stmxcsr/ldmxcsr above.
.data_mxcsr:
dd 0
2 changes: 1 addition & 1 deletion unittests/ASM/VEX/vldmxcsr.asm
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
{
"HostFeatures": ["AVX"],
"RegData": {
"RAX": "0xFF80"
"RAX": "0xFFC0"
},
"MemoryRegions": {
"0x100000000": "4096"
Expand Down
Loading

0 comments on commit f8c6baa

Please sign in to comment.