Skip to content

Commit

Permalink
Ensure predicate cache is reset when control flow leaves block
Browse files Browse the repository at this point in the history
Whenever the control flow leaves the block, it might clobber the
predicate register so we reset the cache whenever that happens.

The difficulty here is that the cache is valid only during IR generation
so we need to make sure we catch all the cases during this pass where
the execution might leave the block.

Fixes FEX-Emu#4264
  • Loading branch information
pmatos committed Jan 22, 2025
1 parent adff4bb commit ce49434
Show file tree
Hide file tree
Showing 11 changed files with 41 additions and 22 deletions.
18 changes: 12 additions & 6 deletions FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ namespace x64 {
// p6 and p7 registers are used as temporaries so not added here for RA
// See PREF_TMP_16B and PREF_TMP_32B
// p0-p1 are also used in the jit as temps.
// Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5.
constexpr std::array<ARMEmitter::PRegister, 4> PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};
// Also p8-p15 cannot be used since instructions can only encode p0-p7; p2 is a static register, so we're left with p3-p5.
constexpr std::array<ARMEmitter::PRegister, 3> PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};

constexpr unsigned RAPairs = 6;

Expand All @@ -82,7 +82,7 @@ namespace x64 {
ARMEmitter::VReg::v12, ARMEmitter::VReg::v13, ARMEmitter::VReg::v14, ARMEmitter::VReg::v15,
};
#else
constexpr std::array<ARMEmitter::Register, 18> SRA = {
constexpr std::array<ARMEmitter::Register, 19> SRA = {
ARMEmitter::Reg::r8,
ARMEmitter::Reg::r0,
ARMEmitter::Reg::r1,
Expand All @@ -100,6 +100,8 @@ namespace x64 {
ARMEmitter::Reg::r20,
ARMEmitter::Reg::r21,
ARMEmitter::Reg::r22,
// Predicate register
ARMEmitter::PR::p2,
REG_PF,
REG_AF,
};
Expand All @@ -112,8 +114,8 @@ namespace x64 {
// p6 and p7 registers are used as temporaries so not added here for RA
// See PREF_TMP_16B and PREF_TMP_32B
// p0-p1 are also used in the jit as temps.
// Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5.
constexpr std::array<ARMEmitter::PRegister, 4> PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};
// Also p8-p15 cannot be used since instructions can only encode p0-p7; p2 is a static register, so we're left with p3-p5.
constexpr std::array<ARMEmitter::PRegister, 3> PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};

constexpr unsigned RAPairs = 6;

Expand Down Expand Up @@ -250,7 +252,7 @@ namespace x32 {
// See PREF_TMP_16B and PREF_TMP_32B
// p0-p1 are also used in the jit as temps.
// Also p8-p15 cannot be used since instructions can only encode p0-p7; p2 is a static register, so we're left with p3-p5.
constexpr std::array<ARMEmitter::PRegister, 4> PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};
constexpr std::array<ARMEmitter::PRegister, 3> PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};

// All are caller saved
constexpr std::array<ARMEmitter::VRegister, 8> SRAFPR = {
Expand Down Expand Up @@ -822,6 +824,10 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF
}
}

if (FillP2) {
ptrue(ARMEmitter::SubRegSize::i16Bit, SVE_OPT_PRED, ARMEmitter::PredicatePattern::SVE_VL5);
}

// PF/AF are special, remove them from the mask
uint32_t PFAFMask = ((1u << REG_PF.Idx()) | ((1u << REG_AF.Idx())));
uint32_t PFAFFillMask = GPRFillMask & PFAFMask;
Expand Down
9 changes: 9 additions & 0 deletions FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ constexpr auto REG_AF = ARMEmitter::Reg::r27;
// Vector temporaries
constexpr auto VTMP1 = ARMEmitter::VReg::v0;
constexpr auto VTMP2 = ARMEmitter::VReg::v1;

// Predicate register for X87 SVE Optimization
constexpr auto SVE_OPT_PRED = ARMEmitter::PReg::p2;

#else
constexpr auto TMP1 = ARMEmitter::XReg::x10;
constexpr auto TMP2 = ARMEmitter::XReg::x11;
Expand All @@ -67,6 +71,9 @@ constexpr auto VTMP2 = ARMEmitter::VReg::v17;
constexpr auto EC_CALL_CHECKER_PC_REG = ARMEmitter::XReg::x9;
constexpr auto EC_ENTRY_CPUAREA_REG = ARMEmitter::XReg::x17;

// Predicate register for X87 SVE Optimization
constexpr auto SVE_OPT_PRED = ARMEmitter::PReg::p2;

// These structures are not included in the standard Windows headers, define the offsets of members we care about for EC here.
constexpr size_t TEB_CPU_AREA_OFFSET = 0x1788;
constexpr size_t TEB_PEB_OFFSET = 0x60;
Expand Down Expand Up @@ -107,6 +114,8 @@ class Arm64Emitter : public ARMEmitter::Emitter {
void LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, uint64_t Constant, bool NOPPad = false);

void FillSpecialRegs(ARMEmitter::Register TmpReg, ARMEmitter::Register TmpReg2, bool SetFIZ, bool SetPredRegs);
// Do we need to fill register p2 for the SVE X87 store optimization?
bool FillP2 = false;

// Correlate an ARM register back to an x86 register index.
// Returning REG_INVALID if there was no mapping.
Expand Down
2 changes: 2 additions & 0 deletions FEXCore/Source/Interface/Core/JIT/JIT.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -734,6 +734,8 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size
this->DebugData = DebugData;
this->IR = IR;

FillP2 = false; // Reset for each block

// Fairly excessive buffer range to make sure we don't overflow
uint32_t BufferRange = SSACount * 16;
if ((GetCursorOffset() + BufferRange) > CurrentCodeBuffer->Size) {
Expand Down
6 changes: 3 additions & 3 deletions FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1590,10 +1590,10 @@ DEF_OP(StoreMem) {
}
}

DEF_OP(InitPredicate) {
const auto Op = IROp->C<IR::IROp_InitPredicate>();
DEF_OP(LoadSVEOptPredicate) {
FillP2 = true;
const auto OpSize = IROp->Size;
ptrue(ConvertSubRegSize16(OpSize), GetPReg(Node), static_cast<ARMEmitter::PredicatePattern>(Op->Pattern));
ptrue(ConvertSubRegSize16(OpSize), GetPReg(Node), ARMEmitter::PredicatePattern::SVE_VL5);
}

DEF_OP(StoreMemPredicate) {
Expand Down
4 changes: 2 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4314,7 +4314,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T
Ref MemSrc = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
// Using SVE we can load this with a single instruction.
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
auto PReg = _LoadSVEOptPredicate(OpSize::i16Bit);
return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc);
} else {
// For X87 extended doubles, Split the load.
Expand Down Expand Up @@ -4448,7 +4448,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
if (OpSize == OpSize::f80Bit) {
Ref MemStoreDst = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
auto PReg = _LoadSVEOptPredicate(OpSize::i16Bit);
_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst);
} else {
// For X87 extended doubles, split before storing
Expand Down
4 changes: 0 additions & 4 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,6 @@ class OpDispatchBuilder final : public IREmitter {
void FNINIT(OpcodeArgs);

void X87ModifySTP(OpcodeArgs, bool Inc);
void X87SinCos(OpcodeArgs);
void X87FYL2X(OpcodeArgs, bool IsFYL2XP1);
void X87LDENV(OpcodeArgs);
void X87FLDCW(OpcodeArgs);
Expand Down Expand Up @@ -764,9 +763,6 @@ class OpDispatchBuilder final : public IREmitter {
void FTSTF64(OpcodeArgs);
void FRNDINTF64(OpcodeArgs);
void FSQRTF64(OpcodeArgs);
void X87UnaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp);
void X87BinaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp);
void X87SinCosF64(OpcodeArgs);
void X87FLDCWF64(OpcodeArgs);
void X87TANF64(OpcodeArgs);
void X87ATANF64(OpcodeArgs);
Expand Down
4 changes: 2 additions & 2 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -567,8 +567,8 @@
]
},

"PRED = InitPredicate OpSize:#Size, u8:$Pattern": {
"Desc": ["Initialize predicate register from Pattern"],
"PRED = LoadSVEOptPredicate OpSize:#Size": {
"Desc": ["Load the predicate register for the X87 SVE optimization with the necessary pattern (VL5)."],
"DestSize": "Size"
},

Expand Down
1 change: 1 addition & 0 deletions FEXCore/Source/Interface/IR/IREmitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) {
case FPRClass:
case GPRFixedClass:
case FPRFixedClass:
case PREDClass:
case InvalidClass: return Class;
default: break;
}
Expand Down
2 changes: 2 additions & 0 deletions FEXCore/Source/Interface/IR/IREmitter.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: MIT
#pragma once

#include "CodeEmitter/Emitter.h"
#include "Interface/IR/IR.h"
#include "Interface/IR/IntrusiveIRList.h"

Expand All @@ -9,6 +10,7 @@

#include <FEXCore/Utils/LogManager.h>
#include <FEXCore/fextl/vector.h>
#include <FEXCore/fextl/unordered_map.h>

#include <algorithm>
#include <stdint.h>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ class ConstrainedRAPass final : public RegisterAllocationPass {
};

Ref DecodeSRANode(const IROp_Header* IROp, Ref Node) {
if (IROp->Op == OP_LOADREGISTER || IROp->Op == OP_LOADPF || IROp->Op == OP_LOADAF) {
if (IROp->Op == OP_LOADREGISTER || IROp->Op == OP_LOADPF || IROp->Op == OP_LOADAF || IROp->Op == OP_LOADSVEOPTPREDICATE) {
return Node;
} else if (IROp->Op == OP_STOREREGISTER) {
const IROp_StoreRegister* Op = IROp->C<IR::IROp_StoreRegister>();
Expand Down Expand Up @@ -269,6 +269,9 @@ class ConstrainedRAPass final : public RegisterAllocationPass {
return PhysicalRegister {GPRFixedClass, FlagOffset};
} else if (IROp->Op == OP_LOADAF || IROp->Op == OP_STOREAF) {
return PhysicalRegister {GPRFixedClass, (uint8_t)(FlagOffset + 1)};
} else if (IROp->Op == OP_LOADSVEOPTPREDICATE) {
// We use p2 for the X87 SVE Store optimization.
return PhysicalRegister {PREDClass, 2};
}

LOGMAN_THROW_A_FMT(Class == GPRClass || Class == FPRClass, "SRA classes");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include "FEXCore/IR/IR.h"
#include "FEXCore/Utils/Profiler.h"
#include "FEXCore/Core/HostFeatures.h"
#include "CodeEmitter/Emitter.h"
#include "Interface/Core/ArchHelpers/Arm64Emitter.h"

#include <array>
#include <cstddef>
Expand Down Expand Up @@ -838,13 +838,13 @@ void X87StackOptimization::Run(IREmitter* Emit) {
if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert
StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode);
}
if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize()
if (Op->StoreSize == OpSize::f80Bit) {
if (Features.SupportsSVE128 || Features.SupportsSVE256) {
auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
auto PredReg = IREmit->_LoadSVEOptPredicate(OpSize::i16Bit);
if (!IsZero(Offset)) {
AddrNode = IREmit->_Add(OpSize::i64Bit, AddrNode, Offset);
}
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode);
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PredReg, AddrNode);
} else {
// For X87 extended doubles, split before storing
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
Expand Down

0 comments on commit ce49434

Please sign in to comment.