Skip to content

Commit

Permalink
Merge pull request FEX-Emu#3742 from Sonicadvance1/export_avx_reg_hel…
Browse files Browse the repository at this point in the history
…pers

FEXCore: Implement AVX reconstruction helpers
  • Loading branch information
Sonicadvance1 authored Jun 24, 2024
2 parents 8f769ce + 96ac717 commit 6edf461
Show file tree
Hide file tree
Showing 7 changed files with 98 additions and 58 deletions.
3 changes: 3 additions & 0 deletions FEXCore/Source/Interface/Context/Context.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ class ContextImpl final : public FEXCore::Context::Context {
uint32_t ReconstructCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, bool WasInJIT, uint64_t* HostGPRs, uint64_t PSTATE) override;
void SetFlagsFromCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, uint32_t EFLAGS) override;

///< Copies the guest XMM low halves (and, if YMM_High is non-null and the host supports AVX, the upper YMM halves) out of the thread's state.
void ReconstructXMMRegisters(const FEXCore::Core::InternalThreadState* Thread, __uint128_t* XMM_Low, __uint128_t* YMM_High) override;
///< Writes the passed in XMM low halves (and, if YMM_High is non-null and the host supports AVX, the upper YMM halves) into the thread's state.
void SetXMMRegistersFromState(FEXCore::Core::InternalThreadState* Thread, const __uint128_t* XMM_Low, const __uint128_t* YMM_High) override;

/**
* @brief Used to create FEX thread objects in preparation for creating a true OS thread. Does set a TID or PID.
*
Expand Down
49 changes: 49 additions & 0 deletions FEXCore/Source/Interface/Core/Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,55 @@ uint32_t ContextImpl::ReconstructCompactedEFLAGS(FEXCore::Core::InternalThreadSt
return EFLAGS;
}

/**
 * @brief Copies the guest vector register contents out of the thread's CPU state.
 *
 * Handles NUM_XMMS registers in 64-bit mode and 8 registers otherwise.
 *
 * @param Thread The thread whose `CurrentFrame->State` is read.
 * @param XMM_Low Destination for the low 128 bits of each register. Must have room for every handled register.
 * @param YMM_High Optional destination for the upper 128 bits of each register.
 *        May be nullptr when only the low halves are wanted.
 *        Left untouched when the host doesn't support AVX.
 */
void ContextImpl::ReconstructXMMRegisters(const FEXCore::Core::InternalThreadState* Thread, __uint128_t* XMM_Low, __uint128_t* YMM_High) {
  const size_t MaximumRegisters = Config.Is64BitMode ? FEXCore::Core::CPUState::NUM_XMMS : 8;
  const auto& State = Thread->CurrentFrame->State;

  // Upper halves are only produced when the caller asked for them and the host supports AVX.
  const bool WantsHighHalves = YMM_High != nullptr && HostFeatures.SupportsAVX;

  // Which storage layout holds the registers is decided by the host features alone.
  // A caller that passes a null YMM_High must still read the low halves from the converged layout.
  const bool SupportsConvergedRegisters = HostFeatures.SupportsAVX && HostFeatures.SupportsSVE256;

  if (SupportsConvergedRegisters) {
    ///< Output wants to de-interleave
    for (size_t i = 0; i < MaximumRegisters; ++i) {
      memcpy(&XMM_Low[i], &State.xmm.avx.data[i][0], sizeof(__uint128_t));
      if (WantsHighHalves) {
        memcpy(&YMM_High[i], &State.xmm.avx.data[i][2], sizeof(__uint128_t));
      }
    }
    return;
  }

  ///< Matches what FEX wants with non-converged registers
  for (size_t i = 0; i < MaximumRegisters; ++i) {
    memcpy(&XMM_Low[i], &State.xmm.sse.data[i][0], sizeof(__uint128_t));
    if (WantsHighHalves) {
      memcpy(&YMM_High[i], &State.avx_high[i][0], sizeof(__uint128_t));
    }
  }
}

/**
 * @brief Writes guest vector register contents into the thread's CPU state.
 *
 * Handles NUM_XMMS registers in 64-bit mode and 8 registers otherwise.
 *
 * @param Thread The thread whose `CurrentFrame->State` is written.
 * @param XMM_Low Source for the low 128 bits of each register. Must provide every handled register.
 * @param YMM_High Optional source for the upper 128 bits of each register.
 *        May be nullptr, in which case the stored upper halves are left as they were.
 *        Ignored when the host doesn't support AVX.
 */
void ContextImpl::SetXMMRegistersFromState(FEXCore::Core::InternalThreadState* Thread, const __uint128_t* XMM_Low, const __uint128_t* YMM_High) {
  const size_t MaximumRegisters = Config.Is64BitMode ? FEXCore::Core::CPUState::NUM_XMMS : 8;
  auto& State = Thread->CurrentFrame->State;

  // Upper halves are only consumed when the caller provided them and the host supports AVX.
  const bool HasHighHalves = YMM_High != nullptr && HostFeatures.SupportsAVX;

  // Which storage layout holds the registers is decided by the host features alone.
  // A caller that passes a null YMM_High must still write the low halves into the converged layout.
  const bool SupportsConvergedRegisters = HostFeatures.SupportsAVX && HostFeatures.SupportsSVE256;

  if (SupportsConvergedRegisters) {
    ///< State wants the two halves interleaved per register
    for (size_t i = 0; i < MaximumRegisters; ++i) {
      memcpy(&State.xmm.avx.data[i][0], &XMM_Low[i], sizeof(__uint128_t));
      if (HasHighHalves) {
        memcpy(&State.xmm.avx.data[i][2], &YMM_High[i], sizeof(__uint128_t));
      }
    }
    return;
  }

  ///< Matches what FEX wants with non-converged registers
  for (size_t i = 0; i < MaximumRegisters; ++i) {
    memcpy(&State.xmm.sse.data[i][0], &XMM_Low[i], sizeof(__uint128_t));
    if (HasHighHalves) {
      memcpy(&State.avx_high[i][0], &YMM_High[i], sizeof(__uint128_t));
    }
  }
}

void ContextImpl::SetFlagsFromCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, uint32_t EFLAGS) {
const auto Frame = Thread->CurrentFrame;
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_EFLAG_BITS; ++i) {
Expand Down
5 changes: 5 additions & 0 deletions FEXCore/include/FEXCore/Core/Context.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,11 @@ class Context {
///< Sets FEX's internal EFLAGS representation to the passed in compacted form.
FEX_DEFAULT_VISIBILITY virtual void SetFlagsFromCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, uint32_t EFLAGS) = 0;

///< Copies the guest vector registers out of FEX's internal state.
///< XMM_Low receives the low 128 bits of each register.
///< YMM_High may be nullptr; when non-null and the host supports AVX it receives the upper 128 bits of each register.
FEX_DEFAULT_VISIBILITY virtual void
ReconstructXMMRegisters(const FEXCore::Core::InternalThreadState* Thread, __uint128_t* XMM_Low, __uint128_t* YMM_High) = 0;
///< Sets the guest vector registers in FEX's internal state from the passed in arrays.
///< XMM_Low provides the low 128 bits of each register.
///< YMM_High may be nullptr; when non-null and the host supports AVX it provides the upper 128 bits of each register.
FEX_DEFAULT_VISIBILITY virtual void
SetXMMRegistersFromState(FEXCore::Core::InternalThreadState* Thread, const __uint128_t* XMM_Low, const __uint128_t* YMM_High) = 0;

/**
* @brief Create a new thread object that doesn't inherit any state.
* Used to create FEX thread objects in preparation for creating a true OS thread.
Expand Down
72 changes: 22 additions & 50 deletions Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -439,14 +439,9 @@ void SignalDelegator::RestoreFrame_x64(FEXCore::Core::InternalThreadState* Threa
memcpy(Frame->State.mm, fpstate->_st, sizeof(Frame->State.mm));

if (IsAVXEnabled) {
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; i++) {
memcpy(&Frame->State.xmm.avx.data[i][0], &fpstate->_xmm[i], sizeof(__uint128_t));
}
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; i++) {
memcpy(&Frame->State.xmm.avx.data[i][2], &xstate->ymmh.ymmh_space[i], sizeof(__uint128_t));
}
CTX->SetXMMRegistersFromState(Thread, fpstate->_xmm, xstate->ymmh.ymmh_space);
} else {
memcpy(Frame->State.xmm.sse.data, fpstate->_xmm, sizeof(Frame->State.xmm.sse.data));
CTX->SetXMMRegistersFromState(Thread, fpstate->_xmm, nullptr);
}

// FCW store default
Expand Down Expand Up @@ -517,14 +512,9 @@ void SignalDelegator::RestoreFrame_ia32(FEXCore::Core::InternalThreadState* Thre

// Extended XMM state
if (IsAVXEnabled) {
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; i++) {
memcpy(&Frame->State.xmm.avx.data[i][0], &fpstate->_xmm[i], sizeof(__uint128_t));
}
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; i++) {
memcpy(&Frame->State.xmm.avx.data[i][2], &xstate->ymmh.ymmh_space[i], sizeof(__uint128_t));
}
CTX->SetXMMRegistersFromState(Thread, fpstate->_xmm, xstate->ymmh.ymmh_space);
} else {
memcpy(Frame->State.xmm.sse.data, fpstate->_xmm, sizeof(Frame->State.xmm.sse.data));
CTX->SetXMMRegistersFromState(Thread, fpstate->_xmm, nullptr);
}

// FCW store default
Expand Down Expand Up @@ -596,14 +586,9 @@ void SignalDelegator::RestoreRTFrame_ia32(FEXCore::Core::InternalThreadState* Th

// Extended XMM state
if (IsAVXEnabled) {
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; i++) {
memcpy(&Frame->State.xmm.avx.data[i][0], &fpstate->_xmm[i], sizeof(__uint128_t));
}
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; i++) {
memcpy(&Frame->State.xmm.avx.data[i][2], &xstate->ymmh.ymmh_space[i], sizeof(__uint128_t));
}
CTX->SetXMMRegistersFromState(Thread, fpstate->_xmm, xstate->ymmh.ymmh_space);
} else {
memcpy(Frame->State.xmm.sse.data, fpstate->_xmm, sizeof(Frame->State.xmm.sse.data));
CTX->SetXMMRegistersFromState(Thread, fpstate->_xmm, nullptr);
}

// FCW store default
Expand Down Expand Up @@ -726,14 +711,9 @@ uint64_t SignalDelegator::SetupFrame_x64(FEXCore::Core::InternalThreadState* Thr
memcpy(fpstate->_st, Frame->State.mm, sizeof(Frame->State.mm));

if (IsAVXEnabled) {
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; i++) {
memcpy(&fpstate->_xmm[i], &Frame->State.xmm.avx.data[i][0], sizeof(__uint128_t));
}
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; i++) {
memcpy(&xstate->ymmh.ymmh_space[i], &Frame->State.xmm.avx.data[i][2], sizeof(__uint128_t));
}
CTX->ReconstructXMMRegisters(Thread, fpstate->_xmm, xstate->ymmh.ymmh_space);
} else {
memcpy(fpstate->_xmm, Frame->State.xmm.sse.data, sizeof(Frame->State.xmm.sse.data));
CTX->ReconstructXMMRegisters(Thread, fpstate->_xmm, nullptr);
}

// FCW store default
Expand Down Expand Up @@ -771,9 +751,9 @@ uint64_t SignalDelegator::SetupFrame_x64(FEXCore::Core::InternalThreadState* Thr
return NewGuestSP;
}

uint64_t SignalDelegator::SetupFrame_ia32(ArchHelpers::Context::ContextBackup* ContextBackup, FEXCore::Core::CpuStateFrame* Frame,
int Signal, siginfo_t* HostSigInfo, void* ucontext, GuestSigAction* GuestAction,
stack_t* GuestStack, uint64_t NewGuestSP, const uint32_t eflags) {
uint64_t SignalDelegator::SetupFrame_ia32(FEXCore::Core::InternalThreadState* Thread, ArchHelpers::Context::ContextBackup* ContextBackup,
FEXCore::Core::CpuStateFrame* Frame, int Signal, siginfo_t* HostSigInfo, void* ucontext,
GuestSigAction* GuestAction, stack_t* GuestStack, uint64_t NewGuestSP, const uint32_t eflags) {

const bool IsAVXEnabled = Config.SupportsAVX;
const uint64_t SignalReturn = reinterpret_cast<uint64_t>(VDSOPointers.VDSO_kernel_sigreturn);
Expand Down Expand Up @@ -851,15 +831,11 @@ uint64_t SignalDelegator::SetupFrame_ia32(ArchHelpers::Context::ContextBackup* C

// Extended XMM state
fpstate->status = FEXCore::x86::fpstate_magic::MAGIC_XFPSTATE;

if (IsAVXEnabled) {
for (size_t i = 0; i < std::size(Frame->State.xmm.avx.data); i++) {
memcpy(&fpstate->_xmm[i], &Frame->State.xmm.avx.data[i][0], sizeof(__uint128_t));
}
for (size_t i = 0; i < std::size(Frame->State.xmm.avx.data); i++) {
memcpy(&xstate->ymmh.ymmh_space[i], &Frame->State.xmm.avx.data[i][2], sizeof(__uint128_t));
}
CTX->ReconstructXMMRegisters(Thread, fpstate->_xmm, xstate->ymmh.ymmh_space);
} else {
memcpy(fpstate->_xmm, Frame->State.xmm.sse.data, sizeof(Frame->State.xmm.sse.data));
CTX->ReconstructXMMRegisters(Thread, fpstate->_xmm, nullptr);
}

// FCW store default
Expand Down Expand Up @@ -904,9 +880,9 @@ uint64_t SignalDelegator::SetupFrame_ia32(ArchHelpers::Context::ContextBackup* C
return NewGuestSP;
}

uint64_t SignalDelegator::SetupRTFrame_ia32(ArchHelpers::Context::ContextBackup* ContextBackup, FEXCore::Core::CpuStateFrame* Frame,
int Signal, siginfo_t* HostSigInfo, void* ucontext, GuestSigAction* GuestAction,
stack_t* GuestStack, uint64_t NewGuestSP, const uint32_t eflags) {
uint64_t SignalDelegator::SetupRTFrame_ia32(FEXCore::Core::InternalThreadState* Thread, ArchHelpers::Context::ContextBackup* ContextBackup,
FEXCore::Core::CpuStateFrame* Frame, int Signal, siginfo_t* HostSigInfo, void* ucontext,
GuestSigAction* GuestAction, stack_t* GuestStack, uint64_t NewGuestSP, const uint32_t eflags) {

const bool IsAVXEnabled = Config.SupportsAVX;
const uint64_t SignalReturn = reinterpret_cast<uint64_t>(VDSOPointers.VDSO_kernel_rt_sigreturn);
Expand Down Expand Up @@ -990,15 +966,11 @@ uint64_t SignalDelegator::SetupRTFrame_ia32(ArchHelpers::Context::ContextBackup*

// Extended XMM state
fpstate->status = FEXCore::x86::fpstate_magic::MAGIC_XFPSTATE;

if (IsAVXEnabled) {
for (size_t i = 0; i < std::size(Frame->State.xmm.avx.data); i++) {
memcpy(&fpstate->_xmm[i], &Frame->State.xmm.avx.data[i][0], sizeof(__uint128_t));
}
for (size_t i = 0; i < std::size(Frame->State.xmm.avx.data); i++) {
memcpy(&xstate->ymmh.ymmh_space[i], &Frame->State.xmm.avx.data[i][2], sizeof(__uint128_t));
}
CTX->ReconstructXMMRegisters(Thread, fpstate->_xmm, xstate->ymmh.ymmh_space);
} else {
memcpy(fpstate->_xmm, Frame->State.xmm.sse.data, sizeof(Frame->State.xmm.sse.data));
CTX->ReconstructXMMRegisters(Thread, fpstate->_xmm, nullptr);
}

// FCW store default
Expand Down Expand Up @@ -1193,9 +1165,9 @@ bool SignalDelegator::HandleDispatcherGuestSignal(FEXCore::Core::InternalThreadS
} else {
const bool SigInfoFrame = (GuestAction->sa_flags & SA_SIGINFO) == SA_SIGINFO;
if (SigInfoFrame) {
NewGuestSP = SetupRTFrame_ia32(ContextBackup, Frame, Signal, HostSigInfo, ucontext, GuestAction, GuestStack, NewGuestSP, eflags);
NewGuestSP = SetupRTFrame_ia32(Thread, ContextBackup, Frame, Signal, HostSigInfo, ucontext, GuestAction, GuestStack, NewGuestSP, eflags);
} else {
NewGuestSP = SetupFrame_ia32(ContextBackup, Frame, Signal, HostSigInfo, ucontext, GuestAction, GuestStack, NewGuestSP, eflags);
NewGuestSP = SetupFrame_ia32(Thread, ContextBackup, Frame, Signal, HostSigInfo, ucontext, GuestAction, GuestStack, NewGuestSP, eflags);
}
}

Expand Down
12 changes: 6 additions & 6 deletions Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,14 +256,14 @@ class SignalDelegator final : public FEXCore::SignalDelegator, public FEXCore::A
GuestSigAction* GuestAction, stack_t* GuestStack, uint64_t NewGuestSP, const uint32_t eflags);

///< Setup the signal frame for a 32-bit signal without SA_SIGINFO.
uint64_t SetupFrame_ia32(ArchHelpers::Context::ContextBackup* ContextBackup, FEXCore::Core::CpuStateFrame* Frame, int Signal,
siginfo_t* HostSigInfo, void* ucontext, GuestSigAction* GuestAction, stack_t* GuestStack, uint64_t NewGuestSP,
const uint32_t eflags);
uint64_t SetupFrame_ia32(FEXCore::Core::InternalThreadState* Thread, ArchHelpers::Context::ContextBackup* ContextBackup,
FEXCore::Core::CpuStateFrame* Frame, int Signal, siginfo_t* HostSigInfo, void* ucontext,
GuestSigAction* GuestAction, stack_t* GuestStack, uint64_t NewGuestSP, const uint32_t eflags);

///< Setup the signal frame for a 32-bit signal with SA_SIGINFO.
uint64_t SetupRTFrame_ia32(ArchHelpers::Context::ContextBackup* ContextBackup, FEXCore::Core::CpuStateFrame* Frame, int Signal,
siginfo_t* HostSigInfo, void* ucontext, GuestSigAction* GuestAction, stack_t* GuestStack, uint64_t NewGuestSP,
const uint32_t eflags);
uint64_t SetupRTFrame_ia32(FEXCore::Core::InternalThreadState* Thread, ArchHelpers::Context::ContextBackup* ContextBackup,
FEXCore::Core::CpuStateFrame* Frame, int Signal, siginfo_t* HostSigInfo, void* ucontext,
GuestSigAction* GuestAction, stack_t* GuestStack, uint64_t NewGuestSP, const uint32_t eflags);

enum class RestoreType {
TYPE_REALTIME, ///< Signal restore type is from a `realtime` signal.
Expand Down
11 changes: 11 additions & 0 deletions Source/Tools/TestHarnessRunner/TestHarnessRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,17 @@ int main(int argc, char** argv, char** const envp) {
FEX::HLE::_SyscallHandler->TM.DestroyThread(ParentThread, true);

SyscallHandler.reset();

if (SupportsAVX) {
///< Reconstruct the XMM registers even if they are in split view, then remerge them.
__uint128_t XMM_Low[FEXCore::Core::CPUState::NUM_XMMS];
__uint128_t YMM_High[FEXCore::Core::CPUState::NUM_XMMS];
CTX->ReconstructXMMRegisters(ParentThread->Thread, XMM_Low, YMM_High);
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; ++i) {
memcpy(&State.xmm.avx.data[i][0], &XMM_Low[i], sizeof(__uint128_t));
memcpy(&State.xmm.avx.data[i][2], &YMM_High[i], sizeof(__uint128_t));
}
}
}
#ifndef _WIN32
else {
Expand Down
4 changes: 2 additions & 2 deletions Source/Windows/WOW64/Module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ void LoadStateFromWowContext(FEXCore::Core::InternalThreadState* Thread, uint64_
// Floating-point register state
const auto* XSave = reinterpret_cast<XSAVE_FORMAT*>(Context->ExtendedRegisters);

memcpy(State.xmm.sse.data, XSave->XmmRegisters, sizeof(State.xmm.sse.data));
CTX->SetXMMRegistersFromState(Thread, reinterpret_cast<const __uint128_t*>(XSave->XmmRegisters), nullptr);
memcpy(State.mm, XSave->FloatRegisters, sizeof(State.mm));

State.FCW = XSave->ControlWord;
Expand Down Expand Up @@ -199,7 +199,7 @@ void StoreWowContextFromState(FEXCore::Core::InternalThreadState* Thread, WOW64_

auto* XSave = reinterpret_cast<XSAVE_FORMAT*>(Context->ExtendedRegisters);

memcpy(XSave->XmmRegisters, State.xmm.sse.data, sizeof(State.xmm.sse.data));
CTX->ReconstructXMMRegisters(Thread, reinterpret_cast<__uint128_t*>(XSave->XmmRegisters), nullptr);
memcpy(XSave->FloatRegisters, State.mm, sizeof(State.mm));

XSave->ControlWord = State.FCW;
Expand Down

0 comments on commit 6edf461

Please sign in to comment.