From 5845474dcaa42114c38527c1f268d66df7e0d149 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 21 Jan 2025 19:04:06 -0800 Subject: [PATCH 1/9] Windows: Expose support for NtCreateSection and NtMapViewOfSection --- Source/Windows/include/winternl.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Source/Windows/include/winternl.h b/Source/Windows/include/winternl.h index d55d1e5695..2a3bbe1dd0 100644 --- a/Source/Windows/include/winternl.h +++ b/Source/Windows/include/winternl.h @@ -375,6 +375,11 @@ typedef struct _SYSTEM_CPU_INFORMATION { ULONG ProcessorFeatureBits; } SYSTEM_CPU_INFORMATION, *PSYSTEM_CPU_INFORMATION; +typedef enum _SECTION_INHERIT { + ViewShare = 1, + ViewUnmap = 2, +} SECTION_INHERIT; + /* definitions of bits in the Feature set for the x86 processors */ #define CPU_FEATURE_VME 0x00000005 /* Virtual 86 Mode Extensions */ #define CPU_FEATURE_TSC 0x00000002 /* Time Stamp Counter available */ @@ -455,10 +460,12 @@ NTSTATUS WINAPI LdrGetProcedureAddress(HMODULE, const ANSI_STRING*, ULONG, void* NTSTATUS WINAPI NtAllocateVirtualMemoryEx(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MEM_EXTENDED_PARAMETER*, ULONG); NTSTATUS WINAPI NtAllocateVirtualMemory(HANDLE, PVOID*, ULONG_PTR, SIZE_T*, ULONG, ULONG); NTSTATUS WINAPI NtContinue(PCONTEXT, BOOLEAN); +NTSTATUS WINAPI NtCreateSection(HANDLE*, ACCESS_MASK, const OBJECT_ATTRIBUTES*, const LARGE_INTEGER*, ULONG, ULONG, HANDLE); NTSTATUS WINAPI NtFlushInstructionCache(HANDLE, LPCVOID, SIZE_T); NTSTATUS WINAPI NtFreeVirtualMemory(HANDLE, PVOID*, SIZE_T*, ULONG); NTSTATUS WINAPI NtGetContextThread(HANDLE, CONTEXT*); ULONG WINAPI NtGetCurrentProcessorNumber(void); +NTSYSAPI NTSTATUS WINAPI NtMapViewOfSection(HANDLE, HANDLE, PVOID*, ULONG_PTR, SIZE_T, const LARGE_INTEGER*, SIZE_T*, SECTION_INHERIT, ULONG, ULONG); NTSTATUS WINAPI NtOpenKeyEx(PHANDLE, ACCESS_MASK, const OBJECT_ATTRIBUTES*, ULONG); NTSTATUS WINAPI NtProtectVirtualMemory(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG*); NTSTATUS WINAPI 
NtQueryAttributesFile(const OBJECT_ATTRIBUTES*, FILE_BASIC_INFORMATION*); From a1b800a736dacd69a5577b3c363301e8a50b8176 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 21 Jan 2025 19:06:55 -0800 Subject: [PATCH 2/9] FEXCore: Implements baseline per-thread profile stats Not wired up, just the definitions so it lives in the InternalThreadState. We want this accessible from both FEXCore and the frontends so it needs to live there. Two types of events supported. Scoped cyclecounts and instant increments. This gives us JIT time and Signal handling time, plus events for number of SIGBUS and number of SMC events. All useful statistics for seeing stutter live. --- .../Source/Interface/Config/Config.json.in | 8 ++ .../FEXCore/Debug/InternalThreadState.h | 6 ++ FEXCore/include/FEXCore/Utils/Profiler.h | 96 +++++++++++++++++++ 3 files changed, 110 insertions(+) diff --git a/FEXCore/Source/Interface/Config/Config.json.in b/FEXCore/Source/Interface/Config/Config.json.in index a28c9c2f9e..25b7e3f126 100644 --- a/FEXCore/Source/Interface/Config/Config.json.in +++ b/FEXCore/Source/Interface/Config/Config.json.in @@ -363,6 +363,14 @@ "Redirects the telemetry folder that FEX usually writes to.", "By default telemetry data is stored in {$FEX_APP_DATA_LOCATION,{$XDG_DATA_HOME,$HOME}/.fex-emu/Telemetry/}" ] + }, + "ProfileStats": { + "Type": "bool", + "Default": "false", + "Desc": [ + "Enables FEX's low-overhead sampling profile statistics.", + "Requires a supported version of Mangohud to see the results" + ] } }, "Hacks": { diff --git a/FEXCore/include/FEXCore/Debug/InternalThreadState.h b/FEXCore/include/FEXCore/Debug/InternalThreadState.h index 81c49932a5..5eb185e54a 100644 --- a/FEXCore/include/FEXCore/Debug/InternalThreadState.h +++ b/FEXCore/include/FEXCore/Debug/InternalThreadState.h @@ -36,6 +36,10 @@ class OpDispatchBuilder; class PassManager; } // namespace FEXCore::IR +namespace FEXCore::Profiler { +struct ThreadStats; +}; + namespace FEXCore::Core { // Special-purpose 
replacement for std::unique_ptr to allow InternalThreadState to be standard layout. @@ -95,6 +99,8 @@ struct InternalThreadState : public FEXCore::Allocator::FEXAllocOperators { std::shared_mutex ObjectCacheRefCounter {}; + FEXCore::Profiler::ThreadStats* ThreadStats {}; + ///< Data pointer for exclusive use by the frontend void* FrontendPtr; diff --git a/FEXCore/include/FEXCore/Utils/Profiler.h b/FEXCore/include/FEXCore/Utils/Profiler.h index 059d49d064..3653dafc45 100644 --- a/FEXCore/include/FEXCore/Utils/Profiler.h +++ b/FEXCore/include/FEXCore/Utils/Profiler.h @@ -1,13 +1,72 @@ // SPDX-License-Identifier: MIT #pragma once +#include #include #include +#ifdef _M_X86_64 +#include +#endif + #include namespace FEXCore::Profiler { +// FEXCore live-stats +constexpr uint8_t STATS_VERSION = 1; +enum class AppType : uint8_t { + LINUX_32, + LINUX_64, + WIN_ARM64EC, + WIN_WOW64, +}; + +struct ThreadStatsHeader { + uint8_t Version; + AppType app_type; + uint8_t _pad[2]; + char fex_version[48]; + std::atomic Head; + std::atomic Size; +}; + +struct ThreadStats { + std::atomic Next; + std::atomic TID; + + // Accumulated time (In unscaled CPU cycles!) + uint64_t AccumulatedJITTime; + uint64_t AccumulatedSignalTime; + + // Accumulated event counts + uint64_t AccumulatedSIGBUSCount; + uint64_t AccumulatedSMCCount; +}; + #ifdef ENABLE_FEXCORE_PROFILER +#ifdef _M_ARM_64 +/** + * @brief Get the raw cycle counter which is synchronizing. + * + * `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature. 
+ */ +static inline uint64_t GetCycleCounter() { + uint64_t Result {}; + __asm volatile(R"( + isb; + mrs %[Res], CNTVCT_EL0; + )" + : [Res] "=r"(Result)); + return Result; +} +#else +static inline uint64_t GetCycleCounter() { + unsigned dummy; + uint64_t tsc = __rdtscp(&dummy); + return tsc; +} +#endif + FEX_DEFAULT_VISIBILITY void Init(); FEX_DEFAULT_VISIBILITY void Shutdown(); FEX_DEFAULT_VISIBILITY void TraceObject(std::string_view const Format); @@ -34,6 +93,36 @@ class ProfilerBlock final { // Declare a scoped profile block variable with a fixed name. #define FEXCORE_PROFILE_SCOPED(name) FEXCore::Profiler::ProfilerBlock UniqueScopeName(ScopedBlock_, __LINE__)(name) +template +class AccumulationBlock final { +public: + AccumulationBlock(T* Stat) + : Begin {GetCycleCounter()} + , Stat {Stat} {} + + ~AccumulationBlock() { + const auto Duration = GetCycleCounter() - Begin + FlatOffset; + if (Stat) { + auto ref = std::atomic_ref(*Stat); + ref.fetch_add(Duration, std::memory_order_relaxed); + } + } + +private: + uint64_t Begin; + T* Stat; +}; + +#define FEXCORE_PROFILE_ACCUMULATION(ThreadState, Stat) \ + FEXCore::Profiler::AccumulationBlockThreadStats->Stat)> UniqueScopeName(ScopedAccumulation_, __LINE__)( \ + ThreadState->ThreadStats ? &ThreadState->ThreadStats->Stat : nullptr); +#define FEXCORE_PROFILE_INSTANT_INCREMENT(ThreadState, Stat, value) \ + do { \ + if (ThreadState->ThreadStats) { \ + ThreadState->ThreadStats->Stat += value; \ + } \ + } while (0) + #else [[maybe_unused]] static void Init() {} @@ -50,5 +139,12 @@ static void TraceObject(std::string_view const, uint64_t) {} #define FEXCORE_PROFILE_SCOPED(...) \ do { \ } while (0) +#define FEXCORE_PROFILE_ACCUMULATION(...) \ + do { \ + } while (0) +#define FEXCORE_PROFILE_INSTANT_INCREMENT(...) 
\ + do { \ + } while (0) + #endif } // namespace FEXCore::Profiler From 6c666361e946fdb6d9e1a0de7c9dda339408b00e Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 21 Jan 2025 19:10:10 -0800 Subject: [PATCH 3/9] Profiler: Sprinkle the profile stats around For the four things we care about --- FEXCore/Source/Interface/Core/Core.cpp | 3 ++- .../LinuxSyscalls/SignalDelegator.cpp | 2 ++ .../LinuxSyscalls/SyscallsSMCTracking.cpp | 1 + Source/Windows/ARM64EC/Module.cpp | 21 ++++++++++++------- Source/Windows/WOW64/Module.cpp | 7 ++++++- 5 files changed, 24 insertions(+), 10 deletions(-) diff --git a/FEXCore/Source/Interface/Core/Core.cpp b/FEXCore/Source/Interface/Core/Core.cpp index fcea488510..cc3600c580 100644 --- a/FEXCore/Source/Interface/Core/Core.cpp +++ b/FEXCore/Source/Interface/Core/Core.cpp @@ -831,8 +831,9 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT } uintptr_t ContextImpl::CompileBlock(FEXCore::Core::CpuStateFrame* Frame, uint64_t GuestRIP, uint64_t MaxInst) { - FEXCORE_PROFILE_SCOPED("CompileBlock"); auto Thread = Frame->Thread; + FEXCORE_PROFILE_SCOPED("CompileBlock"); + FEXCORE_PROFILE_ACCUMULATION(Thread, AccumulatedJITTime); // Invalidate might take a unique lock on this, to guarantee that during invalidation no code gets compiled auto lk = GuardSignalDeferringSection(CodeInvalidationMutex, Thread); diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.cpp index 151e8d9b66..5635984aee 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.cpp @@ -59,6 +59,7 @@ static FEX::HLE::ThreadStateObject* GetThreadFromAltStack(const stack_t& alt_sta static void SignalHandlerThunk(int Signal, siginfo_t* Info, void* UContext) { ucontext_t* _context = (ucontext_t*)UContext; auto ThreadObject = GetThreadFromAltStack(_context->uc_stack); + 
FEXCORE_PROFILE_ACCUMULATION(ThreadObject->Thread, AccumulatedSignalTime); ThreadObject->SignalInfo.Delegator->HandleSignal(ThreadObject, Signal, Info, UContext); } @@ -916,6 +917,7 @@ SignalDelegator::SignalDelegator(FEXCore::Context::Context* _CTX, const std::str return false; } + FEXCORE_PROFILE_INSTANT_INCREMENT(Thread, AccumulatedSIGBUSCount, 1); const auto Delegator = FEX::HLE::ThreadManager::GetStateObjectFromFEXCoreThread(Thread)->SignalInfo.Delegator; const auto Result = FEXCore::ArchHelpers::Arm64::HandleUnalignedAccess(Thread, Delegator->GetUnalignedHandlerType(), PC, ArchHelpers::Context::GetArmGPRs(ucontext)); diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/SyscallsSMCTracking.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/SyscallsSMCTracking.cpp index b81242d95a..8edfa70c4d 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/SyscallsSMCTracking.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/SyscallsSMCTracking.cpp @@ -97,6 +97,7 @@ bool SyscallHandler::HandleSegfault(FEXCore::Core::InternalThreadState* Thread, }); } + FEXCORE_PROFILE_INSTANT_INCREMENT(Thread, AccumulatedSMCCount, 1); return true; } } diff --git a/Source/Windows/ARM64EC/Module.cpp b/Source/Windows/ARM64EC/Module.cpp index bc085b92da..52a2e55ea6 100644 --- a/Source/Windows/ARM64EC/Module.cpp +++ b/Source/Windows/ARM64EC/Module.cpp @@ -255,12 +255,14 @@ struct alignas(16) KiUserExceptionDispatcherStackLayout { }; static bool HandleUnalignedAccess(ARM64_NT_CONTEXT& Context) { - if (!CTX->IsAddressInCodeBuffer(GetCPUArea().ThreadState(), Context.Pc)) { + auto Thread = GetCPUArea().ThreadState(); + if (!CTX->IsAddressInCodeBuffer(Thread, Context.Pc)) { return false; } - const auto Result = FEXCore::ArchHelpers::Arm64::HandleUnalignedAccess(GetCPUArea().ThreadState(), - HandlerConfig->GetUnalignedHandlerType(), Context.Pc, &Context.X0); + FEXCORE_PROFILE_INSTANT_INCREMENT(Thread, AccumulatedSIGBUSCount, 1); + const auto Result = + 
FEXCore::ArchHelpers::Arm64::HandleUnalignedAccess(Thread, HandlerConfig->GetUnalignedHandlerType(), Context.Pc, &Context.X0); if (!Result.first) { return false; } @@ -590,19 +592,22 @@ class ScopedCallbackDisable { // Returns true if exception dispatch should be halted and the execution context restored to NativeContext bool ResetToConsistentStateImpl(EXCEPTION_RECORD* Exception, CONTEXT* GuestContext, ARM64_NT_CONTEXT* NativeContext) { const auto CPUArea = GetCPUArea(); + auto Thread = CPUArea.ThreadState(); + FEXCORE_PROFILE_ACCUMULATION(Thread, AccumulatedSignalTime); LogMan::Msg::DFmt("Exception: Code: {:X} Address: {:X}", Exception->ExceptionCode, reinterpret_cast(Exception->ExceptionAddress)); - if (Exception->ExceptionCode == EXCEPTION_ACCESS_VIOLATION && CPUArea.ThreadState() && InvalidationTracker) { + if (Exception->ExceptionCode == EXCEPTION_ACCESS_VIOLATION && Thread && InvalidationTracker) { const auto FaultAddress = static_cast(Exception->ExceptionInformation[1]); std::scoped_lock Lock(ThreadCreationMutex); if (InvalidationTracker->HandleRWXAccessViolation(FaultAddress)) { - if (CTX->IsAddressInCodeBuffer(CPUArea.ThreadState(), NativeContext->Pc) && !CTX->IsCurrentBlockSingleInst(CPUArea.ThreadState()) && - CTX->IsAddressInCurrentBlock(CPUArea.ThreadState(), FaultAddress, 8)) { + FEXCORE_PROFILE_INSTANT_INCREMENT(Thread, AccumulatedSMCCount, 1); + if (CTX->IsAddressInCodeBuffer(Thread, NativeContext->Pc) && !CTX->IsCurrentBlockSingleInst(CPUArea.ThreadState()) && + CTX->IsAddressInCurrentBlock(Thread, FaultAddress, 8)) { // If we are not patching ourself (single inst block case) and patching the current block, this is inline SMC. Reconstruct the current context (before the SMC write) then single step the write to reduce it to regular SMC. 
- Exception::ReconstructThreadState(CPUArea.ThreadState(), *NativeContext); + Exception::ReconstructThreadState(Thread, *NativeContext); LogMan::Msg::DFmt("Handled inline self-modifying code: pc: {:X} rip: {:X} fault: {:X}", NativeContext->Pc, - CPUArea.ThreadState()->CurrentFrame->State.rip, FaultAddress); + Thread->CurrentFrame->State.rip, FaultAddress); NativeContext->Pc = CPUArea.DispatcherLoopTopEnterECFillSRA(); NativeContext->Sp = CPUArea.EmulatorStackBase(); NativeContext->X10 = 1; // Set ENTRY_FILL_SRA_SINGLE_INST_REG to force a single step diff --git a/Source/Windows/WOW64/Module.cpp b/Source/Windows/WOW64/Module.cpp index 8a6e552283..a971107539 100644 --- a/Source/Windows/WOW64/Module.cpp +++ b/Source/Windows/WOW64/Module.cpp @@ -519,6 +519,8 @@ void BTCpuThreadTerm(HANDLE Thread, LONG ExitCode) { return; } + auto* OldThreadState = TLS.ThreadState(); + THREAD_BASIC_INFORMATION Info; if (NTSTATUS Err = NtQueryInformationThread(Thread, ThreadBasicInformation, &Info, sizeof(Info), nullptr); Err) { return; @@ -530,7 +532,7 @@ void BTCpuThreadTerm(HANDLE Thread, LONG ExitCode) { Threads.erase(ThreadTID); } - CTX->DestroyThread(TLS.ThreadState()); + CTX->DestroyThread(OldThreadState); if (ThreadTID == GetCurrentThreadId()) { FEX::Windows::DeinitCRTThread(); } @@ -686,6 +688,7 @@ bool BTCpuResetToConsistentStateImpl(EXCEPTION_POINTERS* Ptrs) { auto* Context = Ptrs->ContextRecord; auto* Exception = Ptrs->ExceptionRecord; auto Thread = GetTLS().ThreadState(); + FEXCORE_PROFILE_ACCUMULATION(Thread, AccumulatedSignalTime); if (Exception->ExceptionCode == EXCEPTION_ACCESS_VIOLATION) { const auto FaultAddress = static_cast(Exception->ExceptionInformation[1]); @@ -701,6 +704,7 @@ bool BTCpuResetToConsistentStateImpl(EXCEPTION_POINTERS* Ptrs) { if (Thread) { std::scoped_lock Lock(ThreadCreationMutex); + FEXCORE_PROFILE_INSTANT_INCREMENT(Thread, AccumulatedSMCCount, 1); if (InvalidationTracker->HandleRWXAccessViolation(FaultAddress)) { LogMan::Msg::DFmt("Handled 
self-modifying code: pc: {:X} fault: {:X}", Context->Pc, FaultAddress); return true; @@ -712,6 +716,7 @@ bool BTCpuResetToConsistentStateImpl(EXCEPTION_POINTERS* Ptrs) { return false; } + FEXCORE_PROFILE_INSTANT_INCREMENT(Thread, AccumulatedSIGBUSCount, 1); if (Exception->ExceptionCode == EXCEPTION_DATATYPE_MISALIGNMENT && Context::HandleUnalignedAccess(Context)) { LogMan::Msg::DFmt("Handled unaligned atomic: new pc: {:X}", Context->Pc); return true; From f6d9d3dc03fafe7c0cc5586a63e2b25e62e987bf Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 21 Jan 2025 19:15:02 -0800 Subject: [PATCH 4/9] WinAPI: Implement support for DeleteFile --- Source/Windows/Common/WinAPI/IO.cpp | 31 +++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/Source/Windows/Common/WinAPI/IO.cpp b/Source/Windows/Common/WinAPI/IO.cpp index d453f509be..16ad036a33 100644 --- a/Source/Windows/Common/WinAPI/IO.cpp +++ b/Source/Windows/Common/WinAPI/IO.cpp @@ -45,6 +45,37 @@ FILE_INFORMATION_CLASS FileInfoClassToNT(FILE_INFO_BY_HANDLE_CLASS InformationCl } } // namespace +DLLEXPORT_FUNC(BOOL, DeleteFileA, (LPCSTR lpFileName)) { + ScopedUnicodeString FileName {lpFileName}; + return DeleteFileW(FileName->Buffer); +} + +DLLEXPORT_FUNC(BOOL, DeleteFileW, (LPCWSTR lpFileName)) { + UNICODE_STRING PathW; + RtlInitUnicodeString(&PathW, lpFileName); + + ScopedUnicodeString NTPath; + if (!RtlDosPathNameToNtPathName_U(PathW.Buffer, &*NTPath, nullptr, nullptr)) { + SetLastError(ERROR_PATH_NOT_FOUND); + return false; + } + + OBJECT_ATTRIBUTES ObjAttributes; + InitializeObjectAttributes(&ObjAttributes, &*NTPath, OBJ_CASE_INSENSITIVE, nullptr, nullptr); + + HANDLE Handle; + IO_STATUS_BLOCK IOSB; + + NTSTATUS Status = + NtCreateFile(&Handle, SYNCHRONIZE | DELETE, &ObjAttributes, &IOSB, nullptr, 0, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + FILE_OPEN, FILE_DELETE_ON_CLOSE | FILE_NON_DIRECTORY_FILE, nullptr, 0); + if (WinAPIReturn(Status)) { + Status = NtClose(Handle); + } + + 
return WinAPIReturn(Status); +} + DLLEXPORT_FUNC(HANDLE, CreateFileA, (LPCSTR lpFileName, DWORD dwDesiredAccess, DWORD dwShareMode, LPSECURITY_ATTRIBUTES lpSecurityAttributes, DWORD dwCreationDisposition, DWORD dwFlagsAndAttributes, HANDLE hTemplateFile)) { From 13ebda4c97d3d630f5ed188aa35a8d6fefcd783f Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 21 Jan 2025 19:18:34 -0800 Subject: [PATCH 5/9] Common: Implement a base profiler implementation Not wired up to anything. Requires the frontends to allocate shared memory in the expected way. --- Source/Common/CMakeLists.txt | 3 +- Source/Common/Profiler.cpp | 120 +++++++++++++++++++++++++++++++++++ Source/Common/Profiler.h | 67 +++++++++++++++++++ 3 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 Source/Common/Profiler.cpp create mode 100644 Source/Common/Profiler.h diff --git a/Source/Common/CMakeLists.txt b/Source/Common/CMakeLists.txt index 241df8e5aa..d0b0d29a30 100644 --- a/Source/Common/CMakeLists.txt +++ b/Source/Common/CMakeLists.txt @@ -7,7 +7,8 @@ set(SRCS EnvironmentLoader.cpp HostFeatures.cpp JSONPool.cpp - StringUtil.cpp) + StringUtil.cpp + Profiler.cpp) if (NOT MINGW_BUILD) list (APPEND SRCS diff --git a/Source/Common/Profiler.cpp b/Source/Common/Profiler.cpp new file mode 100644 index 0000000000..70001a7621 --- /dev/null +++ b/Source/Common/Profiler.cpp @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: MIT +#include "Common/Profiler.h" +#include "git_version.h" + +#include + +namespace FEX::Profiler { +void StatAllocBase::SaveHeader(FEXCore::Profiler::AppType AppType) { + if (!Base) { + return; + } + + Head = reinterpret_cast(Base); + Head->Size.store(CurrentSize, std::memory_order_relaxed); + Head->Version = FEXCore::Profiler::STATS_VERSION; + + constexpr std::array::length(GIT_DESCRIBE_STRING) + 1> GitString = {GIT_DESCRIBE_STRING}; + strncpy(Head->fex_version, GitString.data(), std::min(GitString.size(), sizeof(Head->fex_version))); + Head->app_type = AppType; + + Stats = 
reinterpret_cast(reinterpret_cast(Base) + sizeof(FEXCore::Profiler::ThreadStatsHeader)); + + RemainingSlots = TotalSlotsFromSize(); +} + +bool StatAllocBase::AllocateMoreSlots() { + const auto OriginalSlotCount = TotalSlotsFromSize(); + + uint64_t NewSize = AllocateMoreSlots(CurrentSize * 2); + + if (NewSize == CurrentSize) { + return false; + } + + CurrentSize = NewSize; + Head->Size.store(CurrentSize, std::memory_order_relaxed); + RemainingSlots = TotalSlotsFromSize() - OriginalSlotCount; + + return true; +} + +FEXCore::Profiler::ThreadStats* StatAllocBase::AllocateBaseSlot(uint32_t TID) { + if (!RemainingSlots) { + if (!AllocateMoreSlots()) { + return nullptr; + } + } + + // Find a free slot + memory_barrier(); + FEXCore::Profiler::ThreadStats* AllocatedSlot {}; + for (size_t i = 0; i < TotalSlotsFromSize(); ++i) { + AllocatedSlot = &Stats[i]; + if (AllocatedSlot->TID.load(std::memory_order_relaxed) == 0) { + break; + } + } + + --RemainingSlots; + + // Slot might be reused, zero exactly one ThreadStats slot (not the larger header) so the neighbouring slot is left intact. + memset(AllocatedSlot, 0, sizeof(FEXCore::Profiler::ThreadStats)); + + // TID != 0 means slot is allocated. + AllocatedSlot->TID.store(TID, std::memory_order_relaxed); + + // Setup singly-linked list + if (Head->Head.load(std::memory_order_relaxed) == 0) { + Head->Head.store(OffsetFromStat(AllocatedSlot), std::memory_order_relaxed); + } else { + StatTail->Next.store(OffsetFromStat(AllocatedSlot), std::memory_order_relaxed); + } + + // Update the tail. + StatTail = AllocatedSlot; + return AllocatedSlot; +} + +void StatAllocBase::DeallocateBaseSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot) { + if (!AllocatedSlot) { + return; + } + + // TID == 0 will signal the reader to ignore this slot & deallocate it! 
+ AllocatedSlot->TID.store(0, std::memory_order_relaxed); + + memory_barrier(); + + const auto SlotOffset = OffsetFromStat(AllocatedSlot); + const auto AllocatedSlotNext = AllocatedSlot->Next.load(std::memory_order_relaxed); + + const bool IsTail = AllocatedSlot == StatTail; + + // Update the linked list. + if (Head->Head == SlotOffset) { + Head->Head.store(AllocatedSlotNext, std::memory_order_relaxed); + if (IsTail) { + StatTail = nullptr; + } + } else { + for (size_t i = 0; i < TotalSlotsFromSize(); ++i) { + auto Slot = &Stats[i]; + auto NextSlotOffset = Slot->Next.load(std::memory_order_relaxed); + + if (NextSlotOffset == SlotOffset) { + Slot->Next.store(AllocatedSlotNext, std::memory_order_relaxed); + + if (IsTail) { + // This slot is now the tail. + StatTail = Slot; + } + break; + } + } + } + + ++RemainingSlots; +} + +} // namespace FEX::Profiler diff --git a/Source/Common/Profiler.h b/Source/Common/Profiler.h new file mode 100644 index 0000000000..023678ce8b --- /dev/null +++ b/Source/Common/Profiler.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +/* +$info$ +tags: Common|Profiler +desc: Frontend profiler common code +$end_info$ +*/ +#pragma once +#include + +namespace FEXCore::Core { +struct InternalThreadState; +} + +#ifdef _M_ARM_64 +static inline void memory_barrier() { + asm volatile("dmb ishst;" ::: "memory"); +} + +#else +static inline void memory_barrier() { + // Intentionally empty. 
+} +#endif + +namespace FEX::Profiler { +class StatAllocBase { +public: + virtual ~StatAllocBase() = default; + +protected: + FEXCore::Profiler::ThreadStats* AllocateBaseSlot(uint32_t TID); + void DeallocateBaseSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot); + + uint32_t OffsetFromStat(FEXCore::Profiler::ThreadStats* Stat) const { + return reinterpret_cast(Stat) - reinterpret_cast(Base); + } + size_t TotalSlotsFromSize() const { + return (CurrentSize - sizeof(FEXCore::Profiler::ThreadStatsHeader)) / sizeof(FEXCore::Profiler::ThreadStats) - 1; + } + size_t SlotIndexFromOffset(uint32_t Offset) { + return (Offset - sizeof(FEXCore::Profiler::ThreadStatsHeader)) / sizeof(FEXCore::Profiler::ThreadStats); + } + + void SaveHeader(FEXCore::Profiler::AppType AppType); + + void* Base {}; + size_t CurrentSize {}; + FEXCore::Profiler::ThreadStatsHeader* Head {}; + FEXCore::Profiler::ThreadStats* Stats {}; + FEXCore::Profiler::ThreadStats* StatTail {}; + uint64_t RemainingSlots {}; + + // Limited to 4MB which should be a few hundred threads of tracking capability. + // I (Sonicadvance1) wanted to reserve 128MB of VA space because it's cheap, but ran in to a bug when running WINE. + // WINE allocates [0x7fff'fe00'0000, 0x7fff'ffff'0000) which /consistently/ overlaps with FEX's sigaltstack. + // This only occurs when this stat allocation size is large as the top-down allocation pushes the alt-stack further. + // Additionally, only occurs on 48-bit VA systems, as mmap on lesser VA will fail regardless. + // TODO: Bump allocation size up once FEXCore's allocator can first use the 128TB of blocked VA space on 48-bit systems. 
+ constexpr static size_t MAX_STATS_SIZE = 4 * 1024 * 1024; + +private: + virtual uint64_t AllocateMoreSlots(uint64_t NewSize) = 0; + bool AllocateMoreSlots(); +}; + +} // namespace FEX::Profiler From 3bdc69dd70ca2b92774e9fb06a72de8ee8fc0ebe Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 21 Jan 2025 19:20:32 -0800 Subject: [PATCH 6/9] LinuxSyscalls: Implements support for Linux side profile stats This is fairly straightforward. It creates the shared memory region in /dev/shm/fex--stats so that Mangohud can sample it. --- .../LinuxSyscalls/SignalDelegator.cpp | 3 + .../LinuxEmulation/LinuxSyscalls/Syscalls.cpp | 1 + .../LinuxSyscalls/Syscalls/Thread.cpp | 4 + .../LinuxSyscalls/ThreadManager.cpp | 169 +++++++++++++++++- .../LinuxSyscalls/ThreadManager.h | 34 ++++ 5 files changed, 208 insertions(+), 3 deletions(-) diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.cpp index 5635984aee..88226efc75 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/SignalDelegator.cpp @@ -18,6 +18,7 @@ desc: Handles host -> host and host -> guest signal routing, emulates procmask & #include #include #include +#include #include #include @@ -674,6 +675,8 @@ void SignalDelegator::HandleGuestSignal(FEX::HLE::ThreadStateObject* ThreadObjec SaveTelemetry(); #endif + FEX::HLE::_SyscallHandler->TM.CleanupForExit(); + // Reassign back to DFL and crash signal(Signal, SIG_DFL); if (SigInfo.si_code != SI_KERNEL) { diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls.cpp index 58a4a2dacc..d929a5c165 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls.cpp @@ -883,6 +883,7 @@ uint64_t UnimplementedSyscallSafe(FEXCore::Core::CpuStateFrame* Frame, uint64_t } void 
SyscallHandler::LockBeforeFork(FEXCore::Core::InternalThreadState* Thread) { + TM.LockBeforeFork(); Thread->CTX->LockBeforeFork(Thread); VMATracking.Mutex.lock(); } diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp index 8b5e380ac6..ea27d41737 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp @@ -61,6 +61,9 @@ static void* ThreadHandler(void* Data) { Thread->ThreadInfo.PID = ::getpid(); Thread->ThreadInfo.TID = FHU::Syscalls::gettid(); + if (Thread->Thread->ThreadStats) { + Thread->Thread->ThreadStats->TID.store(Thread->ThreadInfo.TID, std::memory_order_relaxed); + } FEX::HLE::_SyscallHandler->RegisterTLSState(Thread); @@ -558,6 +561,7 @@ void RegisterThread(FEX::HLE::SyscallHandler* Handler) { [](FEXCore::Core::CpuStateFrame* Frame, int status) -> uint64_t { // Save telemetry if we're exiting. FEX::HLE::_SyscallHandler->GetSignalDelegator()->SaveTelemetry(); + FEX::HLE::_SyscallHandler->TM.CleanupForExit(); syscall(SYSCALL_DEF(exit_group), status); // This will never be reached diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp index 106e5a4cb1..3a1d156bc0 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp @@ -4,8 +4,160 @@ #include "LinuxSyscalls/SignalDelegator.h" #include +#include +#include + +#include +#include +#include +#include namespace FEX::HLE { + +ThreadManager::StatAlloc::StatAlloc() { + Initialize(); + SaveHeader(Is64BitMode() ? 
FEXCore::Profiler::AppType::LINUX_64 : FEXCore::Profiler::AppType::LINUX_32); +} + +void ThreadManager::StatAlloc::Initialize() { + if (!ProfileStats()) { + return; + } + + int fd = shm_open(fextl::fmt::format("fex-{}-stats", ::getpid()).c_str(), O_CREAT | O_TRUNC | O_RDWR, USER_PERMS); + if (fd == -1) { + return; + } + CurrentSize = sysconf(_SC_PAGESIZE); + if (CurrentSize == 0) { + CurrentSize = 4096; + } + + if (ftruncate(fd, CurrentSize) == -1) { + LogMan::Msg::EFmt("[StatAlloc] ftruncate failed"); + goto err; + } + + // Reserve MAX_STATS_SIZE of address space up front; the shared mapping below grows inside this reservation. + Base = ::mmap(nullptr, MAX_STATS_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0); + if (Base == MAP_FAILED) { + LogMan::Msg::EFmt("[StatAlloc] mmap base failed"); + Base = nullptr; + goto err; + } + + // Allocate a small working shared space for now, grow as necessary. + { + auto SharedBase = ::mmap(Base, CurrentSize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0); + if (SharedBase == MAP_FAILED) { + LogMan::Msg::EFmt("[StatAlloc] mmap shm failed"); + munmap(Base, MAX_STATS_SIZE); + Base = nullptr; + goto err; + } + } + +err: + close(fd); +} + +uint64_t ThreadManager::StatAlloc::AllocateMoreSlots(uint64_t NewSize) { + if (CurrentSize == MAX_STATS_SIZE) { + // Nope. + return CurrentSize; + } + NewSize = std::min(MAX_STATS_SIZE, NewSize); + + // When allocating more slots, open the fd without O_TRUNC | O_CREAT. + int fd = shm_open(fextl::fmt::format("fex-{}-stats", ::getpid()).c_str(), O_RDWR, USER_PERMS); + if (fd == -1) { + return CurrentSize; + } + + if (ftruncate(fd, NewSize) == -1) { + LogMan::Msg::EFmt("[StatAlloc] ftruncate more failed"); + + goto err; + } + + { + auto SharedBase = ::mmap(Base, NewSize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0); + if (SharedBase == MAP_FAILED) { + LogMan::Msg::EFmt("[StatAlloc] allocate more mmap shm failed"); + goto err; + } + + // TODO: Just a sanity check. 
+ const char* SharedTest = (const char*)Base; + for (size_t i = CurrentSize; i < NewSize; ++i) { + if (SharedTest[i] != 0) { + LogMan::Msg::EFmt("truncate and map shared resulted in not zero'd memory!"); + } + } + } + +err: + close(fd); + return NewSize; +} + +FEXCore::Profiler::ThreadStats* ThreadManager::StatAlloc::AllocateSlot(uint32_t TID) { + std::scoped_lock lk(StatMutex); + return AllocateBaseSlot(TID); +} + +void ThreadManager::StatAlloc::DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot) { + if (!AllocatedSlot) { + return; + } + + std::scoped_lock lk(StatMutex); + DeallocateBaseSlot(AllocatedSlot); +} + +void ThreadManager::StatAlloc::CleanupForExit() { + shm_unlink(fextl::fmt::format("fex-{}-stats", ::getpid()).c_str()); +} + +void ThreadManager::StatAlloc::LockBeforeFork() { + if (!ProfileStats()) { + return; + } + StatMutex.lock(); +} + +void ThreadManager::StatAlloc::UnlockAfterFork(FEXCore::Core::InternalThreadState* Thread, bool Child) { + if (!ProfileStats()) { + return; + } + + if (!Child) { + StatMutex.unlock(); + return; + } + + StatMutex.StealAndDropActiveLocks(); + + // shm_memory tied to this process is now not owned by this process. + // Replace the shm region! Otherwise this process will keep reporting time in the original parent thread's stats region! + munmap(Base, MAX_STATS_SIZE); + Base = nullptr; + CurrentSize = 0; + Head = nullptr; + Stats = nullptr; + StatTail = nullptr; + RemainingSlots = 0; + + Thread->ThreadStats = nullptr; + + Initialize(); + SaveHeader(Is64BitMode() ? 
FEXCore::Profiler::AppType::LINUX_64 : FEXCore::Profiler::AppType::LINUX_32); + + // Update this thread's ThreadStats object + auto ThreadObject = FEX::HLE::ThreadManager::GetStateObjectFromFEXCoreThread(Thread); + ThreadObject->Thread->ThreadStats = AllocateSlot(ThreadObject->ThreadInfo.TID); +} + FEX::HLE::ThreadStateObject* ThreadManager::CreateThread(uint64_t InitialRIP, uint64_t StackPointer, const FEXCore::Core::CPUState* NewThreadState, uint64_t ParentTID, FEX::HLE::ThreadStateObject* InheritThread) { auto ThreadStateObject = new FEX::HLE::ThreadStateObject; @@ -13,12 +165,13 @@ FEX::HLE::ThreadStateObject* ThreadManager::CreateThread(uint64_t InitialRIP, ui ThreadStateObject->ThreadInfo.parent_tid = ParentTID; ThreadStateObject->ThreadInfo.PID = ::getpid(); - if (ParentTID == 0) { - ThreadStateObject->ThreadInfo.TID = FHU::Syscalls::gettid(); - } + ThreadStateObject->ThreadInfo.TID = FHU::Syscalls::gettid(); ThreadStateObject->Thread = CTX->CreateThread(InitialRIP, StackPointer, NewThreadState, ParentTID); ThreadStateObject->Thread->FrontendPtr = ThreadStateObject; + if (ProfileStats()) { + ThreadStateObject->Thread->ThreadStats = Stat.AllocateSlot(ThreadStateObject->ThreadInfo.TID); + } if (InheritThread) { FEX::HLE::_SyscallHandler->SeccompEmulator.InheritSeccompFilters(InheritThread, ThreadStateObject); @@ -37,6 +190,8 @@ void ThreadManager::DestroyThread(FEX::HLE::ThreadStateObject* Thread, bool Need Threads.erase(It); } + Stat.DeallocateSlot(Thread->Thread->ThreadStats); + HandleThreadDeletion(Thread, NeedsTLSUninstall); } @@ -212,7 +367,12 @@ void ThreadManager::UnpauseThread(FEX::HLE::ThreadStateObject* Thread) { Thread->ThreadPaused.NotifyOne(); } +void ThreadManager::LockBeforeFork() { + Stat.LockBeforeFork(); +} + void ThreadManager::UnlockAfterFork(FEXCore::Core::InternalThreadState* LiveThread, bool Child) { + Stat.UnlockAfterFork(LiveThread, Child); if (!Child) { return; } @@ -220,6 +380,9 @@ void 
ThreadManager::UnlockAfterFork(FEXCore::Core::InternalThreadState* LiveThre // This function is called after fork // We need to cleanup some of the thread data that is dead for (auto& DeadThread : Threads) { + // This is not owned by the child after fork. + DeadThread->Thread->ThreadStats = nullptr; + if (DeadThread->Thread == LiveThread) { continue; } diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h index 2401a88357..949088853e 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h @@ -8,11 +8,14 @@ desc: Frontend thread management #pragma once +#include "Common/Profiler.h" + #include "LinuxSyscalls/Types.h" #include "LinuxSyscalls/Seccomp/SeccompEmulator.h" #include #include +#include #include #include @@ -105,6 +108,35 @@ class ThreadManager final { ~ThreadManager(); + class StatAlloc final : public FEX::Profiler::StatAllocBase { + public: + StatAlloc(); + + void LockBeforeFork(); + void UnlockAfterFork(FEXCore::Core::InternalThreadState* Thread, bool Child); + + void CleanupForExit(); + + FEXCore::Profiler::ThreadStats* AllocateSlot(uint32_t TID); + void DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot); + + private: + void Initialize(); + + uint64_t AllocateMoreSlots(uint64_t NewSize) override; + FEX_CONFIG_OPT(ProfileStats, PROFILESTATS); + FEX_CONFIG_OPT(Is64BitMode, IS64BIT_MODE); + + constexpr static int USER_PERMS = S_IRWXU | S_IRWXG | S_IRWXO; + FEXCore::ForkableUniqueMutex StatMutex; + }; + + void CleanupForExit() { + Stat.CleanupForExit(); + } + + StatAlloc Stat; + ///< Returns the ThreadStateObject from a CpuStateFrame object. 
static inline FEX::HLE::ThreadStateObject* GetStateObjectFromCPUState(FEXCore::Core::CpuStateFrame* Frame) { return static_cast(Frame->Thread->FrontendPtr); @@ -136,6 +168,7 @@ class ThreadManager final { void SleepThread(FEXCore::Context::Context* CTX, FEXCore::Core::CpuStateFrame* Frame); + void LockBeforeFork(); void UnlockAfterFork(FEXCore::Core::InternalThreadState* Thread, bool Child); void IncrementIdleRefCount() { @@ -188,6 +221,7 @@ class ThreadManager final { void HandleThreadDeletion(FEX::HLE::ThreadStateObject* Thread, bool NeedsTLSUninstall = false); void NotifyPause(); + FEX_CONFIG_OPT(ProfileStats, PROFILESTATS); }; } // namespace FEX::HLE From 46dca8afaac1287c2c2b83d39f1e57a348e1a3a8 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 21 Jan 2025 19:22:54 -0800 Subject: [PATCH 7/9] Wine: Implements support for profile stats This is a little trickier: we actually open the `/dev/shm/fex-<pid>-stats` file directly using Windows APIs so that Mangohud (which is going to be on the Linux side, or potentially even embedded into Gamescope) can safely pick up the stats. It's a little quirky and doesn't support expanding its size since WINE doesn't support NtExtendSection, but that's fine.
--- Source/Windows/ARM64EC/Module.cpp | 20 ++++++++- Source/Windows/Common/CMakeLists.txt | 2 +- Source/Windows/Common/Profiler.cpp | 65 ++++++++++++++++++++++++++++ Source/Windows/Common/Profiler.h | 28 ++++++++++++ Source/Windows/WOW64/Module.cpp | 21 ++++++++- 5 files changed, 131 insertions(+), 5 deletions(-) create mode 100644 Source/Windows/Common/Profiler.cpp create mode 100644 Source/Windows/Common/Profiler.h diff --git a/Source/Windows/ARM64EC/Module.cpp b/Source/Windows/ARM64EC/Module.cpp index 52a2e55ea6..5d09600899 100644 --- a/Source/Windows/ARM64EC/Module.cpp +++ b/Source/Windows/ARM64EC/Module.cpp @@ -37,6 +37,7 @@ desc: Implements the ARM64EC BT module API using FEXCore #include "Common/CRT/CRT.h" #include "DummyHandlers.h" #include "BTInterface.h" +#include "Windows/Common/Profiler.h" #include #include @@ -122,6 +123,7 @@ namespace { fextl::unique_ptr CTX; fextl::unique_ptr SignalDelegator; fextl::unique_ptr SyscallHandler; +fextl::unique_ptr StatAllocHandler; std::optional InvalidationTracker; std::optional CPUFeatures; std::optional OvercommitTracker; @@ -569,10 +571,17 @@ NTSTATUS ProcessInit() { const uintptr_t KiUserExceptionDispatcherFFS = reinterpret_cast(GetProcAddress(NtDll, "KiUserExceptionDispatcher")); Exception::KiUserExceptionDispatcher = NtDllRedirectionLUT[KiUserExceptionDispatcherFFS - NtDllBase] + NtDllBase; + FEX_CONFIG_OPT(ProfileStats, PROFILESTATS); + + if (IsWine && ProfileStats()) { + StatAllocHandler = fextl::make_unique(FEXCore::Profiler::AppType::WIN_ARM64EC); + } return STATUS_SUCCESS; } -void ProcessTerm(HANDLE Handle, BOOL After, NTSTATUS Status) {} +void ProcessTerm(HANDLE Handle, BOOL After, NTSTATUS Status) { + StatAllocHandler.reset(); +} class ScopedCallbackDisable { private: @@ -808,7 +817,11 @@ NTSTATUS ThreadInit() { { std::scoped_lock Lock(ThreadCreationMutex); - Threads.emplace(GetCurrentThreadId(), Thread); + auto ThreadTID = GetCurrentThreadId(); + Threads.emplace(ThreadTID, Thread); + if (StatAllocHandler) 
{ + Thread->ThreadStats = StatAllocHandler->AllocateSlot(ThreadTID); + } } CPUArea.ThreadState() = Thread; @@ -833,6 +846,9 @@ NTSTATUS ThreadTerm(HANDLE Thread, LONG ExitCode) { { std::scoped_lock Lock(ThreadCreationMutex); Threads.erase(ThreadTID); + if (StatAllocHandler) { + StatAllocHandler->DeallocateSlot(OldThreadState->ThreadStats); + } } CTX->DestroyThread(OldThreadState); diff --git a/Source/Windows/Common/CMakeLists.txt b/Source/Windows/Common/CMakeLists.txt index 8d92d8f18a..70a0e46c84 100644 --- a/Source/Windows/Common/CMakeLists.txt +++ b/Source/Windows/Common/CMakeLists.txt @@ -1,4 +1,4 @@ -add_library(CommonWindows STATIC CPUFeatures.cpp InvalidationTracker.cpp Logging.cpp LoadConfig.S) +add_library(CommonWindows STATIC CPUFeatures.cpp Profiler.cpp InvalidationTracker.cpp Logging.cpp LoadConfig.S) add_subdirectory(CRT) add_subdirectory(WinAPI) target_link_libraries(CommonWindows FEXCore_Base JemallocLibs) diff --git a/Source/Windows/Common/Profiler.cpp b/Source/Windows/Common/Profiler.cpp new file mode 100644 index 0000000000..2dedde586e --- /dev/null +++ b/Source/Windows/Common/Profiler.cpp @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: MIT +#include "Windows/Common/Profiler.h" + +#include +#include + +#include +#include +#include +#include +#include + +namespace FEX::Windows { +__attribute__((naked)) uint64_t linux_getpid() { + asm volatile(R"( + mov x8, 172; + svc #0; + ret; + )" :: + : "r0", "r8"); +} + +uint64_t StatAlloc::AllocateMoreSlots(uint64_t NewSize) { + LogMan::Msg::DFmt("Ran out of slots. Can't allocate more"); + return CurrentSize; +} + +StatAlloc::StatAlloc(FEXCore::Profiler::AppType AppType) { + CurrentSize = MAX_STATS_SIZE; + + auto handle = CreateFile(fextl::fmt::format("/dev/shm/fex-{}-stats", linux_getpid()).c_str(), GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ, nullptr, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, nullptr); + + // Create the section mapping for the file handle for the full size. 
+ HANDLE SectionMapping; + LARGE_INTEGER SectionSize {{MAX_STATS_SIZE}}; + auto Result = NtCreateSection(&SectionMapping, SECTION_EXTEND_SIZE | SECTION_MAP_READ | SECTION_MAP_WRITE, nullptr, &SectionSize, + PAGE_READWRITE, SEC_COMMIT, handle); + if (Result != 0) { + CloseHandle(handle); + return; + } + + // Section mapping is used from now on. + CloseHandle(handle); + + // Now actually map the view of the section. + Base = 0; + size_t FullSize = MAX_STATS_SIZE; + Result = NtMapViewOfSection(SectionMapping, NtCurrentProcess(), &Base, 0, 0, nullptr, &FullSize, ViewUnmap, MEM_RESERVE | MEM_TOP_DOWN, + PAGE_READWRITE); + if (Result != 0) { + CloseHandle(SectionMapping); + return; + } + + // Once WINE supports NtExtendSection and SECTION_EXTEND_SIZE correctly then we can map/commit a single page, map the full MAX_STATS_SIZE + // view as reserved, and extend the view using NtExtendSection. + SaveHeader(AppType); +} +StatAlloc::~StatAlloc() { + DeleteFile(fextl::fmt::format("/dev/shm/fex-{}-stats", linux_getpid()).c_str()); +} + +} // namespace FEX::Windows diff --git a/Source/Windows/Common/Profiler.h b/Source/Windows/Common/Profiler.h new file mode 100644 index 0000000000..b12ad9631a --- /dev/null +++ b/Source/Windows/Common/Profiler.h @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +#pragma once + +#include "Common/Profiler.h" + +namespace FEX::Windows { +class StatAlloc final : public FEX::Profiler::StatAllocBase { +public: + StatAlloc(FEXCore::Profiler::AppType AppType); + virtual ~StatAlloc(); + + FEXCore::Profiler::ThreadStats* AllocateSlot(uint32_t TID) { + return AllocateBaseSlot(TID); + } + + void DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot) { + if (!AllocatedSlot) { + return; + } + + DeallocateBaseSlot(AllocatedSlot); + } + +private: + uint64_t AllocateMoreSlots(uint64_t NewSize) override; +}; + +} // namespace FEX::Windows diff --git a/Source/Windows/WOW64/Module.cpp b/Source/Windows/WOW64/Module.cpp index a971107539..bb55985c40 100644 --- 
a/Source/Windows/WOW64/Module.cpp +++ b/Source/Windows/WOW64/Module.cpp @@ -38,6 +38,7 @@ desc: Implements the WOW64 BT module API using FEXCore #include "Common/CRT/CRT.h" #include "DummyHandlers.h" #include "BTInterface.h" +#include "Windows/Common/Profiler.h" #include #include @@ -105,6 +106,7 @@ namespace BridgeInstrs { fextl::unique_ptr CTX; fextl::unique_ptr SignalDelegator; fextl::unique_ptr SyscallHandler; +fextl::unique_ptr StatAllocHandler; std::optional InvalidationTracker; std::optional CPUFeatures; @@ -499,9 +501,17 @@ void BTCpuProcessInit() { // wow64.dll will only initialise the cross-process queue if this is set GetTLS().Wow64Info().CpuFlags = WOW64_CPUFLAGS_SOFTWARE; + + FEX_CONFIG_OPT(ProfileStats, PROFILESTATS); + + if (IsWine && ProfileStats()) { + StatAllocHandler = fextl::make_unique(FEXCore::Profiler::AppType::WIN_WOW64); + } } -void BTCpuProcessTerm(HANDLE Handle, BOOL After, ULONG Status) {} +void BTCpuProcessTerm(HANDLE Handle, BOOL After, ULONG Status) { + StatAllocHandler.reset(); +} void BTCpuThreadInit() { FEX::Windows::InitCRTThread(); @@ -510,7 +520,11 @@ void BTCpuThreadInit() { GetTLS().ControlWord().fetch_or(ControlBits::WOW_CPU_AREA_DIRTY, std::memory_order::relaxed); std::scoped_lock Lock(ThreadCreationMutex); - Threads.emplace(GetCurrentThreadId(), Thread); + auto ThreadTID = GetCurrentThreadId(); + Threads.emplace(ThreadTID, Thread); + if (StatAllocHandler) { + Thread->ThreadStats = StatAllocHandler->AllocateSlot(ThreadTID); + } } void BTCpuThreadTerm(HANDLE Thread, LONG ExitCode) { @@ -530,6 +544,9 @@ void BTCpuThreadTerm(HANDLE Thread, LONG ExitCode) { { std::scoped_lock Lock(ThreadCreationMutex); Threads.erase(ThreadTID); + if (StatAllocHandler) { + StatAllocHandler->DeallocateSlot(OldThreadState->ThreadStats); + } } CTX->DestroyThread(OldThreadState); From 832f9c2684d0203b939cf06f3198ef3c376224ec Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Wed, 22 Jan 2025 11:48:53 -0800 Subject: [PATCH 8/9] Review --- 
.../FEXCore/Debug/InternalThreadState.h | 1 + FEXCore/include/FEXCore/Utils/Profiler.h | 5 +-- Source/Common/Profiler.cpp | 12 +++---- Source/Common/Profiler.h | 28 +++++++++-------- .../LinuxSyscalls/ThreadManager.cpp | 31 +++++++++---------- .../LinuxSyscalls/ThreadManager.h | 2 +- Source/Windows/ARM64EC/Module.cpp | 4 +-- Source/Windows/Common/Profiler.cpp | 2 +- Source/Windows/Common/Profiler.h | 6 ++-- Source/Windows/WOW64/Module.cpp | 10 +++--- 10 files changed, 49 insertions(+), 52 deletions(-) diff --git a/FEXCore/include/FEXCore/Debug/InternalThreadState.h b/FEXCore/include/FEXCore/Debug/InternalThreadState.h index 5eb185e54a..5c8bc424fb 100644 --- a/FEXCore/include/FEXCore/Debug/InternalThreadState.h +++ b/FEXCore/include/FEXCore/Debug/InternalThreadState.h @@ -99,6 +99,7 @@ struct InternalThreadState : public FEXCore::Allocator::FEXAllocOperators { std::shared_mutex ObjectCacheRefCounter {}; + // This pointer is owned by the frontend. FEXCore::Profiler::ThreadStats* ThreadStats {}; ///< Data pointer for exclusive use by the frontend diff --git a/FEXCore/include/FEXCore/Utils/Profiler.h b/FEXCore/include/FEXCore/Utils/Profiler.h index 3653dafc45..123d672a76 100644 --- a/FEXCore/include/FEXCore/Utils/Profiler.h +++ b/FEXCore/include/FEXCore/Utils/Profiler.h @@ -26,7 +26,8 @@ struct ThreadStatsHeader { uint8_t _pad[2]; char fex_version[48]; std::atomic Head; - std::atomic Size; + std::atomic Size; + uint32_t Pad; }; struct ThreadStats { @@ -46,7 +47,7 @@ struct ThreadStats { #ifdef _M_ARM_64 /** - * @brief Get the raw cycle counter which is synchronizing. + * @brief Get the raw cycle counter with synchronizing isb. * * `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature. 
*/ diff --git a/Source/Common/Profiler.cpp b/Source/Common/Profiler.cpp index 70001a7621..1e2f662f07 100644 --- a/Source/Common/Profiler.cpp +++ b/Source/Common/Profiler.cpp @@ -14,7 +14,7 @@ void StatAllocBase::SaveHeader(FEXCore::Profiler::AppType AppType) { Head->Size.store(CurrentSize, std::memory_order_relaxed); Head->Version = FEXCore::Profiler::STATS_VERSION; - constexpr std::array::length(GIT_DESCRIBE_STRING) + 1> GitString = {GIT_DESCRIBE_STRING}; + std::string_view GitString = GIT_DESCRIBE_STRING; strncpy(Head->fex_version, GitString.data(), std::min(GitString.size(), sizeof(Head->fex_version))); Head->app_type = AppType; @@ -26,7 +26,7 @@ void StatAllocBase::SaveHeader(FEXCore::Profiler::AppType AppType) { bool StatAllocBase::AllocateMoreSlots() { const auto OriginalSlotCount = TotalSlotsFromSize(); - uint64_t NewSize = AllocateMoreSlots(CurrentSize * 2); + uint32_t NewSize = FrontendAllocateSlots(CurrentSize * 2); if (NewSize == CurrentSize) { return false; @@ -39,7 +39,7 @@ bool StatAllocBase::AllocateMoreSlots() { return true; } -FEXCore::Profiler::ThreadStats* StatAllocBase::AllocateBaseSlot(uint32_t TID) { +FEXCore::Profiler::ThreadStats* StatAllocBase::AllocateSlot(uint32_t TID) { if (!RemainingSlots) { if (!AllocateMoreSlots()) { return nullptr; @@ -47,7 +47,7 @@ FEXCore::Profiler::ThreadStats* StatAllocBase::AllocateBaseSlot(uint32_t TID) { } // Find a free slot - memory_barrier(); + store_memory_barrier(); FEXCore::Profiler::ThreadStats* AllocatedSlot {}; for (size_t i = 0; i < TotalSlotsFromSize(); ++i) { AllocatedSlot = &Stats[i]; @@ -76,7 +76,7 @@ FEXCore::Profiler::ThreadStats* StatAllocBase::AllocateBaseSlot(uint32_t TID) { return AllocatedSlot; } -void StatAllocBase::DeallocateBaseSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot) { +void StatAllocBase::DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot) { if (!AllocatedSlot) { return; } @@ -84,7 +84,7 @@ void StatAllocBase::DeallocateBaseSlot(FEXCore::Profiler::ThreadStats* 
Allocated // TID == 0 will signal the reader to ignore this slot & deallocate it! AllocatedSlot->TID.store(0, std::memory_order_relaxed); - memory_barrier(); + store_memory_barrier(); const auto SlotOffset = OffsetFromStat(AllocatedSlot); const auto AllocatedSlotNext = AllocatedSlot->Next.load(std::memory_order_relaxed); diff --git a/Source/Common/Profiler.h b/Source/Common/Profiler.h index 023678ce8b..821ae7cdf0 100644 --- a/Source/Common/Profiler.h +++ b/Source/Common/Profiler.h @@ -13,43 +13,45 @@ struct InternalThreadState; } #ifdef _M_ARM_64 -static inline void memory_barrier() { +static inline void store_memory_barrier() { asm volatile("dmb ishst;" ::: "memory"); } #else -static inline void memory_barrier() { +static inline void store_memory_barrier() { // Intentionally empty. + // x86 is strongly memory ordered with regular loadstores. No need for barrier. } #endif namespace FEX::Profiler { class StatAllocBase { -public: - virtual ~StatAllocBase() = default; - protected: - FEXCore::Profiler::ThreadStats* AllocateBaseSlot(uint32_t TID); - void DeallocateBaseSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot); + FEXCore::Profiler::ThreadStats* AllocateSlot(uint32_t TID); + void DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot); uint32_t OffsetFromStat(FEXCore::Profiler::ThreadStats* Stat) const { return reinterpret_cast(Stat) - reinterpret_cast(Base); } - size_t TotalSlotsFromSize() const { + uint32_t TotalSlotsFromSize() const { return (CurrentSize - sizeof(FEXCore::Profiler::ThreadStatsHeader)) / sizeof(FEXCore::Profiler::ThreadStats) - 1; } - size_t SlotIndexFromOffset(uint32_t Offset) { + uint32_t TotalSlotsFromSize(uint32_t Size) const { + return (Size - sizeof(FEXCore::Profiler::ThreadStatsHeader)) / sizeof(FEXCore::Profiler::ThreadStats) - 1; + } + + uint32_t SlotIndexFromOffset(uint32_t Offset) { return (Offset - sizeof(FEXCore::Profiler::ThreadStatsHeader)) / sizeof(FEXCore::Profiler::ThreadStats); } void 
SaveHeader(FEXCore::Profiler::AppType AppType); void* Base; - size_t CurrentSize {}; + uint32_t CurrentSize {}; FEXCore::Profiler::ThreadStatsHeader* Head {}; FEXCore::Profiler::ThreadStats* Stats; FEXCore::Profiler::ThreadStats* StatTail {}; - uint64_t RemainingSlots; + uint32_t RemainingSlots; // Limited to 4MB which should be a few hundred threads of tracking capability. // I (Sonicadvance1) wanted to reserve 128MB of VA space because it's cheap, but ran in to a bug when running WINE. @@ -57,10 +59,10 @@ class StatAllocBase { // This only occurs when this stat allocation size is large as the top-down allocation pushes the alt-stack further. // Additionally, only occurs on 48-bit VA systems, as mmap on lesser VA will fail regardless. // TODO: Bump allocation size up once FEXCore's allocator can first use the 128TB of blocked VA space on 48-bit systems. - constexpr static size_t MAX_STATS_SIZE = 4 * 1024 * 1024; + constexpr static uint32_t MAX_STATS_SIZE = 4 * 1024 * 1024; private: - virtual uint64_t AllocateMoreSlots(uint64_t NewSize) = 0; + virtual uint32_t FrontendAllocateSlots(uint32_t NewSize) = 0; bool AllocateMoreSlots(); }; diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp index 3a1d156bc0..742cac5380 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp @@ -25,7 +25,7 @@ void ThreadManager::StatAlloc::Initialize() { } int fd = shm_open(fextl::fmt::format("fex-{}-stats", ::getpid()).c_str(), O_CREAT | O_TRUNC | O_RDWR, USER_PERMS); - if (!fd) { + if (fd == -1) { return; } CurrentSize = sysconf(_SC_PAGESIZE); @@ -38,7 +38,11 @@ void ThreadManager::StatAlloc::Initialize() { goto err; } - // 128MB ought to be enough for anyone. + // Reserve a region of MAX_STATS_SIZE so we can grow the allocation buffer. 
+ // Number of thread slots when ThreadStatsHeader == 64bytes and ThreadStats == 40bytes: + // 1 page: 99 slots + // 1 MB: 26211 slots + // 128 MB: 3355440 slots Base = ::mmap(nullptr, MAX_STATS_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0); if (Base == MAP_FAILED) { LogMan::Msg::EFmt("[StatAlloc] mmap base failed"); @@ -61,9 +65,10 @@ void ThreadManager::StatAlloc::Initialize() { close(fd); } -uint64_t ThreadManager::StatAlloc::AllocateMoreSlots(uint64_t NewSize) { +uint32_t ThreadManager::StatAlloc::FrontendAllocateSlots(uint32_t NewSize) { if (CurrentSize == MAX_STATS_SIZE) { - // Nope. + // Allocator has reached maximum slots. We can't allocate anymore. + // New threads won't get stats. return CurrentSize; } NewSize = std::max(MAX_STATS_SIZE, NewSize); @@ -86,14 +91,6 @@ uint64_t ThreadManager::StatAlloc::AllocateMoreSlots(uint64_t NewSize) { LogMan::Msg::EFmt("[StatAlloc] allocate more mmap shm failed"); goto err; } - - // TODO: Just a sanity check. - const char* SharedTest = (const char*)Base; - for (size_t i = CurrentSize; i < NewSize; ++i) { - if (SharedTest[i] != 0) { - LogMan::Msg::EFmt("truncate and map shared resulted in not zero'd memory!"); - } - } } err: @@ -103,7 +100,7 @@ uint64_t ThreadManager::StatAlloc::AllocateMoreSlots(uint64_t NewSize) { FEXCore::Profiler::ThreadStats* ThreadManager::StatAlloc::AllocateSlot(uint32_t TID) { std::scoped_lock lk(StatMutex); - return AllocateBaseSlot(TID); + return StatAllocBase::AllocateSlot(TID); } void ThreadManager::StatAlloc::DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot) { @@ -112,7 +109,7 @@ void ThreadManager::StatAlloc::DeallocateSlot(FEXCore::Profiler::ThreadStats* Al } std::scoped_lock lk(StatMutex); - DeallocateBaseSlot(AllocatedSlot); + StatAllocBase::DeallocateSlot(AllocatedSlot); } void ThreadManager::StatAlloc::CleanupForExit() { @@ -138,8 +135,8 @@ void ThreadManager::StatAlloc::UnlockAfterFork(FEXCore::Core::InternalThreadStat 
StatMutex.StealAndDropActiveLocks(); - // shm_memory tied to this process is now not owned by this process. - // Replace the shm region! Otherwise this process will keep reporting time in the original parent thread's stats region! + // shm_memory ownership is retained by the parent process, so the child must replace it with its own one. + // Otherwise this process will keep reporting in the original parent thread's stats region. munmap(Base, MAX_STATS_SIZE); Base = nullptr; CurrentSize = 0; @@ -380,7 +377,7 @@ void ThreadManager::UnlockAfterFork(FEXCore::Core::InternalThreadState* LiveThre // This function is called after fork // We need to cleanup some of the thread data that is dead for (auto& DeadThread : Threads) { - // This is not owned by the child after fork. + // The fork parent retains ownership of ThreadStats DeadThread->Thread->ThreadStats = nullptr; if (DeadThread->Thread == LiveThread) { diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h index 949088853e..8fae497131 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h @@ -123,7 +123,7 @@ class ThreadManager final { private: void Initialize(); - uint64_t AllocateMoreSlots(uint64_t NewSize) override; + uint32_t FrontendAllocateSlots(uint32_t NewSize) override; FEX_CONFIG_OPT(ProfileStats, PROFILESTATS); FEX_CONFIG_OPT(Is64BitMode, IS64BIT_MODE); diff --git a/Source/Windows/ARM64EC/Module.cpp b/Source/Windows/ARM64EC/Module.cpp index 5d09600899..a3eb0a1a57 100644 --- a/Source/Windows/ARM64EC/Module.cpp +++ b/Source/Windows/ARM64EC/Module.cpp @@ -579,9 +579,7 @@ NTSTATUS ProcessInit() { return STATUS_SUCCESS; } -void ProcessTerm(HANDLE Handle, BOOL After, NTSTATUS Status) { - StatAllocHandler.reset(); -} +void ProcessTerm(HANDLE Handle, BOOL After, NTSTATUS Status) {} class ScopedCallbackDisable { private: diff --git 
a/Source/Windows/Common/Profiler.cpp b/Source/Windows/Common/Profiler.cpp index 2dedde586e..6e24be04c2 100644 --- a/Source/Windows/Common/Profiler.cpp +++ b/Source/Windows/Common/Profiler.cpp @@ -20,7 +20,7 @@ __attribute__((naked)) uint64_t linux_getpid() { : "r0", "r8"); } -uint64_t StatAlloc::AllocateMoreSlots(uint64_t NewSize) { +uint32_t StatAlloc::FrontendAllocateSlots(uint32_t NewSize) { LogMan::Msg::DFmt("Ran out of slots. Can't allocate more"); return CurrentSize; } diff --git a/Source/Windows/Common/Profiler.h b/Source/Windows/Common/Profiler.h index b12ad9631a..6bdd0f87fc 100644 --- a/Source/Windows/Common/Profiler.h +++ b/Source/Windows/Common/Profiler.h @@ -10,7 +10,7 @@ class StatAlloc final : public FEX::Profiler::StatAllocBase { virtual ~StatAlloc(); FEXCore::Profiler::ThreadStats* AllocateSlot(uint32_t TID) { - return AllocateBaseSlot(TID); + return StatAllocBase::AllocateSlot(TID); } void DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot) { @@ -18,11 +18,11 @@ class StatAlloc final : public FEX::Profiler::StatAllocBase { return; } - DeallocateBaseSlot(AllocatedSlot); + StatAllocBase::DeallocateSlot(AllocatedSlot); } private: - uint64_t AllocateMoreSlots(uint64_t NewSize) override; + uint32_t FrontendAllocateSlots(uint32_t NewSize) override; }; } // namespace FEX::Windows diff --git a/Source/Windows/WOW64/Module.cpp b/Source/Windows/WOW64/Module.cpp index bb55985c40..950b1c815a 100644 --- a/Source/Windows/WOW64/Module.cpp +++ b/Source/Windows/WOW64/Module.cpp @@ -509,9 +509,7 @@ void BTCpuProcessInit() { } } -void BTCpuProcessTerm(HANDLE Handle, BOOL After, ULONG Status) { - StatAllocHandler.reset(); -} +void BTCpuProcessTerm(HANDLE Handle, BOOL After, ULONG Status) {} void BTCpuThreadInit() { FEX::Windows::InitCRTThread(); @@ -533,7 +531,7 @@ void BTCpuThreadTerm(HANDLE Thread, LONG ExitCode) { return; } - auto* OldThreadState = TLS.ThreadState(); + auto* ThreadState = TLS.ThreadState(); THREAD_BASIC_INFORMATION Info; if (NTSTATUS Err = 
NtQueryInformationThread(Thread, ThreadBasicInformation, &Info, sizeof(Info), nullptr); Err) { @@ -545,11 +543,11 @@ void BTCpuThreadTerm(HANDLE Thread, LONG ExitCode) { std::scoped_lock Lock(ThreadCreationMutex); Threads.erase(ThreadTID); if (StatAllocHandler) { - StatAllocHandler->DeallocateSlot(OldThreadState->ThreadStats); + StatAllocHandler->DeallocateSlot(ThreadState->ThreadStats); } } - CTX->DestroyThread(OldThreadState); + CTX->DestroyThread(ThreadState); if (ThreadTID == GetCurrentThreadId()) { FEX::Windows::DeinitCRTThread(); } From a9b28f9f2857873d06c1558fc494610aa43b6d49 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Wed, 22 Jan 2025 18:41:13 -0800 Subject: [PATCH 9/9] Wine: Add support for magic fex+wine shm path Fallback to the previous path if it doesn't exist. --- Source/Windows/Common/Profiler.cpp | 35 +++++++++++++++++++++++++++++- Source/Windows/Common/Profiler.h | 1 + Source/Windows/include/winternl.h | 7 ++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/Source/Windows/Common/Profiler.cpp b/Source/Windows/Common/Profiler.cpp index 6e24be04c2..cfa6437105 100644 --- a/Source/Windows/Common/Profiler.cpp +++ b/Source/Windows/Common/Profiler.cpp @@ -21,11 +21,44 @@ __attribute__((naked)) uint64_t linux_getpid() { } uint32_t StatAlloc::FrontendAllocateSlots(uint32_t NewSize) { - LogMan::Msg::DFmt("Ran out of slots. Can't allocate more"); + if (CurrentSize == MAX_STATS_SIZE || !UsingNTQueryPath) { + LogMan::Msg::DFmt("Ran out of slots. 
Can't allocate more"); + return CurrentSize; + } + + MEMORY_FEX_STATS_SHM_INFORMATION Info { + .shm_base = nullptr, + .map_size = std::min(CurrentSize * 2, MAX_STATS_SIZE), + .max_size = MAX_STATS_SIZE, + }; + size_t Length {}; + auto Result = NtQueryVirtualMemory(NtCurrentProcess(), nullptr, MemoryFexStatsShm, &Info, sizeof(Info), &Length); + if (!Result) { + CurrentSize = Info.map_size; + } + return CurrentSize; } StatAlloc::StatAlloc(FEXCore::Profiler::AppType AppType) { + // Try wine+fex magic path. + + { + MEMORY_FEX_STATS_SHM_INFORMATION Info { + .shm_base = nullptr, + .map_size = 4096, + .max_size = MAX_STATS_SIZE, + }; + size_t Length {}; + auto Result = NtQueryVirtualMemory(NtCurrentProcess(), nullptr, MemoryFexStatsShm, &Info, sizeof(Info), &Length); + if (!Result) { + UsingNTQueryPath = true; + CurrentSize = Info.map_size; + Base = Info.shm_base; + SaveHeader(AppType); + return; + } + } CurrentSize = MAX_STATS_SIZE; auto handle = CreateFile(fextl::fmt::format("/dev/shm/fex-{}-stats", linux_getpid()).c_str(), GENERIC_READ | GENERIC_WRITE, diff --git a/Source/Windows/Common/Profiler.h b/Source/Windows/Common/Profiler.h index 6bdd0f87fc..9deea2e552 100644 --- a/Source/Windows/Common/Profiler.h +++ b/Source/Windows/Common/Profiler.h @@ -23,6 +23,7 @@ class StatAlloc final : public FEX::Profiler::StatAllocBase { private: uint32_t FrontendAllocateSlots(uint32_t NewSize) override; + bool UsingNTQueryPath {}; }; } // namespace FEX::Windows diff --git a/Source/Windows/include/winternl.h b/Source/Windows/include/winternl.h index 2a3bbe1dd0..914d38a2ce 100644 --- a/Source/Windows/include/winternl.h +++ b/Source/Windows/include/winternl.h @@ -434,6 +434,7 @@ typedef enum _MEMORY_INFORMATION_CLASS { MemoryWineUnixFuncs = 1000, MemoryWineUnixWow64Funcs, #endif + MemoryFexStatsShm = 2000, } MEMORY_INFORMATION_CLASS; typedef enum _KEY_VALUE_INFORMATION_CLASS { @@ -452,6 +453,12 @@ typedef struct _KEY_VALUE_PARTIAL_INFORMATION { UCHAR Data[1]; } 
KEY_VALUE_PARTIAL_INFORMATION, *PKEY_VALUE_PARTIAL_INFORMATION; +typedef struct _MEMORY_FEX_STATS_SHM_INFORMATION { + void* shm_base; + DWORD map_size; + DWORD max_size; +} MEMORY_FEX_STATS_SHM_INFORMATION, *PMEMORY_FEX_STATS_SHM_INFORMATION; + NTSTATUS WINAPIV DbgPrint(LPCSTR fmt, ...); NTSTATUS WINAPI LdrDisableThreadCalloutsForDll(HMODULE); NTSTATUS WINAPI LdrGetDllFullName(HMODULE, UNICODE_STRING*);