From 4559701817441f8db881b59f4ad03fe4a31801d5 Mon Sep 17 00:00:00 2001 From: Ivan Date: Thu, 28 Nov 2024 00:11:13 +0300 Subject: [PATCH] Print the heap stats when tcmalloc memory limit is hit (#11968) --- ydb/core/mon_alloc/tcmalloc.cpp | 101 ++++++++++++++++++++++++++++++-- ydb/core/mon_alloc/ya.make | 7 +-- 2 files changed, 97 insertions(+), 11 deletions(-) diff --git a/ydb/core/mon_alloc/tcmalloc.cpp b/ydb/core/mon_alloc/tcmalloc.cpp index d16f58d73501..db1796336bf1 100644 --- a/ydb/core/mon_alloc/tcmalloc.cpp +++ b/ydb/core/mon_alloc/tcmalloc.cpp @@ -3,19 +3,19 @@ #include #include -#include +#include +#include #if defined(USE_DWARF_BACKTRACE) # include #endif - #include #include -#include - #include +#include + using namespace NActors; namespace NKikimr { @@ -478,6 +478,91 @@ class TTcMallocState : public IAllocState { }; +void HandleTcMallocSoftLimit(); + +class TTcMallocLimitHandler : public TSingletonTraits { +public: + Y_DECLARE_SINGLETON_FRIEND(); + + ~TTcMallocLimitHandler() { + if (Thread_.joinable()) { + { + std::unique_lock lock(Mutex_); + JustQuit_ = true; + } + Fire(); + Thread_.join(); + } + } + + void SetOutputStream(IOutputStream& out) { + Out_ = &out; + } + + void Fire() { + std::unique_lock lock(Mutex_); + Fired_ = true; + CV_.notify_all(); + } + +private: + TTcMallocLimitHandler() { + tcmalloc::MallocExtension::EnableForkSupport(); + tcmalloc::MallocExtension::SetSoftMemoryLimitHandler(&HandleTcMallocSoftLimit); + Thread_ = std::thread(&TTcMallocLimitHandler::Handle, this); + } + +private: + std::mutex Mutex_; + bool Fired_ = false; // protected by Mutex_ + bool JustQuit_ = false; // protected by Mutex_ + std::condition_variable CV_; // protected by Mutex_ + + IOutputStream* Out_ = &Cerr; + std::thread Thread_; + + void Handle() { + std::unique_lock lock(Mutex_); + CV_.wait(lock, [&] { + return Fired_; + }); + + if (JustQuit_) { + return; + } + + *Out_ << tcmalloc::MallocExtension::GetStats() << Endl; + + if (auto childPid = fork(); childPid == 0) { + kill(getppid(), SIGSTOP); + + *Out_ << "Child: " << getpid() << ", parent process stopped: " << getppid() << Endl; + + try { + auto profile = tcmalloc::MallocExtension::SnapshotCurrent(tcmalloc::ProfileType::kHeap); + TAllocationAnalyzer analyzer(std::move(profile)); + TAllocationStats allocationStats; + analyzer.Prepare(&allocationStats); + analyzer.Dump(*Out_, 256, 1024, true, true); + } catch (...) { + kill(getppid(), SIGCONT); + throw; + } + + kill(getppid(), SIGCONT); + } else if (childPid < 0) { + *Out_ << "Failed to dump current heap: fork failed" << Endl; + } + + // TODO: probably should wait for child, but we're going to OOM anyway. + } +}; + +void HandleTcMallocSoftLimit() { + Singleton()->Fire(); +} + + class TTcMallocMonitor : public IAllocMonitor { TDynamicCountersPtr CounterGroup; @@ -694,6 +779,11 @@ class TTcMallocMonitor : public IAllocMonitor { CountHistogram = CounterGroup->GetHistogram("tcmalloc.sampled_count", NMonitoring::ExponentialHistogram(TAllocationStats::MaxSizeIndex, 2, 1), false); + +#ifdef PROFILE_MEMORY_ALLOCATIONS + // Setup tcmalloc soft limit handling + Singleton(); +#endif } void RegisterPages(TMon* mon, TActorSystem* actorSystem, TActorId actorId) override { @@ -807,6 +897,7 @@ class TTcMallocProfiler : public IProfilerLogic { } }; +// Public functions std::unique_ptr CreateTcMallocStats(TDynamicCountersPtr group) { return std::make_unique(std::move(group)); @@ -824,4 +915,4 @@ std::unique_ptr CreateTcMallocProfiler() { return std::make_unique(); } -} +} // namespace NKikimr diff --git a/ydb/core/mon_alloc/ya.make b/ydb/core/mon_alloc/ya.make index 6591349bf293..9210af29de82 100644 --- a/ydb/core/mon_alloc/ya.make +++ b/ydb/core/mon_alloc/ya.make @@ -1,11 +1,5 @@ LIBRARY() -IF (PROFILE_MEMORY_ALLOCATIONS) - CFLAGS( - -DPROFILE_MEMORY_ALLOCATIONS - ) -ENDIF() - SRCS( memory_info.cpp monitor.cpp @@ -35,6 +29,7 @@ PEERDIR( ydb/library/actors/core ydb/library/actors/prof ydb/library/services + yql/essentials/utils/memory_profiling ) END()