Skip to content

Commit

Permalink
refactor(profiling): remove memalloc lock debug asserts (#12657)
Browse files Browse the repository at this point in the history
We added some debug assertions around the locks we added to memalloc and
the GIL. When we added the locks, we hadn't fully root-caused the
problem and weren't 100% sure why we needed the locks in the first
place. The assertions were there to find where the locks were contended,
and to see if the GIL wasn't held.

It turns out the GIL _was_ held, at least in a test program which
crashed prior to adding locking. But we fundamentally misunderstood the
guarantees the GIL made (or lack thereof). See
https://cython.readthedocs.io/en/latest/src/userguide/nogil.html#don-t-use-the-gil-as-a-lock.
The GIL is not intended to be used for anything other than
protecting access to the interpreter. The GIL doesn't make critical
sections for C extensions. The GIL isn't our lock, in other words.

Since we understand better now why the locks are needed, we can remove
the debug assertions. In their place, add some documentation for why the
locks are needed.
  • Loading branch information
nsrip-dd authored Mar 6, 2025
1 parent 351de15 commit 9ef9c1a
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 78 deletions.
48 changes: 13 additions & 35 deletions ddtrace/profiling/collector/_memalloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#define PY_SSIZE_T_CLEAN
#include <Python.h>

#include "_memalloc_debug.h"
#include "_memalloc_heap.h"
#include "_memalloc_reentrant.h"
#include "_memalloc_tb.h"
Expand Down Expand Up @@ -43,19 +42,18 @@ static PyObject* object_string = NULL;

#define ALLOC_TRACKER_MAX_COUNT UINT64_MAX

// The data coordination primitives in this and related files are related to a crash we started seeing.
// We don't have a precise understanding of the causal factors within the runtime that lead to this condition,
// since the GIL alone was sufficient in the past for preventing this issue.
// We add an option here to _add_ a crash, in order to observe this condition in a future diagnostic iteration.
// **This option is _intended_ to crash the Python process** do not use without a good reason!
static char g_crash_on_mutex_pass_str[] = "_DD_PROFILING_MEMALLOC_CRASH_ON_MUTEX_PASS";
// The allocation profiler functions should (in theory) only be called when allocating Python
// objects, which should (in theory) only be done with the GIL held. We have reason to believe
// that this code is sometimes reached without the GIL held, since some crashes in the code
// seem to go away with our own locking. This debug flag will make the profiler crash if
// it detects the GIL is not held in places where we think it ought to be.
static char g_crash_on_no_gil_str[] = "_DD_PROFILING_MEMALLOC_CRASH_ON_NO_GIL";
static bool g_crash_on_no_gil = false;
/* This lock protects access to global_alloc_tracker. The GIL is NOT sufficient
to protect our data structures from concurrent access. For one, the GIL is an
implementation detail and may go away in the future. Additionally, even if the
GIL is held on _entry_ to our C extension functions, making it safe to call
Python C API functions, the GIL can be released during Python C API calls if
we call back into interpreter code. This can happen if we allocate a Python
object (such as frame info), trigger garbage collection, and run arbitrary
destructors. When this happens, other threads can run python code, such as the
thread that aggregates and uploads the profile data and mutates the global
data structures. The GIL does not create critical sections for C extension
functions!
*/
static memlock_t g_memalloc_lock;

static alloc_tracker_t* global_alloc_tracker;
Expand Down Expand Up @@ -98,31 +96,15 @@ __attribute__((constructor))
static void
memalloc_init()
{
// Check if we should crash the process on mutex pass
bool crash_on_mutex_pass = memalloc_get_bool_env(g_crash_on_mutex_pass_str);
memlock_init(&g_memalloc_lock, crash_on_mutex_pass);
memlock_init(&g_memalloc_lock);
#ifndef _WIN32
pthread_atfork(memalloc_prefork, memalloc_postfork_parent, memalloc_postfork_child);
#endif

g_crash_on_no_gil = memalloc_get_bool_env(g_crash_on_no_gil_str);
}

static void
memalloc_assert_gil()
{
if (g_crash_on_no_gil && !PyGILState_Check()) {
int* p = NULL;
*p = 0;
abort(); // should never reach here
}
}

static void
memalloc_add_event(memalloc_context_t* ctx, void* ptr, size_t size)
{
memalloc_assert_gil();

uint64_t alloc_count = atomic_add_clamped(&global_alloc_tracker->alloc_count, 1, ALLOC_TRACKER_MAX_COUNT);

/* Return if we've reached the maximum number of allocations */
Expand Down Expand Up @@ -345,8 +327,6 @@ memalloc_stop(PyObject* Py_UNUSED(module), PyObject* Py_UNUSED(args))
return NULL;
}

memalloc_assert_gil();

PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &global_memalloc_ctx.pymem_allocator_obj);
memalloc_tb_deinit();
if (memlock_trylock(&g_memalloc_lock)) {
Expand Down Expand Up @@ -406,8 +386,6 @@ iterevents_new(PyTypeObject* type, PyObject* Py_UNUSED(args), PyObject* Py_UNUSE
return NULL;
}

memalloc_assert_gil();

/* Reset the current traceback list. Do this outside lock so we can track it,
* and avoid reentrancy/deadlock problems, if we start tracking the raw
* allocator domain */
Expand Down
25 changes: 0 additions & 25 deletions ddtrace/profiling/collector/_memalloc_debug.h

This file was deleted.

9 changes: 4 additions & 5 deletions ddtrace/profiling/collector/_memalloc_heap.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#include <stdlib.h>

#define PY_SSIZE_T_CLEAN
#include "_memalloc_debug.h"
#include "_memalloc_heap.h"
#include "_memalloc_reentrant.h"
#include "_memalloc_tb.h"
Expand Down Expand Up @@ -80,7 +79,9 @@ typedef struct
} freezer;
} heap_tracker_t;

static char g_crash_on_mutex_pass_str[] = "_DD_PROFILING_MEMHEAP_CRASH_ON_MUTEX_PASS";
/* This lock protects global_heap_tracker. See g_memalloc_lock docs for why this
* is needed, and why the GIL is not sufficient to protect our data structures
*/
static memlock_t g_memheap_lock;

static heap_tracker_t global_heap_tracker;
Expand Down Expand Up @@ -120,9 +121,7 @@ __attribute__((constructor))
static void
memheap_init()
{
// Check if we should crash the process on mutex pass
bool crash_on_mutex_pass = memalloc_get_bool_env(g_crash_on_mutex_pass_str);
memlock_init(&g_memheap_lock, crash_on_mutex_pass);
memlock_init(&g_memheap_lock);
#ifndef _WIN32
pthread_atfork(memheap_prefork, memheap_postfork_parent, memheap_postfork_child);
#endif
Expand Down
14 changes: 1 addition & 13 deletions ddtrace/profiling/collector/_memalloc_reentrant.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,18 +86,13 @@ typedef struct
#endif
} memlock_t;

// Global setting; if a lock fails to be acquired, crash
static bool g_crash_on_mutex_pass = false;

// Generic initializer
static inline bool
memlock_init(memlock_t* lock, bool crash_on_pass)
memlock_init(memlock_t* lock)
{
if (!lock)
return false;

g_crash_on_mutex_pass = crash_on_pass;

#ifdef _WIN32
lock->mutex = CreateMutex(NULL, FALSE, NULL);
return lock->mutex != NULL;
Expand Down Expand Up @@ -137,13 +132,6 @@ memlock_trylock(memlock_t* lock)
#else
bool result = 0 == pthread_mutex_trylock(&lock->mutex);
#endif
if (!result && g_crash_on_mutex_pass) {
// segfault
int* p = NULL;
*p = 0;
abort(); // should never reach here
}

return result;
}

Expand Down

0 comments on commit 9ef9c1a

Please sign in to comment.