How can I see when the CPython GIL is held using Nsight Systems?
I’m trying to track the causes of intermittent latency spikes (200-400ms) in a multithreaded Python program that includes a combination of CPU-bound work in C extensions and interspersed async network/disk/serial I/O. As part of understanding these spikes, I’d like to understand when Python threads are waiting for or holding the CPython global interpreter lock. I’ve tried modifying CPython’s ceval_gil.h
to track GIL usage using NVTX’s nvtxDomainRangePushEx
and nvtxDomainRangePop
, but my traces seem very suspicious – they show many overlapping GIL regions, while I expect only one thread to be holding the GIL at a time. As an alternative, I tried naming the gil mutex using nvtxDomainResourceCreate
, but I couldn’t see it anywhere in Nsight Systems.
Unexpected Overlapping GIL Regions:
What I’ve tried with ranges:
I modified v3.10.4/Python/ceval_gil.h by replacing every call to MUTEX_LOCK(gil->mutex)
and MUTEX_UNLOCK(gil->mutex)
with GIL_MUTEX_LOCK(gil->mutex)
and GIL_MUTEX_UNLOCK(gil->mutex)
macros:
static nvtxDomainHandle_t GIL_NVTX_DOMAIN;
static nvtxStringHandle_t GIL_NVTX_WAITING_MSG;
static nvtxStringHandle_t GIL_NVTX_HOLDING_MSG;
#define GIL_MUTEX_LOCK(m) \
nvtxDomainRangePushEx(GIL_NVTX_DOMAIN, &(nvtxEventAttributes_t){.message={.registered=GIL_NVTX_WAITING_MSG}, .color=0xFF0000FF, .version=NVTX_VERSION, .size=NVTX_EVENT_ATTRIB_STRUCT_SIZE, .messageType = NVTX_MESSAGE_TYPE_REGISTERED, .colorType=NVTX_COLOR_ARGB}); \
MUTEX_LOCK(m); \
nvtxDomainRangePop(GIL_NVTX_DOMAIN); \
nvtxDomainRangePushEx(GIL_NVTX_DOMAIN, &(nvtxEventAttributes_t){.message={.registered=GIL_NVTX_HOLDING_MSG}, .color=0xFFFF0000, .version=NVTX_VERSION, .size=NVTX_EVENT_ATTRIB_STRUCT_SIZE, .messageType = NVTX_MESSAGE_TYPE_REGISTERED, .colorType=NVTX_COLOR_ARGB})
#define GIL_MUTEX_UNLOCK(m) \
MUTEX_UNLOCK(m); \
nvtxDomainRangePop(GIL_NVTX_DOMAIN)
static void _gil_initialize(struct _gil_runtime_state *gil)
{
_Py_atomic_int uninitialized = {-1};
gil->locked = uninitialized;
gil->interval = DEFAULT_INTERVAL;
GIL_NVTX_DOMAIN = nvtxDomainCreate("GIL NVTX Domain");
GIL_NVTX_WAITING_MSG = nvtxDomainRegisterStringA(GIL_NVTX_DOMAIN, "Waiting for GIL");
GIL_NVTX_HOLDING_MSG = nvtxDomainRegisterStringA(GIL_NVTX_DOMAIN, "Holding GIL");
}
What I’ve tried with resource naming:
I modified v3.10.4/Python/ceval_gil.h by changing create_gil
and destroy_gil
:
static nvtxResourceHandle_t GIL_NVTX_HANDLE;
static void create_gil(struct _gil_runtime_state *gil)
{
MUTEX_INIT(gil->mutex);
nvtxResourceAttributes_t attribs = {0};
attribs.version = NVTX_VERSION;
attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
attribs.identifier.pValue = (const void*)(&(gil->mutex));
attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
attribs.message.ascii = "GIL Mutex";
GIL_NVTX_HANDLE = nvtxDomainResourceCreate(GIL_NVTX_DOMAIN, &attribs);
#ifdef FORCE_SWITCHING
MUTEX_INIT(gil->switch_mutex);
#endif
COND_INIT(gil->cond);
#ifdef FORCE_SWITCHING
COND_INIT(gil->switch_cond);
#endif
_Py_atomic_store_relaxed(&gil->last_holder, 0);
_Py_ANNOTATE_RWLOCK_CREATE(&gil->locked);
_Py_atomic_store_explicit(&gil->locked, 0, _Py_memory_order_release);
}
static void destroy_gil(struct _gil_runtime_state *gil)
{
nvtxDomainResourceDestroy(GIL_NVTX_HANDLE);
/* some pthread-like implementations tie the mutex to the cond
* and must have the cond destroyed first.
*/
COND_FINI(gil->cond);
MUTEX_FINI(gil->mutex);
#ifdef FORCE_SWITCHING
COND_FINI(gil->switch_cond);
MUTEX_FINI(gil->switch_mutex);
#endif
_Py_atomic_store_explicit(&gil->locked, -1,
_Py_memory_order_release);
_Py_ANNOTATE_RWLOCK_DESTROY(&gil->locked);
}