My code for tracing the NVTX APIs is as follows:
else if (domain == CUPTI_CB_DOMAIN_NVTX)
{
if(callbackid == CUPTI_CBID_NVTX_nvtxRangePushA)
{
cout << “NVTX Range detected” << endl;
CtxProfilerData user_data;
CUpti_CallbackData const *pnvtxRangePushData = static_cast<CUpti_CallbackData const *>(pCallbackData);
CUcontext ctx = pnvtxRangePushData->context;
user_data.ctx = ctx;
if(pnvtxRangePushData->callbackSite == CUPTI_API_ENTER)
{
user_data_mutex.lock();
if(user_data_map.count(ctx))
{
if (user_data_map[ctx].cur_ranges == user_data_map[ctx].max_num_ranges)
{
EndSession(user_data_map[ctx]);
user_data_map[ctx].cur_ranges = 0;
}if (user_data_map[ctx].cur_ranges == 0) { InitializeContext(user_data_map[ctx]); StartSession(user_data_map[ctx]); } user_data_map[ctx].cur_ranges++; } else { user_data_map[ctx] = user_data; InitializeContext(user_data_map[ctx]); StartSession(user_data_map[ctx]); user_data_map[ctx].cur_ranges++; } user_data_mutex.unlock(); } } if(callbackid == CUPTI_CBID_NVTX_nvtxRangePop) { cout << "NVTX Range pop detected" << endl; CUpti_CallbackData const *pnvtxRangePopData = static_cast<CUpti_CallbackData const *>(pCallbackData); CUcontext ctx = pnvtxRangePopData->context; if(pnvtxRangePopData->callbackSite == CUPTI_API_EXIT) { user_data_mutex.lock(); EndSession(user_data_map[ctx]); user_data_map[ctx].cur_ranges = 0; user_data_mutex.unlock(); } } }
And I have made sure to register these callbacks:
/**
 * Subscribes ProfilerCallbackHandler to CUPTI and enables the specific
 * driver-API, runtime-API, resource and NVTX callbacks listed below, then
 * registers EndExecution to run at process exit.
 *
 * NVTX callbacks differ from the other domains: CUPTI only receives them when
 * the NVTX library has been told to load CUPTI as its injection library via
 * the NVTX_INJECTION64_PATH (64-bit process) or NVTX_INJECTION32_PATH (32-bit
 * process) environment variable. If that variable is missing, enabling the
 * NVTX callbacks still succeeds but the handler is never invoked for
 * nvtxRangePushA / nvtxRangePop, so a warning is printed to make the
 * misconfiguration visible.
 */
void RegisterCallbacks()
{
    // Pick the variable NVTX consults for the bitness of this process.
    const char *nvtx_injection_var =
        (sizeof(void *) == 8) ? "NVTX_INJECTION64_PATH" : "NVTX_INJECTION32_PATH";
    if (std::getenv(nvtx_injection_var) == nullptr)
    {
        std::cerr << "Warning: " << nvtx_injection_var
                  << " is not set; CUPTI will not receive NVTX callbacks. "
                  << "Set it to the full path of the CUPTI library "
                  << "(e.g. libcupti.so) before launching the application."
                  << std::endl;
    }

    // NOTE(review): the handle is local, so nothing outside this function can
    // call cuptiUnsubscribe() with it — confirm that is intended.
    CUpti_SubscriberHandle subscriber;
    CUPTI_API_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)ProfilerCallbackHandler, nullptr));

    // Driver API callbacks.
    CUPTI_API_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
    CUPTI_API_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuMemAlloc));

    // Context-creation notification.
    CUPTI_API_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE, CUPTI_CBID_RESOURCE_CONTEXT_CREATED));

    // Runtime API callbacks.
    CUPTI_API_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API, CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020));

    // NVTX range push/pop callbacks (require the injection path checked above).
    CUPTI_API_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_NVTX, CUPTI_CBID_NVTX_nvtxRangePushA));
    CUPTI_API_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_NVTX, CUPTI_CBID_NVTX_nvtxRangePop));

    atexit(EndExecution);
}
My CUDA kernel launch code looks like this:
// Wrap the kernel launches in an NVTX range named "kernel_launch".
// The range name must be written with plain ASCII double quotes; typographic
// quotes are not valid string delimiters in C++.
nvtxRangePushA("kernel_launch");
LaunchKernelsMultiStreams(device_map[0]);
nvtxRangePop();
However, the output does not contain anything about the NVTX APIs, which means my profiler failed to trace them.
But when I use nsys, the ranges covered by my NVTX calls do appear, which means NVTX takes effect in my program. So what is the problem?