lkrupp
July 26, 2019, 11:44am
1
Hello everyone,
I have a question concerning the following part of the CUPTI example callback_event.cu from the callback implementation:
cuptiErr = cuptiEventGroupReadEvent(traceData->eventData->eventGroup,
CUPTI_EVENT_READ_FLAG_NONE,
traceData->eventData->eventId,
&bytesRead, values);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");
traceData->eventVal2 = 0;
for (i=0; i<numInstances; i++) {
traceData->eventVal2 += values[i];
}
cudaDeviceSynchronize();
cuptiErr = cuptiEventGroupReadEvent(traceData->eventData->eventGroup,
CUPTI_EVENT_READ_FLAG_NONE,
traceData->eventData->eventId,
&bytesRead, values);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");
traceData->eventVal = 0;
for (i=0; i<numInstances; i++) {
traceData->eventVal += values[i];
}
Why does the event counter has to be read twice and how to I get my total counter value from the two results eventVal and eventVal2?
Which version of CUDA Toolkit are you using?
I checked the CUPTI callback_event sample code for several CUDA Toolkit versions. The code is different. It does not use eventVal2 .
Pasting corresponding code from the CUDA Toolkit 10.0 version of callback_event.cu :
void CUPTIAPI
getEventValueCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo)
{
CUptiResult cuptiErr;
RuntimeApiTrace_t *traceData = (RuntimeApiTrace_t*)userdata;
size_t bytesRead;
// This callback is enabled only for launch so we shouldn't see anything else.
if ((cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) &&
(cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000))
{
printf("%s:%d: unexpected cbid %d\n", __FILE__, __LINE__, cbid);
exit(-1);
}
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
cudaDeviceSynchronize();
cuptiErr = cuptiSetEventCollectionMode(cbInfo->context,
CUPTI_EVENT_COLLECTION_MODE_KERNEL);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiSetEventCollectionMode");
cuptiErr = cuptiEventGroupEnable(traceData->eventData->eventGroup);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupEnable");
}
if (cbInfo->callbackSite == CUPTI_API_EXIT) {
uint32_t numInstances = 0, i;
uint64_t *values = NULL;
size_t valueSize = sizeof(numInstances);
cuptiErr = cuptiEventGroupGetAttribute(traceData->eventData->eventGroup,
CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT,
&valueSize, &numInstances);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupGetAttribute");
bytesRead = sizeof (uint64_t) * numInstances;
values = (uint64_t *) malloc(bytesRead);
if (values == NULL) {
printf("%s:%d: Out of memory\n", __FILE__, __LINE__);
exit(-1);
}
cudaDeviceSynchronize();
cuptiErr = cuptiEventGroupReadEvent(traceData->eventData->eventGroup,
CUPTI_EVENT_READ_FLAG_NONE,
traceData->eventData->eventId,
&bytesRead, values);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");
traceData->eventVal = 0;
for (i=0; i<numInstances; i++) {
traceData->eventVal += values[i];
}
free(values);
cuptiErr = cuptiEventGroupDisable(traceData->eventData->eventGroup);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDisable");
}
}
lkrupp
July 30, 2019, 12:03pm
3
I use CUDA 10.0 on a Jetson TX2. But I think the code snippet you posted is not the corresponding one as the event counter group is not read anywhere. It seems to be only the initialization part. The code I posted should be right below yours in the callback_event.cu sample.
For reference, here is the whole getEventValueCallback function from the callback_event.cu sample of CUDA 10.0, taken directly from Jetson TX2. The corresponding code is written in bolt:
void CUPTIAPI
getEventValueCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo)
{
CUptiResult cuptiErr;
RuntimeApiTrace_t *traceData = (RuntimeApiTrace_t*)userdata;
size_t bytesRead;
// This callback is enabled only for launch so we shouldn't see anything else.
if ((cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) &&
(cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000))
{
printf("%s:%d: unexpected cbid %d\n", __FILE__, __LINE__, cbid);
exit(-1);
}
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
printf("API enter\n");
cudaDeviceSynchronize();
cuptiErr = cuptiSetEventCollectionMode(cbInfo->context,
CUPTI_EVENT_COLLECTION_MODE_KERNEL);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiSetEventCollectionMode");
cuptiErr = cuptiEventGroupEnable(traceData->eventData->eventGroup);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupEnable");
}
if (cbInfo->callbackSite == CUPTI_API_EXIT) {
printf("API exit\n");
uint32_t numInstances = 0, i;
uint64_t *values = NULL;
size_t valueSize = sizeof(numInstances);
cuptiErr = cuptiEventGroupGetAttribute(traceData->eventData->eventGroup,
CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT,
&valueSize, &numInstances);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupGetAttribute");
bytesRead = sizeof (uint64_t) * numInstances;
printf("bytesRead : %zu\n", bytesRead);
values = (uint64_t *) malloc(bytesRead);
if (values == NULL) {
printf("%s:%d: Out of memory\n", __FILE__, __LINE__);
exit(-1);
}
[b]cuptiErr = cuptiEventGroupReadEvent(traceData->eventData->eventGroup,
CUPTI_EVENT_READ_FLAG_NONE,
traceData->eventData->eventId,
&bytesRead, values);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");
traceData->eventVal2 = 0;
for (i=0; i<numInstances; i++) {
traceData->eventVal2 += values[i];
}
cudaDeviceSynchronize();
cuptiErr = cuptiEventGroupReadEvent(traceData->eventData->eventGroup,
CUPTI_EVENT_READ_FLAG_NONE,
traceData->eventData->eventId,
&bytesRead, values);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");
traceData->eventVal = 0;
for (i=0; i<numInstances; i++) {
traceData->eventVal += values[i];
}[/b]
free(values);
cuptiErr = cuptiEventGroupDisable(traceData->eventData->eventGroup);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDisable");
}
}
Updated the code in my earlier comment to include the full code for the function getEventValueCallback() .
Somehow you seem to have a different version of the code.