Counter read twice in CUPTI sample callback_event

Hello everyone,

I have a question concerning the following part of the CUPTI example callback_event.cu from the callback implementation:

cuptiErr = cuptiEventGroupReadEvent(traceData->eventData->eventGroup, 
                                        CUPTI_EVENT_READ_FLAG_NONE, 
                                        traceData->eventData->eventId, 
                                        &bytesRead, values);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");
    traceData->eventVal2 = 0;
    for (i=0; i<numInstances; i++) {
        traceData->eventVal2 += values[i];
    }

    cudaDeviceSynchronize();
    cuptiErr = cuptiEventGroupReadEvent(traceData->eventData->eventGroup, 
                                        CUPTI_EVENT_READ_FLAG_NONE, 
                                        traceData->eventData->eventId, 
                                        &bytesRead, values);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");

    traceData->eventVal = 0;
    for (i=0; i<numInstances; i++) {
        traceData->eventVal += values[i];
    }

Why does the event counter has to be read twice and how to I get my total counter value from the two results eventVal and eventVal2?

Which version of CUDA Toolkit are you using?
I checked the CUPTI callback_event sample code for several CUDA Toolkit versions. The code is different. It does not use eventVal2.

Pasting corresponding code from the CUDA Toolkit 10.0 version of callback_event.cu:

void CUPTIAPI
getEventValueCallback(void *userdata, CUpti_CallbackDomain domain,
                      CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo)
{
  CUptiResult cuptiErr;
  RuntimeApiTrace_t *traceData = (RuntimeApiTrace_t*)userdata;
  size_t bytesRead;

  // This callback is enabled only for launch so we shouldn't see anything else.
  if ((cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) &&
      (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000))
  {
    printf("%s:%d: unexpected cbid %d\n", __FILE__, __LINE__, cbid);
    exit(-1);
  }

  if (cbInfo->callbackSite == CUPTI_API_ENTER) {
    cudaDeviceSynchronize();
    cuptiErr = cuptiSetEventCollectionMode(cbInfo->context,
                                           CUPTI_EVENT_COLLECTION_MODE_KERNEL);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiSetEventCollectionMode");
    cuptiErr = cuptiEventGroupEnable(traceData->eventData->eventGroup);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupEnable");
  }

  if (cbInfo->callbackSite == CUPTI_API_EXIT) {
    uint32_t numInstances = 0, i;
    uint64_t *values = NULL;
    size_t valueSize = sizeof(numInstances);

    cuptiErr = cuptiEventGroupGetAttribute(traceData->eventData->eventGroup,
                                           CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT,
                                           &valueSize, &numInstances);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupGetAttribute");

    bytesRead = sizeof (uint64_t) * numInstances;
    values = (uint64_t *) malloc(bytesRead);
    if (values == NULL) {
        printf("%s:%d: Out of memory\n", __FILE__, __LINE__);
        exit(-1);
    }
    cudaDeviceSynchronize();
    cuptiErr = cuptiEventGroupReadEvent(traceData->eventData->eventGroup,
                                        CUPTI_EVENT_READ_FLAG_NONE,
                                        traceData->eventData->eventId,
                                        &bytesRead, values);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");

    traceData->eventVal = 0;
    for (i=0; i<numInstances; i++) {
        traceData->eventVal += values[i];
    }
    free(values);

    cuptiErr = cuptiEventGroupDisable(traceData->eventData->eventGroup);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDisable");
  }
}

I use CUDA 10.0 on a Jetson TX2. But I think the code snippet you posted is not the corresponding one as the event counter group is not read anywhere. It seems to be only the initialization part. The code I posted should be right below yours in the callback_event.cu sample.

For reference, here is the whole getEventValueCallback function from the callback_event.cu sample of CUDA 10.0, taken directly from Jetson TX2. The corresponding code is written in bolt:

void CUPTIAPI
getEventValueCallback(void *userdata, CUpti_CallbackDomain domain,
                      CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo)
{
  CUptiResult cuptiErr;
  RuntimeApiTrace_t *traceData = (RuntimeApiTrace_t*)userdata;
  size_t bytesRead; 
     
  // This callback is enabled only for launch so we shouldn't see anything else.
  if ((cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) &&
      (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000))
  {
    printf("%s:%d: unexpected cbid %d\n", __FILE__, __LINE__, cbid);
    exit(-1);
  }

  if (cbInfo->callbackSite == CUPTI_API_ENTER) {
    printf("API enter\n");
    cudaDeviceSynchronize();
    cuptiErr = cuptiSetEventCollectionMode(cbInfo->context, 
                                           CUPTI_EVENT_COLLECTION_MODE_KERNEL);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiSetEventCollectionMode");
    cuptiErr = cuptiEventGroupEnable(traceData->eventData->eventGroup);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupEnable");
  }
    
  if (cbInfo->callbackSite == CUPTI_API_EXIT) {
    printf("API exit\n");
    uint32_t numInstances = 0, i;
    uint64_t *values = NULL;
    size_t valueSize = sizeof(numInstances);

    cuptiErr = cuptiEventGroupGetAttribute(traceData->eventData->eventGroup, 
                                           CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT, 
                                           &valueSize, &numInstances);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupGetAttribute");

    bytesRead = sizeof (uint64_t) * numInstances;
    printf("bytesRead : %zu\n", bytesRead);
    values = (uint64_t *) malloc(bytesRead);
    if (values == NULL) {
        printf("%s:%d: Out of memory\n", __FILE__, __LINE__);
        exit(-1);
    }

   [b]cuptiErr = cuptiEventGroupReadEvent(traceData->eventData->eventGroup, 
                                        CUPTI_EVENT_READ_FLAG_NONE, 
                                        traceData->eventData->eventId, 
                                        &bytesRead, values);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");
    traceData->eventVal2 = 0;
    for (i=0; i<numInstances; i++) {
        traceData->eventVal2 += values[i];
    }

    cudaDeviceSynchronize();
    cuptiErr = cuptiEventGroupReadEvent(traceData->eventData->eventGroup, 
                                        CUPTI_EVENT_READ_FLAG_NONE, 
                                        traceData->eventData->eventId, 
                                        &bytesRead, values);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");

    traceData->eventVal = 0;
    for (i=0; i<numInstances; i++) {
        traceData->eventVal += values[i];
    }[/b]
    free(values);

    cuptiErr = cuptiEventGroupDisable(traceData->eventData->eventGroup);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDisable");
  }
}

Updated the code in my earlier comment to include the full code for the function getEventValueCallback().
Somehow you seem to have a different version of the code.