CUPTI problem, cuptiEventGroupReadEvent() returns me a value buffer with all 0

Hi guys,

I am in trouble with this cupti function. I am trying to use CUPTI to collect the metric information for shared memory, inst, cache, and etc. I look at the CUPTI samples, especially the one called “callback_metric()”. I also run callback_metric() with different metric name, which all work. However, problem shows up when I port the callback functions from callback_metric.cu to matrixMul.cu.

Right now, what I can guarantee is that all parameters and functions of using callback_metric are successfully applied into matrixMul.cu code (the one in the sample). Now if I run matrixMul with the command “./matrixMul 0 METRIC_NAME”, the executable file runs but return 0 for all metrics.

I looked into the code and print out all parameters for metric callback uses, they all match to callback_metric.cu, except “group” which I think they are randomly picked value for storing the event groups. Therefore, nothing different, but values returned from “cuptiEventGroupReadEvent()” in matrixMul.cu are all 0, which work properly and return good values in callback_metric.cu.

Does anyone know what is happening here?

Thanks

The code of the problem function is listed below.

void CUPTIAPI
getMetricValueCallback(void *userdata, CUpti_CallbackDomain domain,
                       CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo)
{
  MetricData_t *metricData = (MetricData_t*)userdata;
  unsigned int i, j, k;
	
  // This callback is enabled only for launch so we shouldn't see
  // anything else.
  if (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) {
    printf("%s:%d: unexpected cbid %d\n", __FILE__, __LINE__, cbid);
    exit(-1);
  }

  // on entry, enable all the event groups being collected this pass,
  // for metrics we collect for all instances of the event
  if (cbInfo->callbackSite == CUPTI_API_ENTER) {
    cudaDeviceSynchronize();

    CUPTI_CALL(cuptiSetEventCollectionMode(cbInfo->context,
                                           CUPTI_EVENT_COLLECTION_MODE_KERNEL));

    for (i = 0; i < metricData->eventGroups->numEventGroups; i++) {
      uint32_t all = 1;
      CUPTI_CALL(cuptiEventGroupSetAttribute(metricData->eventGroups->eventGroups[i],
                                             CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,
                                             sizeof(all), &all));
      CUPTI_CALL(cuptiEventGroupEnable(metricData->eventGroups->eventGroups[i]));
    }
  }

  // on exit, read and record event values
  if (cbInfo->callbackSite == CUPTI_API_EXIT) {
    cudaDeviceSynchronize();

    // for each group, read the event values from the group and record
    // in metricData
    for (i = 0; i < metricData->eventGroups->numEventGroups; i++) {
		printf("numEventGroups: %d\n", metricData->eventGroups->numEventGroups);
      CUpti_EventGroup group = metricData->eventGroups->eventGroups[i];
      CUpti_EventDomainID groupDomain;
      uint32_t numEvents, numInstances, numTotalInstances;
      CUpti_EventID *eventIds;
      size_t groupDomainSize = sizeof(groupDomain);
      size_t numEventsSize = sizeof(numEvents);
      size_t numInstancesSize = sizeof(numInstances);
      size_t numTotalInstancesSize = sizeof(numTotalInstances);
      uint64_t *values, normalized, sum;
      size_t valuesSize, eventIdsSize;

      CUPTI_CALL(cuptiEventGroupGetAttribute(group,
                                             CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID,
                                             &groupDomainSize, &groupDomain));
      CUPTI_CALL(cuptiDeviceGetEventDomainAttribute(metricData->device, groupDomain,
                                                    CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT,
                                                    &numTotalInstancesSize, &numTotalInstances));
      CUPTI_CALL(cuptiEventGroupGetAttribute(group,
                                             CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT,
                                             &numInstancesSize, &numInstances));
      CUPTI_CALL(cuptiEventGroupGetAttribute(group,
                                             CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS,
                                             &numEventsSize, &numEvents));
      
      eventIdsSize = numEvents * sizeof(CUpti_EventID);
      eventIds = (CUpti_EventID *)malloc(eventIdsSize);
      CUPTI_CALL(cuptiEventGroupGetAttribute(group,
                                             CUPTI_EVENT_GROUP_ATTR_EVENTS,
                                             &eventIdsSize, eventIds));

      valuesSize = sizeof(uint64_t) * numInstances;
      values = (uint64_t *)malloc(valuesSize);

      for (j = 0; j < numEvents; j++) {
        CUPTI_CALL(cuptiEventGroupReadEvent(group, CUPTI_EVENT_READ_FLAG_NONE,
                                            eventIds[j], &valuesSize, values));

        // sum collect event values from all instances
        sum = 0;
        for (k = 0; k < numInstances; k++)
          sum += values[k];
		
        // normalize the event value to represent the total number of
        // domain instances on the device
        normalized = (sum * numTotalInstances) / numInstances;

        metricData->eventIdArray[metricData->eventIdx] = eventIds[j];
        metricData->eventValueArray[metricData->eventIdx] = normalized;
        metricData->eventIdx++;

        // print collected value
        {
          char eventName[128];
          size_t eventNameSize = sizeof(eventName) - 1;
          CUPTI_CALL(cuptiEventGetAttribute(eventIds[j], CUPTI_EVENT_ATTR_NAME,
                                            &eventNameSize, eventName));
          eventName[127] = '\0';
        }
      }

      free(values);
    }

    for (i = 0; i < metricData->eventGroups->numEventGroups; i++)
      CUPTI_CALL(cuptiEventGroupDisable(metricData->eventGroups->eventGroups[i]));
  }
}

int main(int argc, char **argv)
{
	CUpti_SubscriberHandle subscriber;
	CUcontext context = 0;
	CUdevice device = 0;
	int computeCapabilityMajor=0;
	int computeCapabilityMinor=0;
	int deviceNum;
	char deviceName[32];
	const char *metricName;
	CUpti_MetricID metricId;
	CUpti_EventGroupSets *passData;
	MetricData_t metricData;
	unsigned int pass;
	CUpti_MetricValue metricValue;
	
	// make sure activity is enabled before any CUDA API
	CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));

	DRIVER_API_CALL(cuInit(0)); 	//must be called before any driver API
	DRIVER_API_CALL(cuCtxCreate(&context, 0, device));   
	

	DRIVER_API_CALL(cuDeviceGet(&device, deviceNum));
	DRIVER_API_CALL(cuDeviceGetName(deviceName, 32, device));
	
	DRIVER_API_CALL(cuDeviceComputeCapability(&computeCapabilityMajor,
                                            &computeCapabilityMinor,
                                            device));
  	DRIVER_API_CALL(cuCtxCreate(&context, 0, device));
    // Get the name of the metric to collect
	if (argc > 2)
		metricName = argv[2];
	else {
		if (computeCapabilityMajor > 1) {
			metricName = METRIC_NAME_FERMI;
		}
		else {
			metricName = METRIC_NAME_TESLA;
		}
	}
    int block_size = 32;
    dim3 dimsA(32*block_size, 32*block_size, 1);
    dim3 dimsB(128*block_size, 32*block_size, 1);

    CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
    int matrix_result = matrixMultiply(/*argc, argv, */block_size, dimsA, dimsB);
	cudaDeviceSynchronize();
    CUPTI_CALL(cuptiActivityFlushAll(0));
   
    // setup launch callback for event collection
  CUPTI_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)getMetricValueCallback, &metricData));
  CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API,
                                 CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020));
	
  // allocate space to hold all the events needed for the metric
  CUPTI_CALL(cuptiMetricGetIdFromName(device, metricName, &metricId));
  CUPTI_CALL(cuptiMetricGetNumEvents(metricId, &metricData.numEvents));
  metricData.device = device;
  metricData.eventIdArray = (CUpti_EventID *)malloc(metricData.numEvents * sizeof(CUpti_EventID));
  metricData.eventValueArray = (uint64_t *)malloc(metricData.numEvents * sizeof(uint64_t));
  metricData.eventIdx = 0;
	
  // get the number of passes required to collect all the events
  // needed for the metric and the event groups for each pass
  CUPTI_CALL(cuptiMetricCreateEventGroupSets(context, sizeof(metricId), &metricId, &passData));
  for (pass = 0; pass < passData->numSets; pass++) {
    printf("Pass %u\n", pass);
    metricData.eventGroups = passData->sets + pass;
    //int matrix_result = 
    matrixMultiply(/*argc, argv,*/ block_size, dimsA, dimsB);
  }
  
  // use all the collected events to calculate the metric value
  CUPTI_CALL(cuptiMetricGetValue(device, metricId,
                                 metricData.numEvents * sizeof(CUpti_EventID),
                                 metricData.eventIdArray,
                                 metricData.numEvents * sizeof(uint64_t),
                                 metricData.eventValueArray,
                                 kernelDuration, &metricValue));

  // print metric value, we format based on the value kind
  
    CUpti_MetricValueKind valueKind;
    size_t valueKindSize = sizeof(valueKind);
    CUPTI_CALL(cuptiMetricGetAttribute(metricId, CUPTI_METRIC_ATTR_VALUE_KIND,
                                       &valueKindSize, &valueKind));
    switch (valueKind) {
    case CUPTI_METRIC_VALUE_KIND_DOUBLE:
      printf("Metric %s = %f\n", metricName, metricValue.metricValueDouble);
      break;
    case CUPTI_METRIC_VALUE_KIND_UINT64:
      printf("Metric %s = %llu\n", metricName,
             (unsigned long long)metricValue.metricValueUint64);
      break;
    case CUPTI_METRIC_VALUE_KIND_INT64:
      printf("Metric %s = %lld\n", metricName,
             (long long)metricValue.metricValueInt64);
      break;
    case CUPTI_METRIC_VALUE_KIND_PERCENT:
      printf("Metric %s = %f%%\n", metricName, metricValue.metricValuePercent);
      break;
    case CUPTI_METRIC_VALUE_KIND_THROUGHPUT:
      printf("Metric %s = %llu bytes/sec\n", metricName,
             (unsigned long long)metricValue.metricValueThroughput);
      break;
    case CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL:
      printf("Metric %s = utilization level %u\n", metricName,
             (unsigned int)metricValue.metricValueUtilizationLevel);
      break;
    default:
      fprintf(stderr, "error: unknown value kind\n");
      exit(-1);
    }

CUPTI_CALL(cuptiUnsubscribe(subscriber));

  exit(matrix_result);
}

Any ideas would be helpful. Thanks