Measure metrics from a CUDA binary?

I’ve downloaded the callback_metrics sample of CUPTI and I’ve executed it. All works fine but when I change the runPass() function and run system("./runPass") insted (where “./runPass” is an external executable with the same code that the runPass() function included in the sample) the program doesn’t run. ¿Is there are any way to measure metrics in a CUPTI script from a CUDA binary?

The code below is callback_metrics sample with my modifications to run and profile other binaries. “./runPass” contains the same VecAdd kernel and all the code of runPass in its main.

CUDA version 8.0, drivers-384 and Ubuntu 14 are installed.
This is the terminal output:
Screenshot from 2020-09-07 12-45-29
/*
* Copyright 2011-2015 NVIDIA Corporation. All rights reserved
*
* Sample app to demonstrate use of CUPTI library to obtain metric values
* using callbacks for CUDA runtime APIs
*
*/

#include <stdio.h>
#include <cuda.h>
#include <cupti.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/types.h>


#define METRIC_NAME_TESLA "branch_efficiency"
#define METRIC_NAME_FERMI "ipc"

#define DRIVER_API_CALL(apiFuncCall)                                           \
do {                                                                           \
    CUresult _status = apiFuncCall;                                            \
    if (_status != CUDA_SUCCESS) {                                             \
        fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n",   \
                __FILE__, __LINE__, #apiFuncCall, _status);                    \
        exit(-1);                                                              \
    }                                                                          \
} while (0)

#define RUNTIME_API_CALL(apiFuncCall)                                          \
do {                                                                           \
    cudaError_t _status = apiFuncCall;                                         \
    if (_status != cudaSuccess) {                                              \
        fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n",   \
                __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
        exit(-1);                                                              \
    }                                                                          \
} while (0)

#define CUPTI_CALL(call)                                                \
  do {                                                                  \
    CUptiResult _status = call;                                         \
    if (_status != CUPTI_SUCCESS) {                                     \
      const char *errstr;                                               \
      cuptiGetResultString(_status, &errstr);                           \
      fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
              __FILE__, __LINE__, #call, errstr);                       \
      exit(-1);                                                         \
    }                                                                   \
  } while (0)

#define ALIGN_SIZE (8)
#define ALIGN_BUFFER(buffer, align)                                            \
  (((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer))

// User data for event collection callback
typedef struct MetricData_st {
  // the device where metric is being collected
  CUdevice device;
  // the set of event groups to collect for a pass
  CUpti_EventGroupSet *eventGroups;
  // the current number of events collected in eventIdArray and
  // eventValueArray
  uint32_t eventIdx;
  // the number of entries in eventIdArray and eventValueArray
  uint32_t numEvents;
  // array of event ids
  CUpti_EventID *eventIdArray;
  // array of event values
  uint64_t *eventValueArray;
} MetricData_t;

static uint64_t kernelDuration;

// Device code
__global__ void VecAdd(const int* A, const int* B, int* C, int N)
{
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < N)
    C[i] = A[i] + B[i];
}

static void
initVec(int *vec, int n)
{
  for (int i=0; i< n; i++)
    vec[i] = i;
}

void CUPTIAPI
getMetricValueCallback(void *userdata, CUpti_CallbackDomain domain,
                       CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo)
{
  MetricData_t *metricData = (MetricData_t*)userdata;
  unsigned int i, j, k;
	printf("Entro en el callback\n");
  // This callback is enabled only for launch so we shouldn't see
  // anything else.
  if (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) {
    printf("%s:%d: unexpected cbid %d\n", __FILE__, __LINE__, cbid);
    exit(-1);
  }

  // on entry, enable all the event groups being collected this pass,
  // for metrics we collect for all instances of the event
  if (cbInfo->callbackSite == CUPTI_API_ENTER) {
    cudaDeviceSynchronize();

    CUPTI_CALL(cuptiSetEventCollectionMode(cbInfo->context,
                                           CUPTI_EVENT_COLLECTION_MODE_KERNEL));

    for (i = 0; i < metricData->eventGroups->numEventGroups; i++) {
      uint32_t all = 1;
      CUPTI_CALL(cuptiEventGroupSetAttribute(metricData->eventGroups->eventGroups[i],
                                             CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,
                                             sizeof(all), &all));
      CUPTI_CALL(cuptiEventGroupEnable(metricData->eventGroups->eventGroups[i]));
    }
  }

  // on exit, read and record event values
  if (cbInfo->callbackSite == CUPTI_API_EXIT) {
    cudaDeviceSynchronize();

    // for each group, read the event values from the group and record
    // in metricData
    for (i = 0; i < metricData->eventGroups->numEventGroups; i++) {
      CUpti_EventGroup group = metricData->eventGroups->eventGroups[i];
      CUpti_EventDomainID groupDomain;
      uint32_t numEvents, numInstances, numTotalInstances;
      CUpti_EventID *eventIds;
      size_t groupDomainSize = sizeof(groupDomain);
      size_t numEventsSize = sizeof(numEvents);
      size_t numInstancesSize = sizeof(numInstances);
      size_t numTotalInstancesSize = sizeof(numTotalInstances);
      uint64_t *values, normalized, sum;
      size_t valuesSize, eventIdsSize;

      CUPTI_CALL(cuptiEventGroupGetAttribute(group,
                                             CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID,
                                             &groupDomainSize, &groupDomain));
      CUPTI_CALL(cuptiDeviceGetEventDomainAttribute(metricData->device, groupDomain,
                                                    CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT,
                                                    &numTotalInstancesSize, &numTotalInstances));
      CUPTI_CALL(cuptiEventGroupGetAttribute(group,
                                             CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT,
                                             &numInstancesSize, &numInstances));
      CUPTI_CALL(cuptiEventGroupGetAttribute(group,
                                             CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS,
                                             &numEventsSize, &numEvents));
      eventIdsSize = numEvents * sizeof(CUpti_EventID);
      eventIds = (CUpti_EventID *)malloc(eventIdsSize);
      CUPTI_CALL(cuptiEventGroupGetAttribute(group,
                                             CUPTI_EVENT_GROUP_ATTR_EVENTS,
                                             &eventIdsSize, eventIds));

      valuesSize = sizeof(uint64_t) * numInstances;
      values = (uint64_t *)malloc(valuesSize);

      for (j = 0; j < numEvents; j++) {
        CUPTI_CALL(cuptiEventGroupReadEvent(group, CUPTI_EVENT_READ_FLAG_NONE,
                                            eventIds[j], &valuesSize, values));
        if (metricData->eventIdx >= metricData->numEvents) {
          fprintf(stderr, "error: too many events collected, metric expects only %d\n",
                  (int)metricData->numEvents);
          exit(-1);
        }

        // sum collect event values from all instances
        sum = 0;
        for (k = 0; k < numInstances; k++)
          sum += values[k];

        // normalize the event value to represent the total number of
        // domain instances on the device
        normalized = (sum * numTotalInstances) / numInstances;

        metricData->eventIdArray[metricData->eventIdx] = eventIds[j];
        metricData->eventValueArray[metricData->eventIdx] = normalized;
        metricData->eventIdx++;

        // print collected value
        {
          char eventName[128];
          size_t eventNameSize = sizeof(eventName) - 1;
          CUPTI_CALL(cuptiEventGetAttribute(eventIds[j], CUPTI_EVENT_ATTR_NAME,
                                            &eventNameSize, eventName));
          eventName[127] = '\0';
          printf("\t%s = %llu (", eventName, (unsigned long long)sum);
          if (numInstances > 1) {
            for (k = 0; k < numInstances; k++) {
              if (k != 0)
                printf(", ");
              printf("%llu", (unsigned long long)values[k]);
            }
          }

          printf(")\n");
          printf("\t%s (normalized) (%llu * %u) / %u = %llu\n",
                 eventName, (unsigned long long)sum,
                 numTotalInstances, numInstances,
                 (unsigned long long)normalized);
        }
      }

      free(values);
    }

    for (i = 0; i < metricData->eventGroups->numEventGroups; i++)
      CUPTI_CALL(cuptiEventGroupDisable(metricData->eventGroups->eventGroups[i]));
  }
}

static void
cleanUp(int *h_A, int *h_B, int *h_C, int *d_A, int *d_B, int *d_C)
{
  if (d_A)
    cudaFree(d_A);
  if (d_B)
    cudaFree(d_B);
  if (d_C)
    cudaFree(d_C);

  // Free host memory
  if (h_A)
    free(h_A);
  if (h_B)
    free(h_B);
  if (h_C)
    free(h_C);
}

/*static void
runPass()
{
  int N = 50000;
  size_t size = N * sizeof(int);
  int threadsPerBlock = 0;
  int blocksPerGrid = 0;
  int *h_A, *h_B, *h_C;
  int *d_A, *d_B, *d_C;
  int i, sum;

  // Allocate input vectors h_A and h_B in host memory
  h_A = (int*)malloc(size);
  h_B = (int*)malloc(size);
  h_C = (int*)malloc(size);

  // Initialize input vectors
  initVec(h_A, N);
  initVec(h_B, N);
  memset(h_C, 0, size);

  // Allocate vectors in device memory
  cudaMalloc((void**)&d_A, size);
  cudaMalloc((void**)&d_B, size);
  cudaMalloc((void**)&d_C, size);

  // Copy vectors from host memory to device memory
  cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

  // Invoke kernel
  threadsPerBlock = 256;
  blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
  printf("Launching kernel: blocks %d, thread/block %d\n",
         blocksPerGrid, threadsPerBlock);

  VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

  // Copy result from device memory to host memory
  // h_C contains the result in host memory
  cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

  // Verify result
  for (i = 0; i < N; ++i) {
    sum = h_A[i] + h_B[i];
    if (h_C[i] != sum) {
      fprintf(stderr, "error: result verification failed\n");
      exit(-1);
    }
  }

  cleanUp(h_A, h_B, h_C, d_A, d_B, d_C);
}*/

static void CUPTIAPI
bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords)
{
  uint8_t *rawBuffer;

  *size = 16 * 1024;
  rawBuffer = (uint8_t *)malloc(*size + ALIGN_SIZE);

  *buffer = ALIGN_BUFFER(rawBuffer, ALIGN_SIZE);
  *maxNumRecords = 0;

  if (*buffer == NULL) {
    printf("Error: out of memory\n");
    exit(-1);
  }
}

static void CUPTIAPI
bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize)
{
  CUpti_Activity *record = NULL;
  CUpti_ActivityKernel3 *kernel;

  //since we launched only 1 kernel, we should have only 1 kernel record
  CUPTI_CALL(cuptiActivityGetNextRecord(buffer, validSize, &record));

  kernel = (CUpti_ActivityKernel3 *)record;
  if (kernel->kind != CUPTI_ACTIVITY_KIND_KERNEL) {
    fprintf(stderr, "Error: expected kernel activity record, got %d\n", (int)kernel->kind);
    exit(-1);
  }

  kernelDuration = kernel->end - kernel->start;
  free(buffer);
}

int
main(int argc, char *argv[])
{
  CUpti_SubscriberHandle subscriber;
  CUcontext context = 0;
  CUdevice device = 0;
  int computeCapabilityMajor=0;
  int computeCapabilityMinor=0;
  int deviceNum;
  int deviceCount;
  char deviceName[32];
  const char *metricName;
  CUpti_MetricID metricId;
  CUpti_EventGroupSets *passData;
  MetricData_t metricData;
  unsigned int pass;
  CUpti_MetricValue metricValue;

  printf("Usage: %s [device_num] [metric_name]\n", argv[0]);

  // make sure activity is enabled before any CUDA API
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));

  DRIVER_API_CALL(cuInit(0));
  DRIVER_API_CALL(cuDeviceGetCount(&deviceCount));
  if (deviceCount == 0) {
    printf("There is no device supporting CUDA.\n");
    return -2;
  }

  if (argc > 1)
    deviceNum = atoi(argv[1]);
  else
    deviceNum = 0;
  printf("CUDA Device Number: %d\n", deviceNum);

  DRIVER_API_CALL(cuDeviceGet(&device, deviceNum));
  DRIVER_API_CALL(cuDeviceGetName(deviceName, 32, device));
  printf("CUDA Device Name: %s\n", deviceName);

  DRIVER_API_CALL(cuDeviceComputeCapability(&computeCapabilityMajor,
                                            &computeCapabilityMinor,
                                            device));
  DRIVER_API_CALL(cuCtxCreate(&context, 0, device));


  // Get the name of the metric to collect
  if (argc > 2)
    metricName = argv[2];
  else {
    if (computeCapabilityMajor > 1) {
      metricName = METRIC_NAME_FERMI;
    }
    else {
      metricName = METRIC_NAME_TESLA;
    }
  }

  // need to collect duration of kernel execution without any event
  // collection enabled (some metrics need kernel duration as part of
  // calculation). The only accurate way to do this is by using the
  // activity API.
  {
    CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
  //  runPass();
	pid_t child_pid, wpid;
	int status = 0;
	if((child_pid =fork()) == 0){
		system("./runpass");
		exit(0);
	}
	while((wpid = wait (&status)) > 0);
	
    cudaDeviceSynchronize();
    CUPTI_CALL(cuptiActivityFlushAll(0));
  }

  // setup launch callback for event collection
  CUPTI_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)getMetricValueCallback, &metricData));
  CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API,
                                 CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020));

  // allocate space to hold all the events needed for the metric
  CUPTI_CALL(cuptiMetricGetIdFromName(device, metricName, &metricId));
  CUPTI_CALL(cuptiMetricGetNumEvents(metricId, &metricData.numEvents));
  metricData.device = device;
  metricData.eventIdArray = (CUpti_EventID *)malloc(metricData.numEvents * sizeof(CUpti_EventID));
  metricData.eventValueArray = (uint64_t *)malloc(metricData.numEvents * sizeof(uint64_t));
  metricData.eventIdx = 0;

  // get the number of passes required to collect all the events
  // needed for the metric and the event groups for each pass
  CUPTI_CALL(cuptiMetricCreateEventGroupSets(context, sizeof(metricId), &metricId, &passData));
  for (pass = 0; pass < passData->numSets; pass++) {
    printf("Pass %u\n", pass);
    metricData.eventGroups = passData->sets + pass;
  //  runPass();

		pid_t child_pid, wpid;
		int status = 0;
		if((child_pid =fork()) == 0){
			system("./runpass");
			exit(0);
		}
		while((wpid = wait (&status)) > 0);
  }

  if (metricData.eventIdx != metricData.numEvents) {
    fprintf(stderr, "error: expected %u metric events, got %u\n",
            metricData.numEvents, metricData.eventIdx);
    exit(-1);
  }

  // use all the collected events to calculate the metric value
  CUPTI_CALL(cuptiMetricGetValue(device, metricId,
                                 metricData.numEvents * sizeof(CUpti_EventID),
                                 metricData.eventIdArray,
                                 metricData.numEvents * sizeof(uint64_t),
                                 metricData.eventValueArray,
                                 kernelDuration, &metricValue));

  // print metric value, we format based on the value kind
  {
    CUpti_MetricValueKind valueKind;
    size_t valueKindSize = sizeof(valueKind);
    CUPTI_CALL(cuptiMetricGetAttribute(metricId, CUPTI_METRIC_ATTR_VALUE_KIND,
                                       &valueKindSize, &valueKind));
    switch (valueKind) {
    case CUPTI_METRIC_VALUE_KIND_DOUBLE:
      printf("Metric %s = %f\n", metricName, metricValue.metricValueDouble);
      break;
    case CUPTI_METRIC_VALUE_KIND_UINT64:
      printf("Metric %s = %llu\n", metricName,
             (unsigned long long)metricValue.metricValueUint64);
      break;
    case CUPTI_METRIC_VALUE_KIND_INT64:
      printf("Metric %s = %lld\n", metricName,
             (long long)metricValue.metricValueInt64);
      break;
    case CUPTI_METRIC_VALUE_KIND_PERCENT:
      printf("Metric %s = %f%%\n", metricName, metricValue.metricValuePercent);
      break;
    case CUPTI_METRIC_VALUE_KIND_THROUGHPUT:
      printf("Metric %s = %llu bytes/sec\n", metricName,
             (unsigned long long)metricValue.metricValueThroughput);
      break;
    case CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL:
      printf("Metric %s = utilization level %u\n", metricName,
             (unsigned int)metricValue.metricValueUtilizationLevel);
      break;
    default:
      fprintf(stderr, "error: unknown value kind\n");
      exit(-1);
    }
  }

  CUPTI_CALL(cuptiUnsubscribe(subscriber));
  return 0;
}