cudaLaunchHostFunc + cudaEventElapsedTime?

If it were me, I would not use a callback for timing:

$ cat t2084.cu
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

const unsigned long long my_delay = 1000000ULL;

unsigned long long dtime_usec(unsigned long long start=0){
  // host-side timestamp in microseconds; pass a previous value to get a delta
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

__global__ void k(unsigned long long delay){
  // spin on the device clock for approximately 'delay' clock cycles
  unsigned long long start = clock64();
  unsigned long long end = start+delay;
  while (clock64() < end);
}

struct d{
  unsigned long long start;
} my_parms;

// host function enqueued via cudaLaunchHostFunc; runs when the stream reaches it
void dt_cpu(void *parms){
  d *p = (d *) parms;
  std::cout << "CB Duration: " << dtime_usec(p->start) << "us" << std::endl;
}

int main(){
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaStream_t s;
  cudaStreamCreate(&s);
  k<<<1,1, 0, s>>>(my_delay); // warm-up
  cudaDeviceSynchronize();
#if defined USE_EVENT
  cudaEventRecord(start, s);               // event-based timing
#elif defined USE_CB
  my_parms.start = dtime_usec(0);          // callback-based timing
#else
  unsigned long long scpu = dtime_usec(0); // host-based timing
#endif
  k<<<1,1, 0, s>>>(my_delay);
#if defined USE_EVENT
  cudaEventRecord(stop, s);
  cudaEventSynchronize(stop);
  float ms;
  cudaEventElapsedTime(&ms, start, stop);
  std::cout << "EV Duration: " << (ms*1000) << "us" << std::endl;
#elif defined USE_CB
  cudaLaunchHostFunc(s, dt_cpu, &my_parms);
#else
  cudaDeviceSynchronize();
  std::cout << "CPU Duration: " << dtime_usec(scpu) << "us" << std::endl;
#endif
  cudaDeviceSynchronize();
}
$ nvcc -o t2084 t2084.cu
$ ./t2084
CPU Duration: 880us
$ nvcc -o t2084 t2084.cu -DUSE_EVENT
$ ./t2084
EV Duration: 823.84us
$ nvcc -o t2084 t2084.cu -DUSE_CB
$ ./t2084
CB Duration: 1096us
$

CUDA 11.4, V100, CentOS7
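
As an aside, dtime_usec relies on POSIX gettimeofday (sys/time.h), so it won't build as-is on Windows. If that matters to you, a std::chrono-based equivalent (a minimal sketch, not something I tested on the setup above) could look like:

#include <chrono>

// hypothetical portable replacement for dtime_usec(); returns microseconds
// since the steady clock's epoch, minus an optional start value
unsigned long long dtime_usec_chrono(unsigned long long start=0){
  auto now = std::chrono::steady_clock::now().time_since_epoch();
  return std::chrono::duration_cast<std::chrono::microseconds>(now).count()-start;
}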

About 99% of the time, I use timing measurements like this for comparative purposes, e.g. to test the benefit of a presumed optimization. In that case, the first two methods (host-based and event-based, not the callback) appear able to “resolve” a ~1% change in kernel duration for this test case; a sketch of how to push the resolution further follows the second set of results below. Modifying the delay constant to:

const unsigned long long my_delay = 1010000ULL;

I get the following output:

$ nvcc -o t2084 t2084.cu
$ ./t2084
CPU Duration: 896us
$ nvcc -o t2084 t2084.cu -DUSE_EVENT
$ ./t2084
EV Duration: 827.52us
$ nvcc -o t2084 t2084.cu -DUSE_CB
$ ./t2084
CB Duration: 1083us
$
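
If you need to resolve differences smaller than ~1%, one common refinement (a sketch under my assumptions; I have not benchmarked it here) is to bracket N launches with a single event pair and divide, amortizing the launch and record overhead:

const int N = 100;   // hypothetical repetition count
cudaEventRecord(start, s);
for (int i = 0; i < N; i++)
  k<<<1,1,0,s>>>(my_delay);
cudaEventRecord(stop, s);
cudaEventSynchronize(stop);
float ms;
cudaEventElapsedTime(&ms, start, stop);
std::cout << "Avg Duration: " << (ms*1000/N) << "us" << std::endl;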

For completeness, here is what nvprof reports for the original delay:

$ nvprof ./t2084
==7309== NVPROF is profiling process 7309, command: ./t2084
CPU Duration: 874us
==7309== Profiling application: ./t2084
==7309== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  1.6225ms         2  811.24us  811.16us  811.32us  k(__int64)

and for the 1% increased duration:

$ nvprof ./t2084
==6697== NVPROF is profiling process 6697, command: ./t2084
CPU Duration: 879us
==6697== Profiling application: ./t2084
==6697== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  1.6386ms         2  819.32us  819.22us  819.41us  k(__int64)
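
As a side note, nvprof is not supported on newer GPU architectures; Nsight Systems can produce similar kernel duration statistics, e.g.:

$ nsys profile --stats=true ./t2084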

YMMV
