If it were me, I would not use a callback for timing:
$ cat t2084.cu
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

const unsigned long long my_delay = 1000000ULL;

// host wall-clock timer, in microseconds
unsigned long long dtime_usec(unsigned long long start=0){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

// busy-wait on the GPU for approximately 'delay' clock cycles
__global__ void k(unsigned long long delay){
  unsigned long long start = clock64();
  unsigned long long end = start+delay;
  while (clock64() < end);
}

struct d{
  unsigned long long start;
} my_parms;

// host callback: prints elapsed time since parms->start
void dt_cpu(void *parms){
  d *p = (d *) parms;
  std::cout << "CB Duration: " << dtime_usec(p->start) << "us" << std::endl;
}

int main(){
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaStream_t s;
  cudaStreamCreate(&s);
  k<<<1,1, 0, s>>>(my_delay); // warm-up
  cudaDeviceSynchronize();
#if defined USE_EVENT
  cudaEventRecord(start, s);
#elif defined USE_CB
  my_parms.start = dtime_usec(0);
#else
  unsigned long long scpu = dtime_usec(0);
#endif
  k<<<1,1, 0, s>>>(my_delay);
#if defined USE_EVENT
  cudaEventRecord(stop, s);
  cudaEventSynchronize(stop);
  float ms;
  cudaEventElapsedTime(&ms, start, stop);
  std::cout << "EV Duration: " << (ms*1000) << "us" << std::endl;
#elif defined USE_CB
  cudaLaunchHostFunc(s, dt_cpu, &my_parms);
#else
  cudaDeviceSynchronize();
  std::cout << "CPU Duration: " << dtime_usec(scpu) << "us" << std::endl;
#endif
  cudaDeviceSynchronize();
}
$ nvcc -o t2084 t2084.cu
$ ./t2084
CPU Duration: 880us
$ nvcc -o t2084 t2084.cu -DUSE_EVENT
$ ./t2084
EV Duration: 823.84us
$ nvcc -o t2084 t2084.cu -DUSE_CB
$ ./t2084
CB Duration: 1096us
$
CUDA 11.4, V100, CentOS7
About 99% of the time, I use timing measurements like this for comparative purposes, e.g. to test the benefit of a presumed optimization. For that purpose, the first two methods (host-side timing and events, not the callback) appear able to “resolve” a ~1% change in kernel duration, at least for this test case. Modifying the code to:
const unsigned long long my_delay = 1010000ULL;
I get the following output:
$ nvcc -o t2084 t2084.cu
$ ./t2084
CPU Duration: 896us
$ nvcc -o t2084 t2084.cu -DUSE_EVENT
$ ./t2084
EV Duration: 827.52us
$ nvcc -o t2084 t2084.cu -DUSE_CB
$ ./t2084
CB Duration: 1083us
$
For completeness, nvprof reports for the original duration:
$ nvprof ./t2084
==7309== NVPROF is profiling process 7309, command: ./t2084
CPU Duration: 874us
==7309== Profiling application: ./t2084
==7309== Profiling result:
            Type  Time(%)      Time  Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  1.6225ms      2  811.24us  811.16us  811.32us  k(__int64)
and for the 1% increased duration:
$ nvprof ./t2084
==6697== NVPROF is profiling process 6697, command: ./t2084
CPU Duration: 879us
==6697== Profiling application: ./t2084
==6697== Profiling result:
            Type  Time(%)      Time  Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  1.6386ms      2  819.32us  819.22us  819.41us  k(__int64)
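Summarizing the deltas between the two runs, using the nvprof kernel averages as the reference for the injected change:

nvprof kernel avg:  811.24us -> 819.32us  (~ +1.0%, the injected change)
CPU method:         880us    -> 896us     (~ +1.8%)
event method:       823.84us -> 827.52us  (~ +0.45%)
callback method:    1096us   -> 1083us    (~ -1.2%)

The CPU and event measurements move in the same direction as the injected change; the callback measurement does not.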
YMMV
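If you do this kind of comparative measurement often, the event-based path can be wrapped in a small reusable helper. This is just a sketch with names of my own choosing (not part of the program above), and it omits error checking:

#include <cuda_runtime.h>

// Hypothetical helper wrapping the event-based method above.
// begin() records the start event on the chosen stream; end_us()
// records the stop event, waits for it, and returns microseconds.
struct GpuTimer {
  cudaEvent_t start, stop;
  cudaStream_t stream;
  GpuTimer(cudaStream_t s = 0) : stream(s) {
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
  }
  ~GpuTimer() {
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
  }
  void begin() { cudaEventRecord(start, stream); }
  float end_us() {
    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);
    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    return ms * 1000.0f;
  }
};

// usage, with the stream s from the program above:
//   GpuTimer t(s);
//   t.begin();
//   k<<<1,1,0,s>>>(my_delay);
//   std::cout << "EV Duration: " << t.end_us() << "us" << std::endl;

As with the measurements above, the number is most meaningful when comparing two variants of the same kernel under the same conditions, rather than as an absolute figure.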