CUDA graphs error with cudaEventElapsedTime (cudaErrorInvalidValue)

Hi everyone,

I recently started to work with the CUDA graph API. However, I bumped into some issues while trying to measure execution time of a part of my graph.
As suggested in another post, I tried to put two timing events in the graph with cudaEventRecord() and then call cudaEventElapsedTime() after the graph execution.

However, I can’t get this implementation to work: events recorded inside the captured graph cause cudaEventElapsedTime() to return cudaErrorInvalidValue when it is called right after the graph execution.
I tried replacing cudaEventRecord() with cudaEventRecordWithFlags(event, stream, cudaEventRecordExternal), but this unexpectedly slows down the graph execution considerably.

Am I doing something wrong? I’m not sure whether events inside a graph can be timed the same way as in «regular» CUDA programming. Thanks.

Example :

// Reproducer from the question: two timing events are recorded with plain
// cudaEventRecord() while the stream is being captured into a graph, and the
// elapsed time between them is queried after every graph launch.
// `stream`, `maxIter` and `times` are declared outside this snippet.
cudaGraph_t cuGraph;

cudaEvent_t start;
cudaEvent_t stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
int count = 0;  // not used anywhere in the visible snippet

// Start capturing the work issued on `stream` into a graph.
checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));

randomKernel<<<1,1,0,stream>>>(…);

// NOTE(review): this is the call the thread is about. The question reports that
// events recorded this way make cudaEventElapsedTime() fail after the launch;
// the reply uses cudaEventRecordWithFlags(..., cudaEventRecordExternal) instead.
checkCudaErrors(cudaEventRecord(start, stream));

kernelWeWantToMeasure<<<1,1,0,stream>>>(…);

// Same remark as for `start` above.
checkCudaErrors(cudaEventRecord(stop, stream));

// End the capture; the captured work is returned in cuGraph.
checkCudaErrors(cudaStreamEndCapture(stream, &cuGraph));

cudaGraphExec_t graphExec = NULL;

// Build the executable graph; no error node or log buffer is requested.
checkCudaErrors(cudaGraphInstantiate(&graphExec, cuGraph, NULL, NULL, 0));

//cudaGraphDebugDotPrint(cuGraph, "debugGraphTimer.txt", 0);

// Only graphExec is used from here on, so the graph template is released.
checkCudaErrors(cudaGraphDestroy(cuGraph));

for (int k = 0; k < maxIter; k++) {
checkCudaErrors(cudaGraphLaunch(graphExec, stream));
// Wait for the launch to finish before reading the events.
checkCudaErrors(cudaStreamSynchronize(stream));
// This is the call that the question reports as failing.
checkCudaErrors(cudaEventElapsedTime(&times[k], start, stop));
}

1 Like

Using cudaEventRecordWithFlags() with cudaEventRecordExternal seems to be the way to go.

according to my simple test, it adds about 5us to the graph execution:

$ cat t1975.cu
#include <helper_cuda.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// Host-side microsecond timer built on gettimeofday().
// Returns the current time of day in microseconds minus `start`.
// Typical use: take a timestamp with dtime_usec(0), run the work, then pass
// that timestamp back in to get the microseconds elapsed since it was taken.
unsigned long long dtime_usec(unsigned long long start=0){
  timeval now;
  gettimeofday(&now, 0);
  // USECPSEC is unsigned long long, so the arithmetic below is done in that type.
  const unsigned long long seconds_as_usec = now.tv_sec*USECPSEC;
  const unsigned long long total_usec = seconds_as_usec + now.tv_usec;
  return total_usec - start;
}
// Placeholder kernel: takes no arguments and has an empty body.
__global__ void randomKernel() {}

// Captures two empty kernel launches on a stream into a CUDA graph, launches the
// instantiated graph maxIter times, and prints the host-measured time of each
// launch (graph launch + stream synchronize) in microseconds.
//
// When compiled with -DUSE_EVENT, a start/stop event pair is recorded around the
// second kernel with cudaEventRecordWithFlags(..., cudaEventRecordExternal) and
// the elapsed time between the two events is printed after every launch.
// NOTE(review): cudaEventRecordExternal is what makes the events usable with
// cudaEventElapsedTime() after the graph has run; the thread reports that plain
// cudaEventRecord() during capture does not work for this. Confirm against the
// CUDA runtime documentation for the toolkit version in use.
int main(){
  const int maxIter=2;
  cudaGraph_t cuGraph;
  cudaStream_t stream;
  checkCudaErrors(cudaStreamCreate(&stream));
#ifdef USE_EVENT
  float times[maxIter];
  cudaEvent_t start;
  cudaEvent_t stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));
#endif
  // Capture the work issued on `stream` into cuGraph.
  checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));

  randomKernel<<<1,1,0,stream>>>();
  // Kernel launches return no status themselves; pick up launch errors explicitly.
  checkCudaErrors(cudaGetLastError());
#ifdef USE_EVENT
  checkCudaErrors(cudaEventRecordWithFlags(start, stream,cudaEventRecordExternal));
#endif
  randomKernel<<<1,1,0,stream>>>();
  checkCudaErrors(cudaGetLastError());
#ifdef USE_EVENT
  checkCudaErrors(cudaEventRecordWithFlags(stop, stream,cudaEventRecordExternal));
#endif
  checkCudaErrors(cudaStreamEndCapture(stream, &cuGraph));

  cudaGraphExec_t graphExec = NULL;

  // Build the executable graph; no error node or log buffer is requested.
  checkCudaErrors(cudaGraphInstantiate(&graphExec, cuGraph, NULL, NULL, 0));

  //cudaGraphDebugDotPrint(cuGraph, "debugGraphTimer.txt", 0);

  // Only graphExec is used from here on, so the graph template can be released.
  checkCudaErrors(cudaGraphDestroy(cuGraph));
  for (int k = 0; k < maxIter; k++) {
    // Host-side timing covers the launch plus the wait for completion.
    long long dt = dtime_usec(0);
    checkCudaErrors(cudaGraphLaunch(graphExec, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));
    dt = dtime_usec(dt);
    std::cout << "graph elapsed time: " << dt  << "us" << std::endl;
#ifdef USE_EVENT
    // The stream was synchronized above, so both events have completed.
    checkCudaErrors(cudaEventElapsedTime(&times[k], start, stop));
    std::cout << "event elapsed time: " <<  times[k] << "ms" <<  std::endl;
#endif
  }

  // Release everything created above instead of relying on process teardown.
  checkCudaErrors(cudaGraphExecDestroy(graphExec));
#ifdef USE_EVENT
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));
#endif
  checkCudaErrors(cudaStreamDestroy(stream));
  return 0;
}
$ nvcc -o t1975 t1975.cu -I/usr/local/cuda/samples/common/inc -DUSE_EVENT
$ ./t1975
graph elapsed time: 49us
event elapsed time: 0.004992ms
graph elapsed time: 19us
event elapsed time: 0.004192ms
$ nvcc -o t1975 t1975.cu -I/usr/local/cuda/samples/common/inc
$ ./t1975
graph elapsed time: 44us
graph elapsed time: 14us
$
2 Likes

You are right! Using cudaEventRecordWithFlags seems to be the way to go. I should have used a simpler test case such as yours. In my program, it is a host callback which seems to be slowing down my pipeline, but this is not relevant in this thread.

Thank you for your fast reply!

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.