Hi everyone,
I recently started working with the CUDA Graph API, and I have run into some issues while trying to measure the execution time of one part of my graph.
As suggested in another post, I tried putting two timing events in the graph with cudaEventRecord() and then calling cudaEventElapsedTime() after the graph execution.
However, I can’t seem to get this implementation right. Events captured inside the graph with cudaEventRecord() cause a cudaErrorInvalidValue error when cudaEventElapsedTime() is called right after the graph execution.
I tried replacing cudaEventRecord() with cudaEventRecordWithFlags(event, stream, cudaEventRecordExternal), but this unexpectedly slows down the graph execution considerably.
Am I doing something wrong? I’m not sure that events can be timed inside a graph the same way I time them in «regular» CUDA programming. Thanks.
Example:
// Times one kernel inside a stream-captured CUDA graph using a pair of events.
// NOTE(review): `stream`, `maxIter`, `times`, `checkCudaErrors` and both kernels
// are declared outside this snippet; `times` is presumably a float array with at
// least `maxIter` elements — confirm against the surrounding code.
cudaGraph_t cuGraph;
cudaEvent_t start;
cudaEvent_t stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
int count = 0;
checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
randomKernel<<<1,1,0,stream>>>(…);
// A plain cudaEventRecord() issued during capture only marks a dependency inside
// the capture graph; the event itself is never recorded when the graph runs, so
// cudaEventElapsedTime() on it fails. cudaEventRecordExternal (CUDA 11.1+) makes
// the capture insert a real event-record node that timestamps on every launch.
checkCudaErrors(cudaEventRecordWithFlags(start, stream, cudaEventRecordExternal));
kernelWeWantToMeasure<<<1,1,0,stream>>>(…);
checkCudaErrors(cudaEventRecordWithFlags(stop, stream, cudaEventRecordExternal));
checkCudaErrors(cudaStreamEndCapture(stream, &cuGraph));
cudaGraphExec_t graphExec = NULL;
checkCudaErrors(cudaGraphInstantiate(&graphExec, cuGraph, NULL, NULL, 0));
//cudaGraphDebugDotPrint(cuGraph, "debugGraphTimer.txt", 0);
// The executable graph no longer needs the template graph once instantiated.
checkCudaErrors(cudaGraphDestroy(cuGraph));
for (int k = 0; k < maxIter; k++) {
    checkCudaErrors(cudaGraphLaunch(graphExec, stream));
    // Wait for the launch to finish so both events have completed before reading them.
    checkCudaErrors(cudaStreamSynchronize(stream));
    // Elapsed milliseconds between the two event-record nodes for iteration k.
    checkCudaErrors(cudaEventElapsedTime(&times[k], start, stop));
}