Hi, I’m testing the CUDA 12 Device Graph Launch feature and I came across an issue with a quite simple case.
When executing the following code, the call to cudaStreamSynchronize after the loop never returns. This happens when the number of iterations is 2 or above. The number of times empty_kernel is called is not constant; some tests showed values between 0 and 3.
When a sleep (100 ms or more) is added in the for loop, the executable works fine. This seems to indicate there is concurrent access to the exec graph, which is forbidden. However, I would have thought that the CUDA stream would have guaranteed sequential execution.
#include <cstdio>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Abort the process with a diagnostic message if a CUDA runtime call
// returned anything other than cudaSuccess. Invoked through the
// gpuErrchk macro so the failing file/line are reported.
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code == cudaSuccess)
        return;
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    exit(code);
}
// No-op kernel; it is the single node captured into the device-launched graph
// below, so its executions mark each completed launch of that graph.
__global__ void empty_kernel() { }
// Device-side relaunch: enqueue `exec` as a tail launch, i.e. it runs after
// the currently executing graph completes. `exec` must have been instantiated
// with cudaGraphInstantiateFlagDeviceLaunch and uploaded to the device.
// Defect fixed: the original discarded cudaGraphLaunch's return code. A
// device-side tail launch can fail (for instance if the target device graph
// is still pending from a previous launch), and a silently dropped launch
// would explain empty_kernel running fewer times than expected — so report
// any failure via device printf.
__global__ void launch_kernel(cudaGraphExec_t exec) {
    cudaError_t err = cudaGraphLaunch(exec, cudaStreamGraphTailLaunch);
    if (err != cudaSuccess) {
        printf("launch_kernel: device cudaGraphLaunch failed with error %d\n", (int)err);
    }
}
// Repro: host-launches a "launcher" graph 10 times back-to-back; each launcher
// execution tail-launches a device-instantiated graph (empty_kernel). With 2+
// iterations and no host sync between launches, cudaStreamSynchronize hangs.
int main() {
gpuErrchk(cudaSetDevice(0));
cudaStream_t stream;
gpuErrchk(cudaStreamCreate(&stream));
cudaGraphExec_t device_graph_exec;
// create pipeline graph and upload it on device
{
gpuErrchk(cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal));
empty_kernel<<<1, 1, 0, stream>>>();
cudaGraph_t graph;
gpuErrchk(cudaStreamEndCapture(stream, &graph));
// instantiate graph exec on device
gpuErrchk(cudaGraphInstantiate(&device_graph_exec, graph, cudaGraphInstantiateFlagDeviceLaunch));
// Device-launchable graphs must be resident on the device before a
// device-side launch; cudaGraphUpload performs that upload here.
gpuErrchk(cudaGraphUpload(device_graph_exec, stream));
// The graph template is no longer needed once the exec is instantiated.
gpuErrchk(cudaGraphDestroy(graph));
}
cudaGraphExec_t graph_exec;
// create launcher graph: a single launch_kernel node that tail-launches
// device_graph_exec from the device.
{
gpuErrchk(cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal));
launch_kernel<<<1, 1, 0, stream>>>(device_graph_exec);
cudaGraph_t graph;
gpuErrchk(cudaStreamEndCapture(stream, &graph));
// instantiate graph exec (ordinary host-launchable graph, flags = 0)
gpuErrchk(cudaGraphInstantiate(&graph_exec, graph, 0));
gpuErrchk(cudaGraphDestroy(graph));
}
// put the graph on the stream 10 times
// NOTE(review): each host launch of graph_exec triggers a device-side tail
// launch of the same device_graph_exec. If a prior tail launch is still
// pending when the next launch_kernel runs, the same device graph exec would
// be (re)launched concurrently — presumably the forbidden concurrent access
// suspected above; TODO confirm against the device graph launch rules in the
// CUDA programming guide. Uncommenting the per-iteration sync avoids it.
for (int i = 0; i < 10; ++i) {
gpuErrchk(cudaGraphLaunch(graph_exec, stream));
// calling cudaGraphLaunch(device_graph_exec, stream) works when exec created with the flags = 0
// no problem when the following line is uncommented
// gpuErrchk(cudaStreamSynchronize(stream));
}
// Hangs here (never returns) when the loop runs 2+ iterations without syncs.
gpuErrchk(cudaStreamSynchronize(stream));
// clean up
gpuErrchk(cudaGraphExecDestroy(device_graph_exec));
gpuErrchk(cudaGraphExecDestroy(graph_exec));
gpuErrchk(cudaStreamDestroy(stream));
}
The source file was compiled for a Tesla V100 using the following command:
nvcc --generate-code=arch=compute_70,code=[compute_70,sm_70] main.cu -o multi_launch
I’m using CUDA 12.1 on AlmaLinux 8.4 with nvidia driver version 530.30.02.