Hi,
I am using CUDA 10.0 and its graph features and want to add a kernel using dynamic parallelism to the graph:
#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>
#include <cuda_device_runtime_api.h>
__global__ void childKernel(){
if(threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0){
printf("hello from childKernel\n");
}
}
__global__ void parentKernel() {
childKernel<<<dim3(60000,10), 64>>>();
auto ret = cudaDeviceSynchronize();
if(ret != cudaSuccess){
printf("CudaStreamSynchronize failed with %i",ret);
}
printf("done\n");
}
int main() {
cudaGraph_t graph;
cudaGraphCreate(&graph,0);
cudaGraphNode_t node;
cudaKernelNodeParams params;
params.func = (void*) parentKernel;
params.extra = nullptr;
params.gridDim = dim3(1);
params.blockDim = dim3(1);
params.sharedMemBytes = 0;
params.kernelParams = nullptr;
cudaGraphAddKernelNode(&node,graph,nullptr,0,¶ms);
cudaGraphExec_t instance;
cudaGraphInstantiate(&instance,graph,nullptr,nullptr,0);
cudaStream_t myStream;
cudaStreamCreate(&myStream);
for(int i = 0; i < 100000; ++i){
cudaGraphLaunch(instance,myStream);
auto err = cudaStreamSynchronize(myStream);
if (err != cudaSuccess) {
printf("CUDA Error %d occured\n", err);
break;
}
}
cudaGraphExecDestroy(instance);
cudaGraphDestroy(graph);
cudaStreamDestroy(myStream);
return 0;
}
The output of this program is:
hello from childKernel
done
hello from childKernel
CUDA Error 4 occured
The first graph-launch completes successfully, but the second one fails.
For the example+cmake script, please see GitHub - weynaa/cudagraphcdplaunchfailure
I tried this with a GeForce RTX2060 on both Linux and Windows and get the same output.
If i do one of the following steps, the error vanishes:
- remove cudaDeviceSynchronize() from the parent kernel
- create a new cudaGraphInstance for every graph-launch,
- make the launch-size of the child-kernel smaller (e.g. remove the second dimension)
Are limits for child-kernels stricter than the general limit? But if so, how does this only happen if I reuse a graph instance?
Any advice, tips or experience would be greatly appreciated as neither of the fixes I found are viable in my application.
Best,
Michael