I want to understand the specific reason why the kernel nodes in the device graph cannot use dynamic parallelism
// cuda_graph_dynamic_parallelism.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#define XSIZE 32
// CUDA error checking macro
#define CUDA_CHECK(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error in %s (%s:%d): %s\n", \
#call, __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Subkernel for dynamic parallel execution
__global__ void child_kernel(int *data, int size)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size)
{
data[idx] += 1; // Simple plus one operation
}
}
// The parent kernel dynamically starts the child kernel during execution
__global__ void parent_kernel(int *data, int size)
{
// Define startup parameters for the subkernel
int child_threads = 256;
int child_blocks = (size + child_threads - 1) / child_threads;
// Promoter kernel
child_kernel <<<child_blocks, child_threads>>>(data, size);
}
int main()
{
// Initializes the data size
int size = 1024;
size_t bytes = size * sizeof(int);
// Allocate device memory and initialize it to 0
int *d_data;
CUDA_CHECK(cudaMalloc(&d_data, bytes));
CUDA_CHECK(cudaMemset(d_data, 0, bytes));
// Create CUDA graph
cudaGraph_t graph;
CUDA_CHECK(cudaGraphCreate(&graph, 0));
// Define parameters for the parent kernel
void *kernelArgs[] = { &d_data, &size };
// Set kernel node parameters
cudaKernelNodeParams kernelNodeParams = {0};
kernelNodeParams.func = (void*)parent_kernel;
kernelNodeParams.gridDim = dim3(1);
kernelNodeParams.blockDim = dim3(1);
kernelNodeParams.kernelParams = kernelArgs;
kernelNodeParams.extra = nullptr;
cudaStream_t streamForGraph;
CUDA_CHECK(cudaStreamCreate(&streamForGraph));
// Add the kernel node to the graph
cudaGraphNode_t kernelNode;
CUDA_CHECK(cudaGraphAddKernelNode(&kernelNode, graph, nullptr, 0, &kernelNodeParams));
// Set graph instantiation parameters (default parameters are used here)
cudaGraphInstantiateParams instantiateParams = {0};
instantiateParams.uploadStream = streamForGraph;
instantiateParams.flags = cudaGraphInstantiateFlagDeviceLaunch | cudaGraphInstantiateFlagUpload;
// instantiateParams.flags = cudaGraphInstantiateFlagAutoFreeOnLaunch;
// Optional: Set other parameters as required, such as flags
constexpr int width{XSIZE};
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(sizeof(int)*8,
0, 0, 0, cudaChannelFormatKindSigned);
cudaArray *devArray1, *devArray2;
CUDA_CHECK(cudaMalloc3DArray(&devArray1, &channelDesc,
make_cudaExtent(width, 0, 0), cudaArrayDefault));
CUDA_CHECK(cudaMalloc3DArray(&devArray2, &channelDesc,
make_cudaExtent(width, 0, 0), cudaArrayDefault));
cudaMemcpy3DParms myparams;
memset(&myparams, 0x0, sizeof(cudaMemcpy3DParms));
myparams.srcPos = make_cudaPos(0, 0, 0);
myparams.dstPos = make_cudaPos(0, 0, 0);
myparams.srcArray = devArray1;
myparams.dstArray = devArray2;
myparams.extent = make_cudaExtent(width, 1, 1);
myparams.kind = cudaMemcpyDeviceToDevice;
cudaGraphNode_t memcpyNode;
cudaGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams);
// Instantiate the CUDA diagram
cudaGraphExec_t graphExec;
CUDA_CHECK(cudaGraphInstantiateWithParams(&graphExec, graph, &instantiateParams));
// Start and execute CUDA diagram
CUDA_CHECK(cudaGraphLaunch(graphExec, 0));
// Wait for the graph to complete
CUDA_CHECK(cudaDeviceSynchronize());
// Copy the result back to the host and verify
int *h_data = (int*)malloc(bytes);
CUDA_CHECK(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost));
bool success = true;
for(int i = 0; i < size; ++i)
{
if(h_data[i] != 1)
{
success = false;
printf("Data does not match in index %d: %d != 1\n", i, h_data[i]);
break;
}
}
if(success)
{
printf("The CUDA graph is successfully executed and all data has been increased.\n");
}
// Clean up resources
free(h_data);
CUDA_CHECK(cudaFree(d_data));
CUDA_CHECK(cudaGraphExecDestroy(graphExec));
CUDA_CHECK(cudaGraphDestroy(graph));
return 0;
}
test code
nvcc cuda_graph_dynamic_parallelism.cu -rdc=true -o cuda_graph_dynamic_parallelism
./cuda_graph_dynamic_parallelism
CUDA error in cudaGraphInstantiateWithParams(&graphExec, graph, &instantiateParams) (cuda_graph_dynamic_parallelism.cu:110): invalid argument