Why the kernel nodes in the device graph cannot use dynamic parallelismc

I want to understand the specific reason why the kernel nodes in the device graph cannot use dynamic parallelism

// cuda_graph_dynamic_parallelism.cu

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

#define XSIZE 32


// CUDA error checking macro
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err = call;                                            \
        if (err != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error in %s (%s:%d): %s\n",              \
                    #call, __FILE__, __LINE__, cudaGetErrorString(err));   \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Subkernel for dynamic parallel execution
__global__ void child_kernel(int *data, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size)
    {
        data[idx] += 1; // Simple plus one operation
    }
}

// The parent kernel dynamically starts the child kernel during execution
__global__ void parent_kernel(int *data, int size)
{
    // Define startup parameters for the subkernel
    int child_threads = 256; 
    int child_blocks = (size + child_threads - 1) / child_threads;

    // Promoter kernel
    child_kernel <<<child_blocks, child_threads>>>(data, size);
}

int main()
{
    // Initializes the data size
    int size = 1024;
    size_t bytes = size * sizeof(int);

    // Allocate device memory and initialize it to 0
    int *d_data;
    CUDA_CHECK(cudaMalloc(&d_data, bytes));
    CUDA_CHECK(cudaMemset(d_data, 0, bytes));

    // Create CUDA graph
    cudaGraph_t graph;
    CUDA_CHECK(cudaGraphCreate(&graph, 0));

    // Define parameters for the parent kernel
    void *kernelArgs[] = { &d_data, &size };

    // Set kernel node parameters
    cudaKernelNodeParams kernelNodeParams = {0};
    kernelNodeParams.func = (void*)parent_kernel;
    kernelNodeParams.gridDim = dim3(1);
    kernelNodeParams.blockDim = dim3(1);
    kernelNodeParams.kernelParams = kernelArgs;
    kernelNodeParams.extra = nullptr;

    cudaStream_t streamForGraph;
    CUDA_CHECK(cudaStreamCreate(&streamForGraph));


    // Add the kernel node to the graph
    cudaGraphNode_t kernelNode;
    CUDA_CHECK(cudaGraphAddKernelNode(&kernelNode, graph, nullptr, 0, &kernelNodeParams));

    // Set graph instantiation parameters (default parameters are used here)
    cudaGraphInstantiateParams instantiateParams = {0};
    instantiateParams.uploadStream = streamForGraph;
    instantiateParams.flags = cudaGraphInstantiateFlagDeviceLaunch | cudaGraphInstantiateFlagUpload;
    // instantiateParams.flags = cudaGraphInstantiateFlagAutoFreeOnLaunch;
    // Optional: Set other parameters as required, such as flags

    constexpr int width{XSIZE};

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(sizeof(int)*8,
                                                        0, 0, 0, cudaChannelFormatKindSigned);
    cudaArray *devArray1, *devArray2;
    CUDA_CHECK(cudaMalloc3DArray(&devArray1, &channelDesc,
                       make_cudaExtent(width, 0, 0), cudaArrayDefault));
    CUDA_CHECK(cudaMalloc3DArray(&devArray2, &channelDesc,
                       make_cudaExtent(width, 0, 0), cudaArrayDefault));

    cudaMemcpy3DParms myparams;
    memset(&myparams, 0x0, sizeof(cudaMemcpy3DParms));
    myparams.srcPos = make_cudaPos(0, 0, 0);
    myparams.dstPos = make_cudaPos(0, 0, 0);
    myparams.srcArray = devArray1;
    myparams.dstArray = devArray2;
    myparams.extent = make_cudaExtent(width, 1, 1);
    myparams.kind = cudaMemcpyDeviceToDevice;
    cudaGraphNode_t memcpyNode;
    cudaGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams);

    // Instantiate the CUDA diagram
    cudaGraphExec_t graphExec;
    CUDA_CHECK(cudaGraphInstantiateWithParams(&graphExec, graph, &instantiateParams));

    // Start and execute CUDA diagram
    CUDA_CHECK(cudaGraphLaunch(graphExec, 0));

    // Wait for the graph to complete
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy the result back to the host and verify
    int *h_data = (int*)malloc(bytes);
    CUDA_CHECK(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost));

    bool success = true;
    for(int i = 0; i < size; ++i)
    {
        if(h_data[i] != 1)
        {
            success = false;
            printf("Data does not match in index %d: %d != 1\n", i, h_data[i]);
            break;
        }
    }

    if(success)
    {
        printf("The CUDA graph is successfully executed and all data has been increased.\n");
    }

    // Clean up resources
    free(h_data);
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaGraphExecDestroy(graphExec));
    CUDA_CHECK(cudaGraphDestroy(graph));

    return 0;
}

test code
nvcc cuda_graph_dynamic_parallelism.cu -rdc=true -o cuda_graph_dynamic_parallelism
./cuda_graph_dynamic_parallelism
CUDA error in cudaGraphInstantiateWithParams(&graphExec, graph, &instantiateParams) (cuda_graph_dynamic_parallelism.cu:110): invalid argument