Unable to create CUDA graph nodes with null placeholder values

For my project, I want to create a graph whose structure is always the same, but the parameters will change for each invocation. For this I create the graph with null values for memcpy pointers and kernel arguments, but get a runtime error “invalid argument”. Is it possible to do it without having to create fake placeholders? Here is an example:

#include <cstdio>
#include <vector>

// Error-checking wrapper for CUDA runtime calls.
// Wrapped in do { } while (0) so that `gpuErrchk(x);` expands to exactly one
// statement: the original brace-only form breaks inside unbraced
// `if (c) gpuErrchk(x); else ...` (stray semicolon / dangling-else hazard).
#define gpuErrchk(ans)                      \
  do {                                      \
    gpuAssert((ans), __FILE__, __LINE__);   \
  } while (0)

// Prints the CUDA error string together with the call site when `code` is not
// cudaSuccess; does nothing on success.
// NOTE(review): `abort` is accepted but never acted on, so execution continues
// past failing calls — which is what lets this repro report every error in one
// run. Honor it (e.g. `if (abort) exit(code);`) if fail-fast is wanted.
__host__ inline void gpuAssert(cudaError_t code, const char* file, int line,
                               bool abort = true) {
  if (code != cudaSuccess) {
    printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
  }
}

// Empty placeholder kernel: its only role is to give the graph's kernel nodes
// a valid __global__ function pointer. It takes no arguments, which is why
// `kernelParams = nullptr` is accepted for its node below.
__global__ void kernel() {}

// Minimal repro: tries to build a CUDA graph whose memcpy/kernel node
// parameters are left null, intending to fill them in later (e.g. via
// cudaGraphExec*NodeSetParams) before each launch. The calls marked "Fails"
// are the point of the question. Note gpuAssert does not abort, so execution
// continues past each failure and every error is reported in one run.
int main() {
  cudaGraph_t graph;
  gpuErrchk(cudaGraphCreate(&graph, 0));
  void* indices;
  gpuErrchk(cudaMalloc(&indices, 10));  // 10 bytes of device scratch

  // NOTE(review): 10 ints = 40 bytes on the host, but only 10 bytes are
  // copied below — presumably intentional for the repro; confirm.
  std::vector<int> indices_host(10);

  // Works: every memcpy parameter is concrete (real pointers, non-zero size).
  cudaGraphNode_t copy_indices_node;
  gpuErrchk(cudaGraphAddMemcpyNode1D(&copy_indices_node, graph, nullptr, 0,
                                     indices, indices_host.data(), 10,
                                     cudaMemcpyHostToDevice));

  // Fails ("invalid argument"): null src/dst and size 0 are rejected when the
  // node is ADDED — memcpy parameters are validated at node-creation time,
  // not deferred to instantiation or launch.
  cudaGraphNode_t copy_indices_node2;
  gpuErrchk(cudaGraphAddMemcpyNode1D(&copy_indices_node2, graph, nullptr, 0,
                                     nullptr, nullptr, 0,
                                     cudaMemcpyHostToDevice));

  // Works: params struct is fully populated; kernelParams may be nullptr here
  // only because `kernel` takes no arguments.
  cudaGraphNode_t kernel_node;
  auto placeholder_params = cudaKernelNodeParams{.func = (void*)kernel,
                                                 .gridDim = dim3(1, 1, 1),
                                                 .blockDim = dim3(1, 1, 1),
                                                 .sharedMemBytes = 0,
                                                 .kernelParams = nullptr,
                                                 .extra = nullptr};

  gpuErrchk(cudaGraphAddKernelNode(&kernel_node, graph, nullptr, 0,
                                   &placeholder_params));

  // Fails ("invalid argument"): a null cudaKernelNodeParams* is rejected at
  // node-creation time for the same reason as the memcpy case above.
  cudaGraphNode_t kernel_node2;
  gpuErrchk(cudaGraphAddKernelNode(&kernel_node2, graph, nullptr, 0, nullptr));

  // NOTE(review): CUDA 12 changed cudaGraphInstantiate's signature (flags-only
  // form); verify this 5-argument call matches the 12.5 headers being used.
  cudaGraphExec_t graphExec;
  gpuErrchk(cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));

  // NOTE(review): graph and graphExec are never destroyed
  // (cudaGraphDestroy / cudaGraphExecDestroy) — harmless in a one-shot repro,
  // but a leak in real code.
  gpuErrchk(cudaFree(indices));
  return 0;
}

Compiled with: nvcc -o example -arch=sm_61 test.cu, with CUDA 12.5. Output:

GPUassert: invalid argument test.cu 31
GPUassert: invalid argument test.cu 49

And another quick question since I’m here: the documentation for cudaGraphInstantiate states:

Memcpy nodes:

  • Only copies involving device memory and/or pinned device-mapped host memory are permitted.[…]

Why is this the case? Doesn't the graph use cudaMemcpy/cudaMemcpyAsync internally, which both support pageable host memory?

And another one (sorry, but I really can't find any info on this): how can I debug a CUDA graph? I have found that the memcpy before a kernel launch does not copy the data correctly, but I do not know how to verify that the copy has actually completed before the kernel launch runs.

Nsight Systems will show you the timeline relationship between the copy and launch.

Thanks for the reply! I have now solved that issue, but on Nsight Systems the whole execution shows up as 1 graph. Is it possible to view the breakdown of what is executing when?

Sorry, graphs are an area I have yet to explore, but looking at this, Nsight Systems seems to be doing that under a graph scenario, unless I misunderstand what you're after.