For my project I want to create a CUDA graph whose structure is always the same, but whose parameters change on every invocation. To do this I create the graph with null values for the memcpy pointers and kernel arguments, but I get a runtime error "invalid argument". Is it possible to do this without having to create fake placeholder values? Here is an example:
#include <cstdio>
#include <vector>

#define gpuErrchk(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }

__host__ inline void gpuAssert(cudaError_t code, const char* file, int line,
                               bool abort = true) {
  if (code != cudaSuccess) {
    printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
  }
}

__global__ void kernel() {}

int main() {
  cudaGraph_t graph;
  gpuErrchk(cudaGraphCreate(&graph, 0));

  void* indices;
  gpuErrchk(cudaMalloc(&indices, 10));
  std::vector<int> indices_host(10);

  // Works
  cudaGraphNode_t copy_indices_node;
  gpuErrchk(cudaGraphAddMemcpyNode1D(&copy_indices_node, graph, nullptr, 0,
                                     indices, indices_host.data(), 10,
                                     cudaMemcpyHostToDevice));

  // Fails
  cudaGraphNode_t copy_indices_node2;
  gpuErrchk(cudaGraphAddMemcpyNode1D(&copy_indices_node2, graph, nullptr, 0,
                                     nullptr, nullptr, 0,
                                     cudaMemcpyHostToDevice));

  // Works
  cudaGraphNode_t kernel_node;
  auto placeholder_params = cudaKernelNodeParams{.func = (void*)kernel,
                                                 .gridDim = dim3(1, 1, 1),
                                                 .blockDim = dim3(1, 1, 1),
                                                 .sharedMemBytes = 0,
                                                 .kernelParams = nullptr,
                                                 .extra = nullptr};
  gpuErrchk(cudaGraphAddKernelNode(&kernel_node, graph, nullptr, 0,
                                   &placeholder_params));

  // Fails
  cudaGraphNode_t kernel_node2;
  gpuErrchk(cudaGraphAddKernelNode(&kernel_node2, graph, nullptr, 0, nullptr));

  cudaGraphExec_t graphExec;
  gpuErrchk(cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));

  gpuErrchk(cudaFree(indices));
  return 0;
}
Compiled with nvcc -o example -arch=sm_61 test.cu using CUDA 12.5. Output:
GPUassert: invalid argument test.cu 31
GPUassert: invalid argument test.cu 49
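For context, the workaround I would like to avoid looks roughly like the sketch below: instantiate the graph once with valid placeholder parameters, then patch the nodes before every launch with cudaGraphExecMemcpyNodeSetParams1D and cudaGraphExecKernelNodeSetParams. This is only my understanding of those APIs (the helper function and its parameters are my own invention), so I may well be using them wrong:

// Sketch of the placeholder-then-update pattern I'm trying to avoid.
// Reuses gpuErrchk and kernel() from the example above; the exec handle,
// node handles and pointers are assumed to come from a successful
// instantiation with placeholder parameters.
void launch_with_new_params(cudaGraphExec_t exec, cudaGraphNode_t copy_node,
                            cudaGraphNode_t k_node, void* d_indices,
                            const int* h_indices, size_t bytes,
                            cudaStream_t stream) {
  // Patch the memcpy node of the instantiated graph with the real pointers.
  gpuErrchk(cudaGraphExecMemcpyNodeSetParams1D(exec, copy_node, d_indices,
                                               h_indices, bytes,
                                               cudaMemcpyHostToDevice));
  // Patch the kernel node with the real launch configuration.
  cudaKernelNodeParams params{.func = (void*)kernel,
                              .gridDim = dim3(1, 1, 1),
                              .blockDim = dim3(1, 1, 1),
                              .sharedMemBytes = 0,
                              .kernelParams = nullptr,
                              .extra = nullptr};
  gpuErrchk(cudaGraphExecKernelNodeSetParams(exec, k_node, &params));
  // Launch the already-instantiated graph with the updated parameters.
  gpuErrchk(cudaGraphLaunch(exec, stream));
}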
And another quick question since I’m here: the documentation for cudaGraphInstantiate
states:
Memcpy nodes:
- Only copies involving device memory and/or pinned device-mapped host memory are permitted.[…]
Why is this the case? Doesn't the graph use cudaMemcpy/cudaMemcpyAsync internally?
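In my example above the host buffer is a plain std::vector, i.e. pageable memory. My assumption (not something I found spelled out in the docs) is that the quoted restriction means the host side has to be page-locked and mapped, roughly like this:

// My reading of "pinned device-mapped host memory" (an assumption on my
// part): page-lock and map the existing allocation before building the graph.
std::vector<int> indices_host(10);
gpuErrchk(cudaHostRegister(indices_host.data(),
                           indices_host.size() * sizeof(int),
                           cudaHostRegisterMapped));
// ... add the memcpy node and launch the graph ...
gpuErrchk(cudaHostUnregister(indices_host.data()));

Is that the right interpretation, or does the restriction mean something else?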
And another one (sorry, but I really can't find any info on this): how can I debug a CUDA graph? I have found that the memcpy before a kernel launch does not copy the data correctly, but I do not know how to make sure that the copy has completed before the kernel launch.
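In case it matters, here is what I assumed would enforce that ordering: passing the memcpy node in pDependencies when adding the kernel node. Again only a sketch based on my reading of the API, reusing the nodes from the example above:

// Sketch: make the kernel node depend on the memcpy node so that, within the
// graph, the copy must complete before the kernel starts. (My assumption of
// how pDependencies is meant to be used; copy_indices_node, graph and
// placeholder_params are from the example above.)
cudaGraphNode_t deps[] = {copy_indices_node};
gpuErrchk(cudaGraphAddKernelNode(&kernel_node, graph, deps,
                                 1 /* numDependencies */,
                                 &placeholder_params));

Is that enough, or is something else needed to guarantee the ordering?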