I need to update MemCpy2D nodes within a child graph of an instantiated cuda graph (cudaGraphExecChildGraphNodeSetParams) and receive the following error:
the graph update was not performed because it included changes which violated constraints specific to instantiated graph update
Updating a top level MemCpy2D node within an exec graph (cudaGraphExecMemcpyNodeSetParams) doesn’t work either.
The exact same code works for MemCpy1D nodes.
The cuda API documentation mentions that Only 1D memsets can be changed
Does this apply to MemCpy operations as well or am I violating one of the other limitations?
The following code test the three afore mentioned variations. The result in all variations should be an array filled with the value 2 (e.g. the node update was successful).
I am working with cuda 11.4
#include <cuda_runtime.h>
#include <iostream>
#define gpuErrChk(ans) \
{ gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
void test1DChildNode(float* src1, float* src2, float* dest, size_t n_frames) {
cudaGraph_t child_graph;
gpuErrChk(cudaGraphCreate(&child_graph, 0));
cudaGraphNode_t copy_node;
cudaMemcpy3DParms copy_param = {
.srcArray = nullptr,
.srcPos = {0, 0, 0},
.srcPtr = make_cudaPitchedPtr(src1, n_frames * sizeof(float), sizeof(float), 1),
.dstArray = nullptr,
.dstPos = {0, 0, 0},
.dstPtr = make_cudaPitchedPtr(dest, n_frames * sizeof(float), sizeof(float), 1),
.extent = {sizeof(float) * n_frames, 1, 1},
.kind = cudaMemcpyDeviceToDevice};
gpuErrChk(cudaGraphAddMemcpyNode(©_node, child_graph, nullptr, 0, ©_param));
gpuErrChk(cudaGraphMemcpyNodeGetParams(copy_node, ©_param));
cudaGraph_t graph;
cudaGraphNode_t child_graph_node;
gpuErrChk(cudaGraphCreate(&graph, 0));
gpuErrChk(cudaGraphAddChildGraphNode(&child_graph_node, graph, nullptr, 0, child_graph));
cudaStream_t stream;
gpuErrChk(cudaStreamCreate(&stream));
cudaGraphExec_t graph_exec;
gpuErrChk(cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
copy_param.srcPtr.ptr = src2;
cudaGraphMemcpyNodeSetParams(copy_node, ©_param);
gpuErrChk(cudaGraphExecChildGraphNodeSetParams(graph_exec, child_graph_node, child_graph));
gpuErrChk(cudaGraphLaunch(graph_exec, stream));
gpuErrChk(cudaStreamSynchronize(stream));
}
void test2DChildNode(float* src1, float* src2, float* dest, size_t n_frames) {
cudaGraph_t child_graph;
gpuErrChk(cudaGraphCreate(&child_graph, 0));
cudaGraphNode_t copy_node;
cudaMemcpy3DParms copy_param = {
.srcArray = nullptr,
.srcPos = {0, 0, 0},
.srcPtr = make_cudaPitchedPtr(src1, sizeof(float), sizeof(float), n_frames),
.dstArray = nullptr,
.dstPos = {0, 0, 0},
.dstPtr = make_cudaPitchedPtr(dest, sizeof(float), sizeof(float), n_frames),
.extent = {sizeof(float), n_frames, 1},
.kind = cudaMemcpyDeviceToDevice};
gpuErrChk(cudaGraphAddMemcpyNode(©_node, child_graph, nullptr, 0, ©_param));
gpuErrChk(cudaGraphMemcpyNodeGetParams(copy_node, ©_param));
cudaGraph_t graph;
cudaGraphNode_t child_graph_node;
gpuErrChk(cudaGraphCreate(&graph, 0));
gpuErrChk(cudaGraphAddChildGraphNode(&child_graph_node, graph, nullptr, 0, child_graph));
cudaStream_t stream;
gpuErrChk(cudaStreamCreate(&stream));
cudaGraphExec_t graph_exec;
gpuErrChk(cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
copy_param.srcPtr.ptr = src2;
cudaGraphMemcpyNodeSetParams(copy_node, ©_param);
gpuErrChk(cudaGraphExecChildGraphNodeSetParams(graph_exec, child_graph_node, child_graph));
gpuErrChk(cudaGraphLaunch(graph_exec, stream));
gpuErrChk(cudaStreamSynchronize(stream));
}
void test2DNode(float* src1, float* src2, float* dest, size_t n_frames) {
cudaGraph_t graph;
gpuErrChk(cudaGraphCreate(&graph, 0));
cudaGraphNode_t copy_node;
cudaMemcpy3DParms copy_param = {
.srcArray = nullptr,
.srcPos = {0, 0, 0},
.srcPtr = make_cudaPitchedPtr(src1, sizeof(float), sizeof(float), n_frames),
.dstArray = nullptr,
.dstPos = {0, 0, 0},
.dstPtr = make_cudaPitchedPtr(dest, sizeof(float), sizeof(float), n_frames),
.extent = {sizeof(float), n_frames, 1},
.kind = cudaMemcpyDeviceToDevice};
gpuErrChk(cudaGraphAddMemcpyNode(©_node, graph, nullptr, 0, ©_param));
gpuErrChk(cudaGraphMemcpyNodeGetParams(copy_node, ©_param));
cudaStream_t stream;
gpuErrChk(cudaStreamCreate(&stream));
cudaGraphExec_t graph_exec;
gpuErrChk(cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
copy_param.srcPtr.ptr = src2;
gpuErrChk(cudaGraphExecMemcpyNodeSetParams(graph_exec, copy_node, ©_param));
gpuErrChk(cudaGraphLaunch(graph_exec, stream));
gpuErrChk(cudaStreamSynchronize(stream));
}
int main() {
size_t n_frames = 8;
float* src_host;
float* dest_host;
float* src1;
float* src2;
float* dest;
src_host = new float[n_frames];
for (size_t i = 0; i < n_frames; i++) {
src_host[i] = 1;
}
gpuErrChk(cudaMalloc(&src1, sizeof(float) * n_frames));
gpuErrChk(cudaMemcpy(src1, src_host, sizeof(float) * n_frames, cudaMemcpyHostToDevice));
for (size_t i = 0; i < n_frames; i++) {
src_host[i] = 2;
}
gpuErrChk(cudaMalloc(&src2, sizeof(float) * n_frames));
gpuErrChk(cudaMemcpy(src2, src_host, sizeof(float) * n_frames, cudaMemcpyHostToDevice));
dest_host = new float[n_frames];
for (size_t j = 0; j < n_frames; j++) {
dest_host[j] = 0;
}
gpuErrChk(cudaMalloc(&dest, sizeof(float) * n_frames));
gpuErrChk(cudaMemset(dest, 0, sizeof(float) * n_frames));
test1DChildNode(src1, src2, dest, n_frames);
gpuErrChk(cudaMemcpy(dest_host, dest, sizeof(float) * n_frames, cudaMemcpyDeviceToHost));
for (size_t j = 0; j < n_frames; j++) {
std::cout << "dest_host[" << j << "] = " << dest_host[j] << std::endl;
}
gpuErrChk(cudaMemset(dest, 0, sizeof(float) * n_frames));
test2DChildNode(src1, src2, dest, n_frames);
gpuErrChk(cudaMemcpy(dest_host, dest, sizeof(float) * n_frames, cudaMemcpyDeviceToHost));
for (size_t j = 0; j < n_frames; j++) {
std::cout << "dest_host[" << j << "] = " << dest_host[j] << std::endl;
}
gpuErrChk(cudaMemset(dest, 0, sizeof(float) * n_frames));
test2DNode(src1, src2, dest, n_frames);
gpuErrChk(cudaMemcpy(dest_host, dest, sizeof(float) * n_frames, cudaMemcpyDeviceToHost));
for (size_t j = 0; j < n_frames; j++) {
std::cout << "dest_host[" << j << "] = " << dest_host[j] << std::endl;
}
}