Hi, I was trying out the CUDA graph API and I ran into this weird behavior with the following code:
#include <cuda_runtime.h>
#include <stdio.h>
#include <array>
#include <iostream>
#include <vector>
#include "helper_cuda.h"
// Prints each element of `data` and doubles it in place.
// Uses a grid-stride loop, so any <<<grid, block>>> configuration covers
// all `size` elements; printf is for debugging only (serialized, slow).
__global__ void debugPrint(float* data, int size) {
  int index = threadIdx.x + blockDim.x * blockIdx.x;
  int stride = blockDim.x * gridDim.x;
  // The loop condition already guarantees idx < size, so the original
  // redundant `if (idx < size)` guard inside the body has been removed.
  for (int idx = index; idx < size; idx += stride) {
    printf("Device: data[%d] = %f\n", idx, data[idx]);
    data[idx] *= 2;
  }
}
// Builds and launches a CUDA graph with the following topology:
//   host_set (fill data_in on host)  \
//   alloc (graph-owned device buffer) -> copy_to_device
//   copy_to_device -> exec_kernel -> copy_to_host -> free_node
int main(int argc, char* argv[]) {
  constexpr int data_size = 100;
  std::array<float, data_size> data_in;
  std::array<float, data_size> data_out;
  for (size_t idx = 0; idx < data_size; idx++) {
    data_in[idx] = 3;    // initial value; the host node should overwrite it
    data_out[idx] = -1;  // sentinel so un-copied elements are visible
  }

  cudaGraph_t graph;
  cudaGraphExec_t graph_exec;
  cudaStream_t stream;
  // Renamed `free` -> `free_node` so the local does not shadow ::free.
  cudaGraphNode_t alloc, host_set, copy_to_device, copy_to_host, exec_kernel,
      free_node;

  // Parameters for the graph-owned device allocation (mem-alloc node).
  cudaMemAllocNodeParams alloc_pars{};
  memset(&alloc_pars, 0, sizeof(alloc_pars));
  alloc_pars.poolProps.allocType = cudaMemAllocationTypePinned;
  alloc_pars.poolProps.location.id = 0;  // device 0
  alloc_pars.poolProps.location.type = cudaMemLocationTypeDevice;
  alloc_pars.bytesize = sizeof(float) * data_in.size();

  // Host-node callback: rewrites data_in to 42 before the H2D copy runs.
  auto set_value = [](void* data) {
    float* par = static_cast<float*>(data);
    for (int i = 0; i < data_size; i++) {
      par[i] = 42;
      std::cout << "Host: data[" << i << "] = " << par[i] << std::endl;
    }
  };
  cudaHostNodeParams host_pars{set_value, data_in.data()};

  checkCudaErrors(cudaGraphCreate(&graph, 0));
  checkCudaErrors(
      cudaGraphAddHostNode(&host_set, graph, nullptr, 0, &host_pars));
  // cudaGraphAddMemAllocNode populates alloc_pars.dptr, so the device
  // pointer is valid for the node parameters built below.
  checkCudaErrors(
      cudaGraphAddMemAllocNode(&alloc, graph, nullptr, 0, &alloc_pars));

  size_t array_size = data_in.size();
  // The kernel's second parameter is `int`. Passing &array_size (a size_t)
  // in kernelParams would make the launch read only the low 4 of its 8
  // bytes — endian-dependent. Pass a real int instead.
  int kernel_size = static_cast<int>(array_size);
  cudaKernelNodeParams kernel_pars = {0};
  kernel_pars.func = (void*)debugPrint;
  kernel_pars.gridDim = dim3(15, 1, 1);
  kernel_pars.blockDim = dim3(1, 1, 1);
  kernel_pars.extra = NULL;
  kernel_pars.sharedMemBytes = 0;
  void* parameters[2] = {(void*)&alloc_pars.dptr, &kernel_size};
  kernel_pars.kernelParams = parameters;

  // The H2D copy must wait for BOTH the allocation and the host fill.
  std::vector<cudaGraphNode_t> copy_to_device_dep;
  copy_to_device_dep.push_back(alloc);
  copy_to_device_dep.push_back(host_set);

  // H2D copy: data_in -> device buffer. 1-D copy expressed via 3-D params;
  // extent width is in bytes for non-array memory.
  cudaMemcpy3DParms to_device_pars = {0};
  to_device_pars.dstPos = make_cudaPos(0, 0, 0);
  to_device_pars.dstPtr = make_cudaPitchedPtr(
      alloc_pars.dptr, array_size * sizeof(float), array_size, 1);
  to_device_pars.extent = make_cudaExtent(sizeof(float) * array_size, 1, 1);
  to_device_pars.kind = cudaMemcpyHostToDevice;
  to_device_pars.srcPos = make_cudaPos(0, 0, 0);
  // NOTE(review): data_in is pageable stack memory. If the copy still
  // observes pre-host-node values despite the dependency edge, try a
  // pinned source buffer (cudaMallocHost) — confirm against the
  // cudaGraphAddMemcpyNode documentation's host-memory requirements.
  to_device_pars.srcPtr = make_cudaPitchedPtr(
      data_in.data(), array_size * sizeof(float), array_size, 1);

  // D2H copy: device buffer -> data_out.
  cudaMemcpy3DParms to_host_pars = {0};
  to_host_pars.dstPos = make_cudaPos(0, 0, 0);
  to_host_pars.dstPtr = make_cudaPitchedPtr(
      data_out.data(), array_size * sizeof(float), array_size, 1);
  to_host_pars.extent = make_cudaExtent(sizeof(float) * array_size, 1, 1);
  to_host_pars.kind = cudaMemcpyDeviceToHost;
  to_host_pars.srcPos = make_cudaPos(0, 0, 0);
  to_host_pars.srcPtr = make_cudaPitchedPtr(
      alloc_pars.dptr, array_size * sizeof(float), array_size, 1);

  // The posted code had `&copy_...` HTML-mangled into `©_...` (the
  // "&copy;" entity); restored here so the program compiles.
  checkCudaErrors(
      cudaGraphAddMemcpyNode(&copy_to_device, graph, copy_to_device_dep.data(),
                             copy_to_device_dep.size(), &to_device_pars));
  checkCudaErrors(cudaGraphAddKernelNode(&exec_kernel, graph, &copy_to_device,
                                         1, &kernel_pars));
  checkCudaErrors(cudaGraphAddMemcpyNode(&copy_to_host, graph, &exec_kernel, 1,
                                         &to_host_pars));
  checkCudaErrors(cudaGraphAddMemFreeNode(&free_node, graph, &copy_to_host, 1,
                                          alloc_pars.dptr));

  checkCudaErrors(cudaStreamCreate(&stream));
  checkCudaErrors(cudaGraphInstantiate(&graph_exec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphLaunch(graph_exec, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  checkCudaErrors(cudaGraphExecDestroy(graph_exec));
  checkCudaErrors(cudaGraphDestroy(graph));    // was leaked in the original
  checkCudaErrors(cudaStreamDestroy(stream));  // was leaked in the original

  std::cout << "Final result" << std::endl;
  for (size_t idx = 0; idx < data_out.size(); idx++) {
    std::cout << "data_in[" << idx << "] = " << data_in[idx] << " data_out["
              << idx << "] = " << data_out[idx] << std::endl;
  }
  return 0;
}
this is the output
❯ ./main
Host: data[0] = 42
Host: data[1] = 42
Host: data[2] = 42
Host: data[3] = 42
Host: data[4] = 42
Device: data[1] = 3.000000
Device: data[3] = 3.000000
Device: data[0] = 3.000000
Device: data[4] = 3.000000
Device: data[2] = 3.000000
Final result
data_in[0] = 42 data_out[0] = 6
data_in[1] = 42 data_out[1] = 6
data_in[2] = 42 data_out[2] = 6
data_in[3] = 42 data_out[3] = 6
data_in[4] = 42 data_out[4] = 6
I am expecting to see 84 in data_out, since the host_set node should change the values to 42 before the copy. Instead, the data copied to the device seems to be from the first initialization, i.e., from before the host-node function executed.
I tried comparing the pointers, and they all seem consistent.
Also compute-sanitizer reports no issue.
Am I doing something wrong?

