Setting host memory via a host node before a memcpy node to the device is not reflected in the device kernel execution

Hi, I was trying out the graph API and I'm seeing some odd behavior with the following code:

#include <cuda_runtime.h>
#include <stdio.h>

#include <array>
#include <iostream>
#include <vector>

#include "helper_cuda.h"

__global__ void debugPrint(float* data, int size) {
  int index = threadIdx.x + blockDim.x * blockIdx.x;
  int jump = blockDim.x * gridDim.x;
  for (int idx = index; idx < size; idx += jump) {
    if (idx < size) {
      printf("Device: data[%d] = %f\n", idx, data[idx]);
      data[idx] *= 2;
    }
  }
}

int main(int argc, char* argv[]) {
  constexpr int data_size = 100;
  std::array<float, data_size> data_in;
  std::array<float, data_size> data_out;

  for (size_t idx = 0; idx < data_size; idx++) {
    data_in[idx] = 3;
    data_out[idx] = -1;
  }

  cudaGraph_t graph;
  cudaGraphExec_t graph_exec;
  cudaStream_t stream;
  cudaGraphNode_t alloc, host_set, copy_to_device, copy_to_host, exec_kernel,
      free;
  cudaMemAllocNodeParams alloc_pars{};
  memset(&alloc_pars, 0, sizeof(alloc_pars));
  alloc_pars.poolProps.allocType = cudaMemAllocationTypePinned;
  alloc_pars.poolProps.location.id = 0;
  alloc_pars.poolProps.location.type = cudaMemLocationTypeDevice;
  alloc_pars.bytesize = sizeof(float) * data_in.size();
  auto set_value = [](void* data) {
    float* par = static_cast<float*>(data);
    for (int i = 0; i < data_size; i++) {
      par[i] = 42;
      std::cout << "Host: data[" << i << "] = " << par[i] << std::endl;
    }
  };

  cudaHostNodeParams host_pars{set_value, data_in.data()};

  checkCudaErrors(cudaGraphCreate(&graph, 0));
  checkCudaErrors(
      cudaGraphAddHostNode(&host_set, graph, nullptr, 0, &host_pars));

  checkCudaErrors(
      cudaGraphAddMemAllocNode(&alloc, graph, nullptr, 0, &alloc_pars));

  size_t array_size = data_in.size();
  cudaKernelNodeParams kernel_pars = {0};
  kernel_pars.func = (void*)debugPrint;
  kernel_pars.gridDim = dim3(15, 1, 1);
  kernel_pars.blockDim = dim3(1, 1, 1);
  kernel_pars.extra = NULL;
  kernel_pars.sharedMemBytes = 0;
  void* parameters[2] = {(void*)&alloc_pars.dptr, &array_size};
  kernel_pars.kernelParams = parameters;

  std::vector<cudaGraphNode_t> copy_to_device_dep;
  copy_to_device_dep.push_back(alloc);
  copy_to_device_dep.push_back(host_set);

  cudaMemcpy3DParms to_device_pars = {0};
  to_device_pars.dstPos = make_cudaPos(0, 0, 0);
  to_device_pars.dstPtr = make_cudaPitchedPtr(
      alloc_pars.dptr, array_size * sizeof(float), array_size, 1);
  to_device_pars.extent = make_cudaExtent(sizeof(float) * array_size, 1, 1);
  to_device_pars.kind = cudaMemcpyHostToDevice;
  to_device_pars.srcPos = make_cudaPos(0, 0, 0);
  to_device_pars.srcPtr = make_cudaPitchedPtr(
      data_in.data(), array_size * sizeof(float), array_size, 1);

  cudaMemcpy3DParms to_host_pars = {0};
  to_host_pars.dstPos = make_cudaPos(0, 0, 0);
  to_host_pars.dstPtr = make_cudaPitchedPtr(
      data_out.data(), array_size * sizeof(float), array_size, 1);
  to_host_pars.extent = make_cudaExtent(sizeof(float) * array_size, 1, 1);
  to_host_pars.kind = cudaMemcpyDeviceToHost;
  to_host_pars.srcPos = make_cudaPos(0, 0, 0);
  to_host_pars.srcPtr = make_cudaPitchedPtr(
      alloc_pars.dptr, array_size * sizeof(float), array_size, 1);

  checkCudaErrors(
      cudaGraphAddMemcpyNode(&copy_to_device, graph, copy_to_device_dep.data(),
                             copy_to_device_dep.size(), &to_device_pars));
  checkCudaErrors(cudaGraphAddKernelNode(&exec_kernel, graph, &copy_to_device,
                                         1, &kernel_pars));

  checkCudaErrors(cudaGraphAddMemcpyNode(&copy_to_host, graph, &exec_kernel, 1,
                                         &to_host_pars));
  checkCudaErrors(
      cudaGraphAddMemFreeNode(&free, graph, &copy_to_host, 1, alloc_pars.dptr));

  checkCudaErrors(cudaStreamCreate(&stream));
  checkCudaErrors(cudaGraphInstantiate(&graph_exec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphLaunch(graph_exec, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  checkCudaErrors(cudaGraphExecDestroy(graph_exec));
  std::cout << "Final result" << std::endl;
  for (size_t idx = 0; idx < data_out.size(); idx++) {
    std::cout << "data_in[" << idx << "] = " << data_in[idx] << "  data_out["
              << idx << "] = " << data_out[idx] << std::endl;
  }
}

This is the output:

❯ ./main
Host: data[0] = 42
Host: data[1] = 42
Host: data[2] = 42
Host: data[3] = 42
Host: data[4] = 42
Device: data[1] = 3.000000
Device: data[3] = 3.000000
Device: data[0] = 3.000000
Device: data[4] = 3.000000
Device: data[2] = 3.000000
Final result
data_in[0] = 42  data_out[0] = 6
data_in[1] = 42  data_out[1] = 6
data_in[2] = 42  data_out[2] = 6
data_in[3] = 42  data_out[3] = 6
data_in[4] = 42  data_out[4] = 6

I am expecting to see 84 in data_out, since the host_set node should change the values to 42 before the copy to the device. Instead, the data copied to the device appears to come from the first initialization, i.e. from before the host node's function was executed.
I tried comparing the pointers, but everything seems consistent.
Also, compute-sanitizer reports no issues.
Am I doing something wrong?

The topology looks correct. Have you tried creating the graph with stream capture?

Good idea! Unfortunately I still get the same result, though…
Here is the revised code:

#include <cuda_runtime.h>
#include <stdio.h>

#include <array>
#include <iostream>
#include <vector>

#include "helper_cuda.h"

__global__ void debugPrint(float* data, int size) {
  int index = threadIdx.x + blockDim.x * blockIdx.x;
  int jump = blockDim.x * gridDim.x;
  for (int idx = index; idx < size; idx += jump) {
    if (idx < size) {
      printf("Device: data[%d] = %f\n", idx, data[idx]);
      data[idx] *= 2;
    }
  }
}

void init_input(float* data, size_t size, float value) {
  for (size_t i = 0; i < size; i++) {
    data[i] = value;
  }
}

void print_array(float* data, size_t size, const std::string& name) {
  for (int i = 0; i < size; i++) {
    std::cout << name << "[" << i << "] = " << data[i] << std::endl;
  }
}

int main(int argc, char* argv[]) {
  constexpr int data_size = 5;
  std::array<float, data_size> data_in;
  std::array<float, data_size> data_out;
  dim3 gridDim{15, 1, 1};
  dim3 blockDim{1, 1, 1};

  init_input(data_in.data(), data_size, 3);
  init_input(data_out.data(), data_size, 1);
  std::cout << "Input data" << std::endl;
  print_array(data_in.data(), data_in.size(), "data_in");
  print_array(data_out.data(), data_out.size(), "data_out");
  cudaGraph_t graph;
  cudaGraphExec_t graph_exec;
  cudaStream_t stream;
  cudaGraphNode_t alloc, host_set, copy_to_device, copy_to_host, exec_kernel,
      free;
  cudaMemAllocNodeParams alloc_pars{};
  memset(&alloc_pars, 0, sizeof(alloc_pars));
  alloc_pars.poolProps.allocType = cudaMemAllocationTypePinned;
  alloc_pars.poolProps.location.id = 0;
  alloc_pars.poolProps.location.type = cudaMemLocationTypeDevice;
  alloc_pars.bytesize = sizeof(float) * data_in.size();
  auto set_value = [](void* data) {
    float* par = static_cast<float*>(data);
    for (int i = 0; i < data_size; i++) {
      par[i] = 42;
      std::cout << "Host: data[" << i << "] = " << par[i] << std::endl;
    }
  };

  cudaHostNodeParams host_pars{set_value, data_in.data()};

  checkCudaErrors(cudaGraphCreate(&graph, 0));
  checkCudaErrors(
      cudaGraphAddHostNode(&host_set, graph, nullptr, 0, &host_pars));

  checkCudaErrors(
      cudaGraphAddMemAllocNode(&alloc, graph, nullptr, 0, &alloc_pars));

  size_t array_size = data_in.size();
  cudaKernelNodeParams kernel_pars = {0};
  kernel_pars.func = (void*)debugPrint;
  kernel_pars.gridDim = dim3(15, 1, 1);
  kernel_pars.blockDim = dim3(1, 1, 1);
  kernel_pars.extra = NULL;
  kernel_pars.sharedMemBytes = 0;
  void* parameters[2] = {(void*)&alloc_pars.dptr, &array_size};
  kernel_pars.kernelParams = parameters;

  std::vector<cudaGraphNode_t> copy_to_device_dep;
  copy_to_device_dep.push_back(alloc);
  copy_to_device_dep.push_back(host_set);

  cudaMemcpy3DParms to_device_pars = {0};
  to_device_pars.dstPos = make_cudaPos(0, 0, 0);
  to_device_pars.dstPtr = make_cudaPitchedPtr(
      alloc_pars.dptr, array_size * sizeof(float), array_size, 1);
  to_device_pars.extent = make_cudaExtent(sizeof(float) * array_size, 1, 1);
  to_device_pars.kind = cudaMemcpyHostToDevice;
  to_device_pars.srcPos = make_cudaPos(0, 0, 0);
  to_device_pars.srcPtr = make_cudaPitchedPtr(
      data_in.data(), array_size * sizeof(float), array_size, 1);

  cudaMemcpy3DParms to_host_pars = {0};
  to_host_pars.dstPos = make_cudaPos(0, 0, 0);
  to_host_pars.dstPtr = make_cudaPitchedPtr(
      data_out.data(), array_size * sizeof(float), array_size, 1);
  to_host_pars.extent = make_cudaExtent(sizeof(float) * array_size, 1, 1);
  to_host_pars.kind = cudaMemcpyDeviceToHost;
  to_host_pars.srcPos = make_cudaPos(0, 0, 0);
  to_host_pars.srcPtr = make_cudaPitchedPtr(
      alloc_pars.dptr, array_size * sizeof(float), array_size, 1);

  checkCudaErrors(
      cudaGraphAddMemcpyNode(&copy_to_device, graph, copy_to_device_dep.data(),
                             copy_to_device_dep.size(), &to_device_pars));
  checkCudaErrors(cudaGraphAddKernelNode(&exec_kernel, graph, &copy_to_device,
                                         1, &kernel_pars));

  checkCudaErrors(cudaGraphAddMemcpyNode(&copy_to_host, graph, &exec_kernel, 1,
                                         &to_host_pars));
  checkCudaErrors(
      cudaGraphAddMemFreeNode(&free, graph, &copy_to_host, 1, alloc_pars.dptr));

  checkCudaErrors(cudaStreamCreate(&stream));
  checkCudaErrors(cudaGraphInstantiate(&graph_exec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphLaunch(graph_exec, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  checkCudaErrors(cudaGraphExecDestroy(graph_exec));
  cudaGraphDebugDotPrint(graph, "graph.dot",
                         cudaGraphDebugDotFlags::cudaGraphDebugDotFlagsVerbose);
  std::cout << "Final result" << std::endl;
  for (size_t idx = 0; idx < data_out.size(); idx++) {
    std::cout << "data_in[" << idx << "] = " << data_in[idx] << "  data_out["
              << idx << "] = " << data_out[idx] << std::endl;
  }

  /*
    Using stream capture to create the graph as suggested in nvidia forum
  */
  init_input(data_in.data(), data_in.size(), 3);
  init_input(data_out.data(), data_out.size(), 1);

  cudaGraph_t graph_cptr;
  cudaGraphExec_t graph_exec_cptr;
  std::cout << "Now using stream capture to create the graph" << std::endl;

  std::cout << "Input data" << std::endl;
  print_array(data_in.data(), data_in.size(), "data_in");
  print_array(data_out.data(), data_out.size(), "data_out");
  void* array_d;
  checkCudaErrors(cudaStreamBeginCapture(
      stream, cudaStreamCaptureMode::cudaStreamCaptureModeRelaxed));

  checkCudaErrors(
      cudaMallocAsync(&array_d, sizeof(float) * data_in.size(), stream));

  cudaHostFn_t host_func = set_value;
  checkCudaErrors(cudaLaunchHostFunc(stream, host_func, data_in.data()));

  cudaMemcpy3DParms to_device_pars_cptr = {0};
  to_device_pars.dstPos = make_cudaPos(0, 0, 0);
  to_device_pars.dstPtr =
      make_cudaPitchedPtr(array_d, array_size * sizeof(float), array_size, 1);
  to_device_pars.extent = make_cudaExtent(sizeof(float) * array_size, 1, 1);
  to_device_pars.kind = cudaMemcpyHostToDevice;
  to_device_pars.srcPos = make_cudaPos(0, 0, 0);
  to_device_pars.srcPtr = make_cudaPitchedPtr(
      data_in.data(), array_size * sizeof(float), array_size, 1);

  checkCudaErrors(cudaMemcpy3DAsync(&to_device_pars_cptr, stream));

  debugPrint<<<gridDim, blockDim, 0, stream>>>((float*)array_d, array_size);
  cudaMemcpy3DParms to_host_pars_cptr = {0};
  to_host_pars.dstPos = make_cudaPos(0, 0, 0);
  to_host_pars.dstPtr = make_cudaPitchedPtr(
      data_out.data(), array_size * sizeof(float), array_size, 1);
  to_host_pars.extent = make_cudaExtent(sizeof(float) * array_size, 1, 1);
  to_host_pars.kind = cudaMemcpyDeviceToHost;
  to_host_pars.srcPos = make_cudaPos(0, 0, 0);
  to_host_pars.srcPtr =
      make_cudaPitchedPtr(array_d, array_size * sizeof(float), array_size, 1);

  checkCudaErrors(cudaMemcpy3DAsync(&to_host_pars_cptr, stream));
  checkCudaErrors(cudaFreeAsync(array_d, stream));

  checkCudaErrors(cudaStreamEndCapture(stream, &graph_cptr));

  cudaGraphDebugDotPrint(graph_cptr, "graph_cptr.dot",
                         cudaGraphDebugDotFlags::cudaGraphDebugDotFlagsVerbose);
  checkCudaErrors(cudaGraphInstantiate(&graph_exec_cptr, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphLaunch(graph_exec_cptr, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  std::cout << "Final result" << std::endl;
  for (size_t idx = 0; idx < data_out.size(); idx++) {
    std::cout << "data_in[" << idx << "] = " << data_in[idx] << "  data_out["
              << idx << "] = " << data_out[idx] << std::endl;
  }
}


❯ ./main
Input data
data_in[0] = 3
data_in[1] = 3
data_in[2] = 3
data_in[3] = 3
data_in[4] = 3
data_out[0] = 1
data_out[1] = 1
data_out[2] = 1
data_out[3] = 1
data_out[4] = 1
Host: data[0] = 42
Host: data[1] = 42
Host: data[2] = 42
Host: data[3] = 42
Host: data[4] = 42
Device: data[1] = 3.000000
Device: data[3] = 3.000000
Device: data[0] = 3.000000
Device: data[4] = 3.000000
Device: data[2] = 3.000000
Final result
data_in[0] = 42  data_out[0] = 6
data_in[1] = 42  data_out[1] = 6
data_in[2] = 42  data_out[2] = 6
data_in[3] = 42  data_out[3] = 6
data_in[4] = 42  data_out[4] = 6
Now using stream capture to create the graph
Input data
data_in[0] = 3
data_in[1] = 3
data_in[2] = 3
data_in[3] = 3
data_in[4] = 3
data_out[0] = 1
data_out[1] = 1
data_out[2] = 1
data_out[3] = 1
data_out[4] = 1
Host: data[0] = 42
Host: data[1] = 42
Host: data[2] = 42
Host: data[3] = 42
Host: data[4] = 42
Device: data[1] = 3.000000
Device: data[3] = 3.000000
Device: data[4] = 3.000000
Device: data[0] = 3.000000
Device: data[2] = 3.000000
Final result
data_in[0] = 42  data_out[0] = 6
data_in[1] = 42  data_out[1] = 6
data_in[2] = 42  data_out[2] = 6
data_in[3] = 42  data_out[3] = 6
data_in[4] = 42  data_out[4] = 6

On top of that, the stream-captured graph does perform the memcpys, but they are not shown in the debug dot print, as you can see in the graphs I attached.

Manually created graph


Graph created with stream capture
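
(For reference, one way to double-check which node types actually ended up in the captured graph, independently of the dot output, is to enumerate them with cudaGraphGetNodes / cudaGraphNodeGetType. A minimal sketch, assuming the graph_cptr handle from the code above, with error checking omitted:)

  size_t num_nodes = 0;
  // First call with a null node array just reports the node count.
  cudaGraphGetNodes(graph_cptr, nullptr, &num_nodes);
  std::vector<cudaGraphNode_t> nodes(num_nodes);
  cudaGraphGetNodes(graph_cptr, nodes.data(), &num_nodes);
  for (size_t i = 0; i < num_nodes; i++) {
    cudaGraphNodeType type;
    cudaGraphNodeGetType(nodes[i], &type);
    std::cout << "node " << i << " type = " << static_cast<int>(type)
              << (type == cudaGraphNodeTypeMemcpy ? " (memcpy)" : "") << std::endl;
  }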

The programming guide states:

3.2.8.7.7.1.1. Device Graph Requirements
…
Memcpy nodes:

  • Only copies involving device memory and/or pinned device-mapped host memory are permitted.

Your usage of std::array does not constitute pinned device-mapped host memory. The following adaptation of your code seems to give the correct result for me:

# cat t332.cu
#include <cuda_runtime.h>
#include <stdio.h>

#include <array>
#include <iostream>
#include <vector>

#define checkCudaErrors(x) x

__global__ void debugPrint(float* data, int size) {
  int index = threadIdx.x + blockDim.x * blockIdx.x;
  int jump = blockDim.x * gridDim.x;
  for (int idx = index; idx < size; idx += jump) {
    if (idx < size) {
      printf("Device: data[%d] = %f\n", idx, data[idx]);
      data[idx] *= 2;
    }
  }
}

int main(int argc, char* argv[]) {
  constexpr int data_size = 100;
  std::array<float, data_size> data_in;
  std::array<float, data_size> data_out;
#ifdef USE_FIX
  cudaHostRegister(data_in.data(),  sizeof(data_in[0])*data_in.size(),   cudaHostRegisterDefault);
  cudaHostRegister(data_out.data(), sizeof(data_out[0])*data_out.size(), cudaHostRegisterDefault);
#endif
  for (size_t idx = 0; idx < data_size; idx++) {
    data_in[idx] = 3;
    data_out[idx] = -1;
  }

  cudaGraph_t graph;
  cudaGraphExec_t graph_exec;
  cudaStream_t stream;
  cudaGraphNode_t alloc, host_set, copy_to_device, copy_to_host, exec_kernel,
      free;
  cudaMemAllocNodeParams alloc_pars{};
  memset(&alloc_pars, 0, sizeof(alloc_pars));
  alloc_pars.poolProps.allocType = cudaMemAllocationTypePinned;
  alloc_pars.poolProps.location.id = 0;
  alloc_pars.poolProps.location.type = cudaMemLocationTypeDevice;
  alloc_pars.bytesize = sizeof(float) * data_in.size();
  auto set_value = [](void* data) {
    float* par = static_cast<float*>(data);
    for (int i = 0; i < data_size; i++) {
      par[i] = 42;
      std::cout << "Host: data[" << i << "] = " << par[i] << std::endl;
    }
  };

  cudaHostNodeParams host_pars{set_value, data_in.data()};

  checkCudaErrors(cudaGraphCreate(&graph, 0));
  checkCudaErrors(
      cudaGraphAddHostNode(&host_set, graph, nullptr, 0, &host_pars));

  checkCudaErrors(
      cudaGraphAddMemAllocNode(&alloc, graph, nullptr, 0, &alloc_pars));

  size_t array_size = data_in.size();
  cudaKernelNodeParams kernel_pars = {0};
  kernel_pars.func = (void*)debugPrint;
  kernel_pars.gridDim = dim3(15, 1, 1);
  kernel_pars.blockDim = dim3(1, 1, 1);
  kernel_pars.extra = NULL;
  kernel_pars.sharedMemBytes = 0;
  void* parameters[2] = {(void*)&alloc_pars.dptr, &array_size};
  kernel_pars.kernelParams = parameters;

  std::vector<cudaGraphNode_t> copy_to_device_dep;
  copy_to_device_dep.push_back(alloc);
  copy_to_device_dep.push_back(host_set);

  cudaMemcpy3DParms to_device_pars = {0};
  to_device_pars.dstPos = make_cudaPos(0, 0, 0);
  to_device_pars.dstPtr = make_cudaPitchedPtr(
      alloc_pars.dptr, array_size * sizeof(float), array_size, 1);
  to_device_pars.extent = make_cudaExtent(sizeof(float) * array_size, 1, 1);
  to_device_pars.kind = cudaMemcpyHostToDevice;
  to_device_pars.srcPos = make_cudaPos(0, 0, 0);
  to_device_pars.srcPtr = make_cudaPitchedPtr(
      data_in.data(), array_size * sizeof(float), array_size, 1);

  cudaMemcpy3DParms to_host_pars = {0};
  to_host_pars.dstPos = make_cudaPos(0, 0, 0);
  to_host_pars.dstPtr = make_cudaPitchedPtr(
      data_out.data(), array_size * sizeof(float), array_size, 1);
  to_host_pars.extent = make_cudaExtent(sizeof(float) * array_size, 1, 1);
  to_host_pars.kind = cudaMemcpyDeviceToHost;
  to_host_pars.srcPos = make_cudaPos(0, 0, 0);
  to_host_pars.srcPtr = make_cudaPitchedPtr(
      alloc_pars.dptr, array_size * sizeof(float), array_size, 1);

  checkCudaErrors(
      cudaGraphAddMemcpyNode(&copy_to_device, graph, copy_to_device_dep.data(),
                             copy_to_device_dep.size(), &to_device_pars));
  checkCudaErrors(cudaGraphAddKernelNode(&exec_kernel, graph, &copy_to_device,
                                         1, &kernel_pars));

  checkCudaErrors(cudaGraphAddMemcpyNode(&copy_to_host, graph, &exec_kernel, 1,
                                         &to_host_pars));
  checkCudaErrors(
      cudaGraphAddMemFreeNode(&free, graph, &copy_to_host, 1, alloc_pars.dptr));

  checkCudaErrors(cudaStreamCreate(&stream));
  checkCudaErrors(cudaGraphInstantiate(&graph_exec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphLaunch(graph_exec, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  checkCudaErrors(cudaGraphExecDestroy(graph_exec));
  std::cout << "Final result" << std::endl;
  for (size_t idx = 0; idx < data_out.size(); idx++) {
    std::cout << "data_in[" << idx << "] = " << data_in[idx] << "  data_out["
              << idx << "] = " << data_out[idx] << std::endl;
  }
}

# nvcc -o t332 t332.cu -arch=sm_89 -DUSE_FIX
# compute-sanitizer ./t332
========= COMPUTE-SANITIZER
Host: data[0] = 42
Host: data[1] = 42
Host: data[2] = 42
Host: data[3] = 42
Host: data[4] = 42
Host: data[5] = 42
Host: data[6] = 42
Host: data[7] = 42
Host: data[8] = 42
Host: data[9] = 42
Host: data[10] = 42
Host: data[11] = 42
Host: data[12] = 42
Host: data[13] = 42
Host: data[14] = 42
Host: data[15] = 42
Host: data[16] = 42
Host: data[17] = 42
Host: data[18] = 42
Host: data[19] = 42
Host: data[20] = 42
Host: data[21] = 42
Host: data[22] = 42
Host: data[23] = 42
Host: data[24] = 42
Host: data[25] = 42
Host: data[26] = 42
Host: data[27] = 42
Host: data[28] = 42
Host: data[29] = 42
Host: data[30] = 42
Host: data[31] = 42
Host: data[32] = 42
Host: data[33] = 42
Host: data[34] = 42
Host: data[35] = 42
Host: data[36] = 42
Host: data[37] = 42
Host: data[38] = 42
Host: data[39] = 42
Host: data[40] = 42
Host: data[41] = 42
Host: data[42] = 42
Host: data[43] = 42
Host: data[44] = 42
Host: data[45] = 42
Host: data[46] = 42
Host: data[47] = 42
Host: data[48] = 42
Host: data[49] = 42
Host: data[50] = 42
Host: data[51] = 42
Host: data[52] = 42
Host: data[53] = 42
Host: data[54] = 42
Host: data[55] = 42
Host: data[56] = 42
Host: data[57] = 42
Host: data[58] = 42
Host: data[59] = 42
Host: data[60] = 42
Host: data[61] = 42
Host: data[62] = 42
Host: data[63] = 42
Host: data[64] = 42
Host: data[65] = 42
Host: data[66] = 42
Host: data[67] = 42
Host: data[68] = 42
Host: data[69] = 42
Host: data[70] = 42
Host: data[71] = 42
Host: data[72] = 42
Host: data[73] = 42
Host: data[74] = 42
Host: data[75] = 42
Host: data[76] = 42
Host: data[77] = 42
Host: data[78] = 42
Host: data[79] = 42
Host: data[80] = 42
Host: data[81] = 42
Host: data[82] = 42
Host: data[83] = 42
Host: data[84] = 42
Host: data[85] = 42
Host: data[86] = 42
Host: data[87] = 42
Host: data[88] = 42
Host: data[89] = 42
Host: data[90] = 42
Host: data[91] = 42
Host: data[92] = 42
Host: data[93] = 42
Host: data[94] = 42
Host: data[95] = 42
Host: data[96] = 42
Host: data[97] = 42
Host: data[98] = 42
Host: data[99] = 42
Device: data[11] = 42.000000
Device: data[6] = 42.000000
Device: data[10] = 42.000000
Device: data[1] = 42.000000
Device: data[5] = 42.000000
Device: data[12] = 42.000000
Device: data[0] = 42.000000
Device: data[7] = 42.000000
Device: data[2] = 42.000000
Device: data[9] = 42.000000
Device: data[4] = 42.000000
Device: data[13] = 42.000000
Device: data[14] = 42.000000
Device: data[8] = 42.000000
Device: data[3] = 42.000000
Device: data[26] = 42.000000
Device: data[21] = 42.000000
Device: data[25] = 42.000000
Device: data[16] = 42.000000
Device: data[27] = 42.000000
Device: data[20] = 42.000000
Device: data[22] = 42.000000
Device: data[28] = 42.000000
Device: data[15] = 42.000000
Device: data[17] = 42.000000
Device: data[23] = 42.000000
Device: data[24] = 42.000000
Device: data[18] = 42.000000
Device: data[19] = 42.000000
Device: data[29] = 42.000000
Device: data[41] = 42.000000
Device: data[36] = 42.000000
Device: data[40] = 42.000000
Device: data[42] = 42.000000
Device: data[31] = 42.000000
Device: data[35] = 42.000000
Device: data[39] = 42.000000
Device: data[37] = 42.000000
Device: data[43] = 42.000000
Device: data[30] = 42.000000
Device: data[32] = 42.000000
Device: data[38] = 42.000000
Device: data[34] = 42.000000
Device: data[33] = 42.000000
Device: data[44] = 42.000000
Device: data[56] = 42.000000
Device: data[51] = 42.000000
Device: data[55] = 42.000000
Device: data[57] = 42.000000
Device: data[50] = 42.000000
Device: data[46] = 42.000000
Device: data[54] = 42.000000
Device: data[52] = 42.000000
Device: data[58] = 42.000000
Device: data[45] = 42.000000
Device: data[53] = 42.000000
Device: data[47] = 42.000000
Device: data[49] = 42.000000
Device: data[59] = 42.000000
Device: data[48] = 42.000000
Device: data[71] = 42.000000
Device: data[66] = 42.000000
Device: data[70] = 42.000000
Device: data[72] = 42.000000
Device: data[65] = 42.000000
Device: data[61] = 42.000000
Device: data[69] = 42.000000
Device: data[67] = 42.000000
Device: data[73] = 42.000000
Device: data[60] = 42.000000
Device: data[64] = 42.000000
Device: data[68] = 42.000000
Device: data[62] = 42.000000
Device: data[74] = 42.000000
Device: data[63] = 42.000000
Device: data[86] = 42.000000
Device: data[81] = 42.000000
Device: data[85] = 42.000000
Device: data[87] = 42.000000
Device: data[80] = 42.000000
Device: data[76] = 42.000000
Device: data[84] = 42.000000
Device: data[82] = 42.000000
Device: data[88] = 42.000000
Device: data[75] = 42.000000
Device: data[79] = 42.000000
Device: data[83] = 42.000000
Device: data[77] = 42.000000
Device: data[89] = 42.000000
Device: data[78] = 42.000000
Device: data[96] = 42.000000
Device: data[95] = 42.000000
Device: data[91] = 42.000000
Device: data[99] = 42.000000
Device: data[97] = 42.000000
Device: data[90] = 42.000000
Device: data[94] = 42.000000
Device: data[98] = 42.000000
Device: data[92] = 42.000000
Device: data[93] = 42.000000
Final result
data_in[0] = 42  data_out[0] = 84
data_in[1] = 42  data_out[1] = 84
data_in[2] = 42  data_out[2] = 84
data_in[3] = 42  data_out[3] = 84
data_in[4] = 42  data_out[4] = 84
data_in[5] = 42  data_out[5] = 84
data_in[6] = 42  data_out[6] = 84
data_in[7] = 42  data_out[7] = 84
data_in[8] = 42  data_out[8] = 84
data_in[9] = 42  data_out[9] = 84
data_in[10] = 42  data_out[10] = 84
data_in[11] = 42  data_out[11] = 84
data_in[12] = 42  data_out[12] = 84
data_in[13] = 42  data_out[13] = 84
data_in[14] = 42  data_out[14] = 84
data_in[15] = 42  data_out[15] = 84
data_in[16] = 42  data_out[16] = 84
data_in[17] = 42  data_out[17] = 84
data_in[18] = 42  data_out[18] = 84
data_in[19] = 42  data_out[19] = 84
data_in[20] = 42  data_out[20] = 84
data_in[21] = 42  data_out[21] = 84
data_in[22] = 42  data_out[22] = 84
data_in[23] = 42  data_out[23] = 84
data_in[24] = 42  data_out[24] = 84
data_in[25] = 42  data_out[25] = 84
data_in[26] = 42  data_out[26] = 84
data_in[27] = 42  data_out[27] = 84
data_in[28] = 42  data_out[28] = 84
data_in[29] = 42  data_out[29] = 84
data_in[30] = 42  data_out[30] = 84
data_in[31] = 42  data_out[31] = 84
data_in[32] = 42  data_out[32] = 84
data_in[33] = 42  data_out[33] = 84
data_in[34] = 42  data_out[34] = 84
data_in[35] = 42  data_out[35] = 84
data_in[36] = 42  data_out[36] = 84
data_in[37] = 42  data_out[37] = 84
data_in[38] = 42  data_out[38] = 84
data_in[39] = 42  data_out[39] = 84
data_in[40] = 42  data_out[40] = 84
data_in[41] = 42  data_out[41] = 84
data_in[42] = 42  data_out[42] = 84
data_in[43] = 42  data_out[43] = 84
data_in[44] = 42  data_out[44] = 84
data_in[45] = 42  data_out[45] = 84
data_in[46] = 42  data_out[46] = 84
data_in[47] = 42  data_out[47] = 84
data_in[48] = 42  data_out[48] = 84
data_in[49] = 42  data_out[49] = 84
data_in[50] = 42  data_out[50] = 84
data_in[51] = 42  data_out[51] = 84
data_in[52] = 42  data_out[52] = 84
data_in[53] = 42  data_out[53] = 84
data_in[54] = 42  data_out[54] = 84
data_in[55] = 42  data_out[55] = 84
data_in[56] = 42  data_out[56] = 84
data_in[57] = 42  data_out[57] = 84
data_in[58] = 42  data_out[58] = 84
data_in[59] = 42  data_out[59] = 84
data_in[60] = 42  data_out[60] = 84
data_in[61] = 42  data_out[61] = 84
data_in[62] = 42  data_out[62] = 84
data_in[63] = 42  data_out[63] = 84
data_in[64] = 42  data_out[64] = 84
data_in[65] = 42  data_out[65] = 84
data_in[66] = 42  data_out[66] = 84
data_in[67] = 42  data_out[67] = 84
data_in[68] = 42  data_out[68] = 84
data_in[69] = 42  data_out[69] = 84
data_in[70] = 42  data_out[70] = 84
data_in[71] = 42  data_out[71] = 84
data_in[72] = 42  data_out[72] = 84
data_in[73] = 42  data_out[73] = 84
data_in[74] = 42  data_out[74] = 84
data_in[75] = 42  data_out[75] = 84
data_in[76] = 42  data_out[76] = 84
data_in[77] = 42  data_out[77] = 84
data_in[78] = 42  data_out[78] = 84
data_in[79] = 42  data_out[79] = 84
data_in[80] = 42  data_out[80] = 84
data_in[81] = 42  data_out[81] = 84
data_in[82] = 42  data_out[82] = 84
data_in[83] = 42  data_out[83] = 84
data_in[84] = 42  data_out[84] = 84
data_in[85] = 42  data_out[85] = 84
data_in[86] = 42  data_out[86] = 84
data_in[87] = 42  data_out[87] = 84
data_in[88] = 42  data_out[88] = 84
data_in[89] = 42  data_out[89] = 84
data_in[90] = 42  data_out[90] = 84
data_in[91] = 42  data_out[91] = 84
data_in[92] = 42  data_out[92] = 84
data_in[93] = 42  data_out[93] = 84
data_in[94] = 42  data_out[94] = 84
data_in[95] = 42  data_out[95] = 84
data_in[96] = 42  data_out[96] = 84
data_in[97] = 42  data_out[97] = 84
data_in[98] = 42  data_out[98] = 84
data_in[99] = 42  data_out[99] = 84
========= ERROR SUMMARY: 0 errors
#

(CUDA 12.2)
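
For what it's worth, the essence of the change is just page-locking the two host buffers before the graph is built; the rest of the code is unchanged. A minimal sketch of that part (unregistering at the end is optional cleanup; error checking omitted):

  // Pin the existing std::array storage so the graph's memcpy nodes see
  // page-locked host memory instead of pageable memory.
  cudaHostRegister(data_in.data(),  sizeof(float) * data_in.size(),  cudaHostRegisterDefault);
  cudaHostRegister(data_out.data(), sizeof(float) * data_out.size(), cudaHostRegisterDefault);

  // ... build, instantiate, launch, and synchronize the graph as before ...

  // Optional: release the page-locking once the device no longer uses the buffers.
  cudaHostUnregister(data_in.data());
  cudaHostUnregister(data_out.data());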


Hi Robert, indeed you are right, that fixes it.
I had completely overlooked that section; thank you for pointing it out to me.
I also noticed in the same section the following line:

The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.

Does that mean that memalloc and host nodes will not be supported, or is this just a new feature and the guide has not been updated yet?

My expectation is that it is a documentation oversight.

If it's of concern, you can always request CUDA documentation updates by filing a bug.

Sorry, I read the programming guide incorrectly. The section I linked pertains to device graph launch, and you are not doing a device graph launch. So although using pinned memory seems to affect the behavior of your test case, I cannot explain why. The reason the memalloc and host nodes are not listed there is that they cannot be used in a device graph launch.

Sorry for the confusion.

A relevant sample code also uses pinned memory for the data input to the H->D memcpy node, but I haven’t found the basis of a requirement for it. Since graph usage implies something like copy/compute overlap, and we know that pinned memory is required for copy/compute overlap, there may be some logic here, but I haven’t found an explicit statement to that effect.
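
For comparison, such samples typically allocate the host-side staging buffer as pinned memory up front rather than registering existing storage. A sketch of that pattern (not taken from any particular sample; error checking omitted):

  // Allocate page-locked (pinned) host memory to serve as the source of
  // the host-to-device memcpy node, then release it when done.
  float* h_in = nullptr;
  cudaHostAlloc(reinterpret_cast<void**>(&h_in), data_size * sizeof(float), cudaHostAllocDefault);
  for (int i = 0; i < data_size; i++) h_in[i] = 3.0f;
  // ... use h_in as the source pointer of the H->D memcpy node ...
  cudaFreeHost(h_in);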


Thank you, Robert, for clarifying it. I also noticed that in the memcpy node of the debug graph you can see that it expects the host memory to be pageable. So maybe, even though it is not documented, that is what is expected.

Pageable is the opposite of pinned=page-locked!

Exactly my point. I think there is an error even in the debug graph: from the debug graph you would expect the memcpy from non-pinned memory to happen after the host code, but that doesn't seem to be the case.
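
As an aside, one way to check at runtime whether a given host pointer is pageable or pinned is cudaPointerGetAttributes. A small sketch (on recent CUDA versions an unregistered host pointer is reported as cudaMemoryTypeUnregistered rather than returning an error; error checking omitted):

  // Query whether data_in's storage is pageable (unregistered) or pinned
  // (registered / allocated with cudaHostAlloc).
  cudaPointerAttributes attr{};
  cudaPointerGetAttributes(&attr, data_in.data());
  if (attr.type == cudaMemoryTypeUnregistered)
    std::cout << "data_in is pageable host memory" << std::endl;
  else if (attr.type == cudaMemoryTypeHost)
    std::cout << "data_in is pinned (page-locked) host memory" << std::endl;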
