CUDA graph: kernel execution and DtoH/HtoD memcpy not concurrent when destination of DtoH memcpy is not mapped

In this example, I create a graph that simulates work done in 10 chunks: the kernels execute serially, and the HtoD memcpys write into a device buffer with space for only two chunks (double buffering). The graph looks like this:

Here’s the code:

#include <cstdio>
#include <cstdlib>
#include <vector>

#define gpuErrchk(ans)                    \
  {                                       \
    gpuAssert((ans), __FILE__, __LINE__); \
  }
__host__ inline void gpuAssert(cudaError_t code, const char* file, int line,
                               bool abort = true) {
  if (code != cudaSuccess) {
    printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
  }
}

// Busy-wait for ~10M clock cycles to simulate per-chunk work; the parameters
// are unused and only stand in for a real workload.
__global__ void kernel(int* indices, size_t num_indices) {
  size_t sleep_time = 10'000'000;
  size_t start = clock();
  size_t clock_offset = 0;
  while (clock_offset < sleep_time) {
    clock_offset = clock() - start;
  }
}

int main() {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);

  size_t num_indices = prop.totalGlobalMem / (30 * sizeof(int));
  size_t num_chunks = 10;
  size_t chunk_sizes = num_indices / num_chunks;
  size_t num_buffers = 2;
  cudaGraph_t graph;
  gpuErrchk(cudaGraphCreate(&graph, 0));
  int* d_indices;
  gpuErrchk(cudaMalloc(&d_indices, num_buffers * chunk_sizes * sizeof(int)));

  int* d_results;
  gpuErrchk(cudaMalloc(&d_results, num_indices * sizeof(int)));

  std::vector<int> indices(num_indices);
  std::vector<int> results(num_indices);

  // If the host results buffer is not mapped, the memcpys do not run in
  // parallel with the kernels.
  // gpuErrchk(cudaHostRegister(indices.data(), num_indices * sizeof(int),
  //                           cudaHostRegisterMapped));
  gpuErrchk(cudaHostRegister(results.data(), num_indices * sizeof(int),
                             cudaHostRegisterMapped));

  std::vector<cudaGraphNode_t> copy_indices_nodes(num_chunks);
  std::vector<cudaGraphNode_t> kernel_nodes(num_chunks);
  std::vector<cudaGraphNode_t> copy_results_nodes(num_chunks);

  for (int i = 0; i < num_chunks; i++) {
    // Cycle through the two device buffers for this chunk's indices.
    auto current_chunk = d_indices + chunk_sizes * (i % num_buffers);

    gpuErrchk(cudaGraphAddMemcpyNode1D(
        &copy_indices_nodes[i], graph, nullptr, 0, current_chunk,
        indices.data() + i * chunk_sizes, chunk_sizes * sizeof(int),
        cudaMemcpyHostToDevice));
    if (i >= num_buffers) {
      // Reusing a buffer: the HtoD copy must wait for the kernel that last
      // read from it.
      gpuErrchk(cudaGraphAddDependencies(graph, &kernel_nodes[i - num_buffers],
                                         &copy_indices_nodes[i], 1));
    }

    cudaKernelNodeParams kernel_params;
    kernel_params.func = (void*)kernel;
    kernel_params.gridDim = dim3(10, 1, 1);
    kernel_params.blockDim = dim3(256, 1, 1);
    kernel_params.sharedMemBytes = 0;
    void* args[2] = {&current_chunk, &chunk_sizes};
    kernel_params.kernelParams = args;
    kernel_params.extra = nullptr;

    gpuErrchk(cudaGraphAddKernelNode(
        &kernel_nodes[i], graph, &copy_indices_nodes[i], 1, &kernel_params));
    if (i > 0) {
      // Sequential execution of kernels
      gpuErrchk(cudaGraphAddDependencies(graph, &kernel_nodes[i - 1],
                                         &kernel_nodes[i], 1));
    }

    // Copy this chunk's results back to the host after its kernel finishes.
    gpuErrchk(cudaGraphAddMemcpyNode1D(
        &copy_results_nodes[i], graph, &kernel_nodes[i], 1,
        results.data() + i * chunk_sizes, d_results + i * chunk_sizes,
        chunk_sizes * sizeof(int), cudaMemcpyDeviceToHost));
  }

  cudaGraphExec_t graphExec;
  gpuErrchk(cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  gpuErrchk(cudaGraphLaunch(graphExec, 0));
  gpuErrchk(cudaDeviceSynchronize());

  gpuErrchk(cudaFree(d_indices));
  gpuErrchk(cudaFree(d_results));

  return 0;
}

The code is compiled with nvcc -o example -arch=sm_75 test.cu using CUDA 12.1, and run on an RTX 2080 Ti. When the host memory that is the destination of the DtoH memcpy (results in the code) is not mapped, everything executes serially. When it is mapped, the graph executes as expected. Mapping the source of the HtoD memcpy does not affect anything. In the images you can see the execution as well as confirmation of which memory is mapped or pageable:
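For reference, the mapped-vs-pageable status of a host buffer can also be checked programmatically with cudaPointerGetAttributes (CUDA 11 and later). A minimal sketch, assuming the results vector and the gpuErrchk macro from the code above:

  cudaPointerAttributes attr{};
  gpuErrchk(cudaPointerGetAttributes(&attr, results.data()));
  // cudaMemoryTypeHost         -> page-locked (registered) host memory
  // cudaMemoryTypeUnregistered -> ordinary pageable host memory
  printf("results buffer is %s\n",
         attr.type == cudaMemoryTypeHost ? "pinned" : "pageable");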

What’s the reason for this? Is this expected behaviour? Here is also a link to the profiling files. Thank you for the help!

Without pinned memory, a device-to-host transfer blocks the host thread until the transfer is complete, which in turn means waiting for all preceding work in the stream. During this time, no other work can be issued (as the thread is blocked).
That is expected behavior.
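For completeness, one way to avoid the blocking path is to page-lock both host buffers before building the graph. The sketch below is my own illustration rather than part of the original post: it reuses the indices/results vectors and gpuErrchk from the code above, and uses cudaHostRegisterDefault, since plain page-locking should be sufficient for the asynchronous copy path; the mapped flag used in the original code is not required for that.

  // Pin (page-lock) both host buffers so the HtoD and DtoH copies can take
  // the asynchronous path instead of the blocking pageable path.
  gpuErrchk(cudaHostRegister(indices.data(), num_indices * sizeof(int),
                             cudaHostRegisterDefault));
  gpuErrchk(cudaHostRegister(results.data(), num_indices * sizeof(int),
                             cudaHostRegisterDefault));

  // ... build, instantiate and launch the graph as in the original code ...

  gpuErrchk(cudaHostUnregister(indices.data()));
  gpuErrchk(cudaHostUnregister(results.data()));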


Thank you for the quick reply! To make sure that I have understood it correctly: since all three operations in a stream get launched at once, and the last one blocks the host thread, the operations in another stream have to wait to be launched. Correct? Is there a way to have the graph launch one node at a time instead of a whole stream? Otherwise, do you know of a way to avoid this behaviour?

A graph only models the dependencies. The order of execution is unspecified.

However, this blog post Constant Time Launch for Straight-Line CUDA Graphs and Other Performance Enhancements | NVIDIA Technical Blog says:

In CUDA Toolkit 12.0, CUDA started paying attention to node creation order as a heuristic to scheduling decisions. CUDA started loosely preferring to schedule node operations for nodes created earlier over nodes created later.

