Millisecond-scale D2D memcpy admission latency in secondary CUDA context while primary-context kernel is running

Millisecond-scale D2D memcpy start latency in secondary CUDA context while primary-context kernel is running

I observe millisecond-scale GPU start latency for cudaMemcpyAsync device-to-device copies submitted in a secondary CUDA context while a long-running kernel is active in the primary CUDA context on the same GPU.

Environment:

  • GPU: Tesla V100-PCIE-32GB
  • Driver: 580.159.03
  • CUDA Toolkit: 12.6.85

Minimal test:

  1. Create the CUDA primary context with the runtime API.
  2. Run case 1: create a secondary context and submit 1 MiB D2D cudaMemcpyAsync copies.
  3. Run case 2: launch a long-running kernel in the primary context, then submit the same D2D copies in the primary context.
  4. Run case 3: launch a long-running kernel in the primary context, then create a secondary context and submit the same D2D copies in the secondary context.

In each case, the D2D source and destination buffers are allocated in the same CUDA context that submits the D2D memcpy. This is local D2D within one context, not a cross-context copy.

Observed in Nsight Systems:

  • Case 1, direct secondary-context D2D:
    API return → GPU start is ~1.6 us.
  • Case 2, primary-context kernel + primary-context D2D:
    API return → GPU start is ~1.5 us.
  • Case 3, primary-context kernel + secondary-context D2D:
    API return → GPU start is ~2.26 ms.

The D2D API call returns quickly, but the GPU copy starts much later only in case 3.

Is this expected CUDA scheduling behavior? If yes, is there a recommended way to avoid this latency when using a separate CUDA context for communication work?

Test code:

#include <cuda.h>
#include <cuda_runtime.h>

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <thread>

#define CHECK_CUDA(call)                                                             \
  do {                                                                               \
    cudaError_t st = (call);                                                         \
    if (st != cudaSuccess) {                                                         \
      std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,             \
                   cudaGetErrorString(st));                                         \
      std::exit(1);                                                                  \
    }                                                                                \
  } while (0)

#define CHECK_DRV(call)                                                              \
  do {                                                                               \
    CUresult st = (call);                                                            \
    if (st != CUDA_SUCCESS) {                                                        \
      const char* name = nullptr;                                                    \
      const char* text = nullptr;                                                    \
      cuGetErrorName(st, &name);                                                     \
      cuGetErrorString(st, &text);                                                   \
      std::fprintf(stderr, "CUDA driver error %s:%d: %s %s\n", __FILE__, __LINE__,   \
                   name ? name : "?", text ? text : "?");                          \
      std::exit(1);                                                                  \
    }                                                                                \
  } while (0)

__global__ void long_kernel(volatile unsigned long long* sink,
                            volatile const unsigned int* stop_flag,
                            unsigned long long iters) {
  unsigned long long x = threadIdx.x + blockIdx.x * blockDim.x;
  for (unsigned long long i = 0; i < iters; ++i) {
    x = x * 2862933555777941757ULL + 3037000493ULL;
    if ((i & 0x3fffffULL) == 0 && *stop_flag)
      break;
  }
  sink[blockIdx.x * blockDim.x + threadIdx.x] = x;
}

struct KernelRun {
  cudaStream_t stream = nullptr;
  unsigned int* stop_host = nullptr;
  unsigned int* stop_dev = nullptr;
  unsigned long long* sink = nullptr;
};

static int env_int(const char* name, int def) {
  const char* value = std::getenv(name);
  return value && value[0] ? std::atoi(value) : def;
}

static void start_primary_kernel(KernelRun* k, int blocks, unsigned long long iters) {
  CHECK_CUDA(cudaSetDevice(0));
  CHECK_CUDA(cudaStreamCreateWithFlags(&k->stream, cudaStreamNonBlocking));
  CHECK_CUDA(cudaHostAlloc(reinterpret_cast<void**>(&k->stop_host), sizeof(unsigned int),
                           cudaHostAllocMapped));
  CHECK_CUDA(cudaHostGetDevicePointer(reinterpret_cast<void**>(&k->stop_dev), k->stop_host, 0));
  *k->stop_host = 0;
  CHECK_CUDA(cudaMalloc(reinterpret_cast<void**>(&k->sink),
                        static_cast<size_t>(blocks) * 256 * sizeof(*k->sink)));
  long_kernel<<<blocks, 256, 0, k->stream>>>(k->sink, k->stop_dev, iters);
  CHECK_CUDA(cudaGetLastError());
}

static void stop_primary_kernel(KernelRun* k) {
  *k->stop_host = 1;
  CHECK_CUDA(cudaStreamSynchronize(k->stream));
  CHECK_CUDA(cudaFree(k->sink));
  CHECK_CUDA(cudaFreeHost(k->stop_host));
  CHECK_CUDA(cudaStreamDestroy(k->stream));
}

static void d2d_batch(const char* label, int chunks) {
  constexpr size_t chunk_bytes = 1 << 20;
  const size_t total = static_cast<size_t>(chunks) * chunk_bytes;
  cudaStream_t stream = nullptr;
  void* src = nullptr;
  void* dst = nullptr;

  CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  CHECK_CUDA(cudaMalloc(&src, total));
  CHECK_CUDA(cudaMalloc(&dst, total));
  CHECK_CUDA(cudaMemsetAsync(src, 0x5a, total, stream));
  CHECK_CUDA(cudaStreamSynchronize(stream));

  for (int i = 0; i < chunks; ++i) {
    char* s = static_cast<char*>(src) + static_cast<size_t>(i) * chunk_bytes;
    char* d = static_cast<char*>(dst) + static_cast<size_t>(i) * chunk_bytes;
    CHECK_CUDA(cudaMemcpyAsync(d, s, chunk_bytes, cudaMemcpyDeviceToDevice, stream));
  }
  CHECK_CUDA(cudaStreamSynchronize(stream));

  CHECK_CUDA(cudaFree(dst));
  CHECK_CUDA(cudaFree(src));
  CHECK_CUDA(cudaStreamDestroy(stream));
}

static void secondary_d2d_case(const char* label, int chunks) {
  std::printf("%s\n", label);
  CHECK_DRV(cuCtxSetCurrent(nullptr));
  CUdevice dev;
  CHECK_DRV(cuDeviceGet(&dev, 0));
  CUcontext ctx = nullptr;
  CHECK_DRV(cuCtxCreate(&ctx, CU_CTX_SCHED_SPIN | CU_CTX_MAP_HOST, dev));
  d2d_batch("secondary_d2d_batch", chunks);
  CHECK_DRV(cuCtxDestroy(ctx));
  CHECK_CUDA(cudaSetDevice(0));
}

static void primary_d2d_case(const char* label, int chunks) {
  std::printf("%s\n", label);
  CHECK_CUDA(cudaSetDevice(0));
  d2d_batch("primary_d2d_batch", chunks);
}

int main() {
  const int blocks = env_int("REPRO_BLOCKS", 16);
  const int chunks = env_int("REPRO_CHUNKS", 8);
  const int gap_ms = env_int("REPRO_GAP_MS", 2);
  const unsigned long long iters =
      static_cast<unsigned long long>(env_int("REPRO_ITERS_M", 80)) * 1000000ULL;

  CHECK_CUDA(cudaSetDevice(0));
  CHECK_CUDA(cudaFree(0));
  std::printf("blocks=%d chunks=%d gap_ms=%d iters=%llu\n", blocks, chunks, gap_ms, iters);

  secondary_d2d_case("case1_direct_secondary_d2d", chunks);

  KernelRun k1;
  start_primary_kernel(&k1, blocks, iters);
  std::this_thread::sleep_for(std::chrono::milliseconds(gap_ms));
  primary_d2d_case("case2_primary_kernel_primary_d2d", chunks);
  stop_primary_kernel(&k1);

  KernelRun k2;
  start_primary_kernel(&k2, blocks, iters);
  std::this_thread::sleep_for(std::chrono::milliseconds(gap_ms));
  secondary_d2d_case("case3_primary_kernel_secondary_d2d", chunks);
  stop_primary_kernel(&k2);

  return 0;
}

The context creation itself is not meant to be done within the performance critical part/loops of your program.
Can you create the context beforehand?

D2D copies are implemented as kernels as copy engines are only designed to saturate interconnect bandwidth. The GR engine (responsible for compute, 2D, and 3D) only runs one GPU context at a time. The default time slice is 2048 microseconds. If you issue a long running kernel on the primary context and it starts execution and then enqueue a D2D on the secondary context the D2D will not start until the GR engine context switches to the secondary context. This will occur when the primary context timeslice expires (resulting in instruction level preemption of the long running kernel) or the primary kernel completes the long running kernel and has no other work.