Millisecond-scale D2D memcpy start latency in secondary CUDA context while primary-context kernel is running
I observe millisecond-scale GPU start latency for cudaMemcpyAsync device-to-device copies submitted in a secondary CUDA context while a long-running kernel is active in the primary CUDA context on the same GPU.
Environment:
- GPU: Tesla V100-PCIE-32GB
- Driver: 580.159.03
- CUDA Toolkit: 12.6.85
Minimal test:
- Create the CUDA primary context with the runtime API.
- Run case 1: create a secondary context and submit 1 MiB D2D
cudaMemcpyAsynccopies. - Run case 2: launch a long-running kernel in the primary context, then submit the same D2D copies in the primary context.
- Run case 3: launch a long-running kernel in the primary context, then create a secondary context and submit the same D2D copies in the secondary context.
In each case, the D2D source and destination buffers are allocated in the same CUDA context that submits the D2D memcpy. This is local D2D within one context, not a cross-context copy.
Observed in Nsight Systems:
- Case 1, direct secondary-context D2D:
API return → GPU start is ~1.6 us. - Case 2, primary-context kernel + primary-context D2D:
API return → GPU start is ~1.5 us. - Case 3, primary-context kernel + secondary-context D2D:
API return → GPU start is ~2.26 ms.
The D2D API call returns quickly, but the GPU copy starts much later only in case 3.
Is this expected CUDA scheduling behavior? If yes, is there a recommended way to avoid this latency when using a separate CUDA context for communication work?
Test code:
#include <cuda.h>
#include <cuda_runtime.h>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <thread>
#define CHECK_CUDA(call) \
do { \
cudaError_t st = (call); \
if (st != cudaSuccess) { \
std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
cudaGetErrorString(st)); \
std::exit(1); \
} \
} while (0)
#define CHECK_DRV(call) \
do { \
CUresult st = (call); \
if (st != CUDA_SUCCESS) { \
const char* name = nullptr; \
const char* text = nullptr; \
cuGetErrorName(st, &name); \
cuGetErrorString(st, &text); \
std::fprintf(stderr, "CUDA driver error %s:%d: %s %s\n", __FILE__, __LINE__, \
name ? name : "?", text ? text : "?"); \
std::exit(1); \
} \
} while (0)
__global__ void long_kernel(volatile unsigned long long* sink,
volatile const unsigned int* stop_flag,
unsigned long long iters) {
unsigned long long x = threadIdx.x + blockIdx.x * blockDim.x;
for (unsigned long long i = 0; i < iters; ++i) {
x = x * 2862933555777941757ULL + 3037000493ULL;
if ((i & 0x3fffffULL) == 0 && *stop_flag)
break;
}
sink[blockIdx.x * blockDim.x + threadIdx.x] = x;
}
struct KernelRun {
cudaStream_t stream = nullptr;
unsigned int* stop_host = nullptr;
unsigned int* stop_dev = nullptr;
unsigned long long* sink = nullptr;
};
static int env_int(const char* name, int def) {
const char* value = std::getenv(name);
return value && value[0] ? std::atoi(value) : def;
}
static void start_primary_kernel(KernelRun* k, int blocks, unsigned long long iters) {
CHECK_CUDA(cudaSetDevice(0));
CHECK_CUDA(cudaStreamCreateWithFlags(&k->stream, cudaStreamNonBlocking));
CHECK_CUDA(cudaHostAlloc(reinterpret_cast<void**>(&k->stop_host), sizeof(unsigned int),
cudaHostAllocMapped));
CHECK_CUDA(cudaHostGetDevicePointer(reinterpret_cast<void**>(&k->stop_dev), k->stop_host, 0));
*k->stop_host = 0;
CHECK_CUDA(cudaMalloc(reinterpret_cast<void**>(&k->sink),
static_cast<size_t>(blocks) * 256 * sizeof(*k->sink)));
long_kernel<<<blocks, 256, 0, k->stream>>>(k->sink, k->stop_dev, iters);
CHECK_CUDA(cudaGetLastError());
}
static void stop_primary_kernel(KernelRun* k) {
*k->stop_host = 1;
CHECK_CUDA(cudaStreamSynchronize(k->stream));
CHECK_CUDA(cudaFree(k->sink));
CHECK_CUDA(cudaFreeHost(k->stop_host));
CHECK_CUDA(cudaStreamDestroy(k->stream));
}
static void d2d_batch(const char* label, int chunks) {
constexpr size_t chunk_bytes = 1 << 20;
const size_t total = static_cast<size_t>(chunks) * chunk_bytes;
cudaStream_t stream = nullptr;
void* src = nullptr;
void* dst = nullptr;
CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
CHECK_CUDA(cudaMalloc(&src, total));
CHECK_CUDA(cudaMalloc(&dst, total));
CHECK_CUDA(cudaMemsetAsync(src, 0x5a, total, stream));
CHECK_CUDA(cudaStreamSynchronize(stream));
for (int i = 0; i < chunks; ++i) {
char* s = static_cast<char*>(src) + static_cast<size_t>(i) * chunk_bytes;
char* d = static_cast<char*>(dst) + static_cast<size_t>(i) * chunk_bytes;
CHECK_CUDA(cudaMemcpyAsync(d, s, chunk_bytes, cudaMemcpyDeviceToDevice, stream));
}
CHECK_CUDA(cudaStreamSynchronize(stream));
CHECK_CUDA(cudaFree(dst));
CHECK_CUDA(cudaFree(src));
CHECK_CUDA(cudaStreamDestroy(stream));
}
static void secondary_d2d_case(const char* label, int chunks) {
std::printf("%s\n", label);
CHECK_DRV(cuCtxSetCurrent(nullptr));
CUdevice dev;
CHECK_DRV(cuDeviceGet(&dev, 0));
CUcontext ctx = nullptr;
CHECK_DRV(cuCtxCreate(&ctx, CU_CTX_SCHED_SPIN | CU_CTX_MAP_HOST, dev));
d2d_batch("secondary_d2d_batch", chunks);
CHECK_DRV(cuCtxDestroy(ctx));
CHECK_CUDA(cudaSetDevice(0));
}
static void primary_d2d_case(const char* label, int chunks) {
std::printf("%s\n", label);
CHECK_CUDA(cudaSetDevice(0));
d2d_batch("primary_d2d_batch", chunks);
}
int main() {
const int blocks = env_int("REPRO_BLOCKS", 16);
const int chunks = env_int("REPRO_CHUNKS", 8);
const int gap_ms = env_int("REPRO_GAP_MS", 2);
const unsigned long long iters =
static_cast<unsigned long long>(env_int("REPRO_ITERS_M", 80)) * 1000000ULL;
CHECK_CUDA(cudaSetDevice(0));
CHECK_CUDA(cudaFree(0));
std::printf("blocks=%d chunks=%d gap_ms=%d iters=%llu\n", blocks, chunks, gap_ms, iters);
secondary_d2d_case("case1_direct_secondary_d2d", chunks);
KernelRun k1;
start_primary_kernel(&k1, blocks, iters);
std::this_thread::sleep_for(std::chrono::milliseconds(gap_ms));
primary_d2d_case("case2_primary_kernel_primary_d2d", chunks);
stop_primary_kernel(&k1);
KernelRun k2;
start_primary_kernel(&k2, blocks, iters);
std::this_thread::sleep_for(std::chrono::milliseconds(gap_ms));
secondary_d2d_case("case3_primary_kernel_secondary_d2d", chunks);
stop_primary_kernel(&k2);
return 0;
}