cudaMemcpyAsync H2D launch takes more time with <= 24576 bytes

Hi all,
I'm writing a program that launches H2D/D2H copies of a fixed size, and I noticed that the H2D launch takes much longer (2 us → 200 us) when the memory is pinned with the cudaHostAllocWriteCombined flag and the copy size is <= 24576 bytes.
A reproduction demo:

#include <cuda_runtime.h>
#include <iostream>
#include <thread>
#include <chrono>

int main() {
  void* ptr = nullptr;
  int num_blocks = 100;
  // Other sizes tested: 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072.
  int block_size = 24576;    // slow H2D launch
  // int block_size = 24577; // one byte larger: fast H2D launch

  // Pin with write-combined memory to reproduce the big slowdown; with
  // cudaHostAllocDefault the degradation is much smaller (see below).
  auto _err = cudaHostAlloc(&ptr, 2 * num_blocks * block_size, cudaHostAllocWriteCombined);
  // auto _err = cudaHostAlloc(&ptr, 2 * num_blocks * block_size, cudaHostAllocDefault);
  if (ptr == nullptr || _err != cudaSuccess) {
    std::cout << "cudaHostAlloc " << 2 * num_blocks * block_size << " bytes fail: "
              << cudaGetErrorString(_err) << std::endl;
    return -1;
  } else {
    std::cout << "cudaHostAlloc " << 2 * num_blocks * block_size << " bytes success. " << std::endl;
  }

  void* gpu_ptr = nullptr;
  _err = cudaMalloc((void**)&gpu_ptr, num_blocks * block_size);
  if (gpu_ptr == nullptr || _err != cudaSuccess) {
    std::cout << "cudaMalloc " << num_blocks * block_size << " bytes fail: "
              << cudaGetErrorString(_err) << std::endl;
    return -1;
  } else {
    std::cout << "cudaMalloc " << num_blocks * block_size << " bytes success. " << std::endl;
  }

  cudaStream_t d2h_stream;
  _err = cudaStreamCreate(&d2h_stream);
  if (_err != cudaSuccess) {
      std::cerr << "Error creating CUDA stream: " << cudaGetErrorString(_err) << std::endl;
      return -1;
  }

  cudaStream_t h2d_stream;
  _err = cudaStreamCreate(&h2d_stream);
  if (_err != cudaSuccess) {
      std::cerr << "Error creating CUDA stream: " << cudaGetErrorString(_err) << std::endl;
      return -1;
  }

  void* tgt_ptr;
  void* src_ptr;
  for (int i = 0; i < num_blocks; i++) {
    std::this_thread::sleep_for(std::chrono::milliseconds(1));
    tgt_ptr = reinterpret_cast<char*>(ptr) + i * block_size;
    src_ptr = reinterpret_cast<char*>(gpu_ptr) + i * block_size;
    // Time only how long the asynchronous launch takes to return,
    // not the transfer itself.
    auto start = std::chrono::high_resolution_clock::now();
    cudaMemcpyAsync(tgt_ptr, src_ptr, block_size, cudaMemcpyDeviceToHost, d2h_stream);
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    std::cout << "D2H launch takes " << duration.count() << " microseconds." << std::endl;
  }

  std::this_thread::sleep_for(std::chrono::milliseconds(100));

  for (int i = 0; i < num_blocks; i++) {
    std::this_thread::sleep_for(std::chrono::milliseconds(1));
    tgt_ptr = reinterpret_cast<char*>(gpu_ptr) + i * block_size;
    src_ptr = reinterpret_cast<char*>(ptr) + i * block_size;
    // Same launch-latency measurement for the H2D direction.
    auto start = std::chrono::high_resolution_clock::now();
    cudaMemcpyAsync(tgt_ptr, src_ptr, block_size, cudaMemcpyHostToDevice, h2d_stream);
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    std::cout << "H2D launch takes " << duration.count() << " microseconds." << std::endl;
  }


  // Wait for all outstanding copies to finish before tearing down.
  cudaDeviceSynchronize();
  cudaStreamDestroy(h2d_stream);
  cudaStreamDestroy(d2h_stream);
  cudaFree(gpu_ptr);
  cudaFreeHost(ptr);
  return 0;
}
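
I compile and run it with plain nvcc (the file name repro.cu is my own):

nvcc -O2 repro.cu -o repro && ./repro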

With block_size = 24576 (host memory pinned with cudaHostAllocWriteCombined):

D2H launch takes 3 microseconds.  
D2H launch takes 2 microseconds.  
D2H launch takes 2 microseconds.  
D2H launch takes 2 microseconds.  
D2H launch takes 3 microseconds.  
D2H launch takes 3 microseconds.  
H2D launch takes 279 microseconds.
H2D launch takes 221 microseconds.
H2D launch takes 264 microseconds.
H2D launch takes 212 microseconds.
H2D launch takes 267 microseconds.
H2D launch takes 219 microseconds.
H2D launch takes 269 microseconds.

With block_size = 24577:

D2H launch takes 2 microseconds.
D2H launch takes 1 microseconds.
D2H launch takes 1 microseconds.
D2H launch takes 1 microseconds.
D2H launch takes 1 microseconds.
D2H launch takes 2 microseconds.
D2H launch takes 1 microseconds.
D2H launch takes 19 microseconds.
D2H launch takes 2 microseconds.
D2H launch takes 1 microseconds.
H2D launch takes 15 microseconds.
H2D launch takes 3 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 2 microseconds.

The performance degradation is much smaller (2 us → 4~5 us) if the memory is pinned with cudaHostAllocDefault instead.
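
For reference, here is a compact sweep around the threshold (the helper launch_us is my own; it times a single launch per size/flag combination, with no warm-up averaging):

#include <cuda_runtime.h>
#include <chrono>
#include <iostream>

// Launch (enqueue) latency of a single H2D cudaMemcpyAsync for a given
// copy size and host-allocation flag. Sketch only: one sample each.
static long launch_us(size_t size, unsigned flag) {
  void* host = nullptr;
  void* dev = nullptr;
  cudaHostAlloc(&host, size, flag);
  cudaMalloc(&dev, size);
  cudaStream_t s;
  cudaStreamCreate(&s);
  auto t0 = std::chrono::high_resolution_clock::now();
  cudaMemcpyAsync(dev, host, size, cudaMemcpyHostToDevice, s);
  auto t1 = std::chrono::high_resolution_clock::now();
  cudaStreamSynchronize(s);
  cudaStreamDestroy(s);
  cudaFree(dev);
  cudaFreeHost(host);
  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}

int main() {
  cudaFree(0);  // create the CUDA context up front so it isn't timed
  for (size_t size : {8192, 16384, 24576, 24577, 32768}) {
    std::cout << size << " bytes: WriteCombined "
              << launch_us(size, cudaHostAllocWriteCombined) << " us, Default "
              << launch_us(size, cudaHostAllocDefault) << " us" << std::endl;
  }
  return 0;
}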

Is this in line with expectations?

Unable to reproduce. All operations take approximately the same time when I run the program.

I would suggest adding information on your GPU and host system hardware, as well as the operating system and NVIDIA driver versions used.
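
For example, something like this minimal sketch (standard CUDA runtime calls; nvidia-smi reports similar information) prints the GPU model and the driver/runtime CUDA versions:

#include <cuda_runtime.h>
#include <iostream>

int main() {
  cudaDeviceProp prop;
  int dev = 0, drv = 0, rt = 0;
  cudaGetDevice(&dev);
  cudaGetDeviceProperties(&prop, dev);  // GPU model name, etc.
  cudaDriverGetVersion(&drv);           // CUDA version supported by the driver
  cudaRuntimeGetVersion(&rt);           // CUDA runtime library version
  std::cout << "GPU: " << prop.name
            << ", driver CUDA " << drv / 1000 << "." << (drv % 100) / 10
            << ", runtime CUDA " << rt / 1000 << "." << (rt % 100) / 10
            << std::endl;
  return 0;
}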

Thanks for the reply!
I'm using an NVIDIA L20 in the nvcr.io/nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04 container, Driver Version: 535.161.07, CUDA Version: 12.5.

I'm able to reproduce the observation on an L4 GPU, CUDA 12.2, Linux. I can't really explain it.

If this is a matter of concern for you, you may wish to confirm that the behavior persists on the latest CUDA version, and if so, file a bug.

Not sure what the link references. A cloud instance of some sort? If so: I have zero experience with those. It is certainly conceivable that any kind of virtualization could create non-intuitive performance artifacts. If you have access to a bare-metal server, you could use that as a comparison point.

The link references a (Docker) container in the NVIDIA NGC repository. Specifically, one of these.
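
Such a container is typically started along these lines (assuming the NVIDIA Container Toolkit is installed on the host):

docker run --gpus all -it --rm nvcr.io/nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04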

Thanks for the reply!
I filed a bug for it: Log in | NVIDIA Developer

This is tracked under ticket ID 4859755. We will post the conclusion here once it is completed.