cudaMemcpyAsync H2D launch takes more time with <= 24576 bytes

Hi all,
I'm writing a program that launches H2D/D2H copies of a fixed size, and I noticed that the H2D launch takes much longer (2 us → 200 us) when the memory is pinned with the cudaHostAllocWriteCombined flag and the copy size is <= 24576 bytes.
A reproduction demo:

#include <cuda_runtime.h>
#include <iostream>
#include <thread>
#include <chrono>

int main() {
  void* ptr = nullptr;
  int num_blocks = 100;
  // Other sizes tested: 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072.
  int block_size = 24576;    // slow H2D launch
  // int block_size = 24577; // one byte larger: fast H2D launch

  // Pin with write-combined memory to reproduce the big slowdown; with
  // cudaHostAllocDefault the degradation is much smaller (see below).
  auto _err = cudaHostAlloc(&ptr, 2 * num_blocks * block_size, cudaHostAllocWriteCombined);
  // auto _err = cudaHostAlloc(&ptr, 2 * num_blocks * block_size, cudaHostAllocDefault);
  if (ptr == nullptr || _err != cudaSuccess) {
    std::cout << "cudaHostAlloc " << 2 * num_blocks * block_size << " bytes fail: "
              << cudaGetErrorString(_err) << std::endl;
    return -1;
  } else {
    std::cout << "cudaHostAlloc " << 2 * num_blocks * block_size << " bytes success. " << std::endl;
  }

  void* gpu_ptr = nullptr;
  _err = cudaMalloc((void**)&gpu_ptr, num_blocks * block_size);
  if (gpu_ptr == nullptr || _err != cudaSuccess) {
    std::cout << "cudaMalloc " << num_blocks * block_size << " bytes fail: "
              << cudaGetErrorString(_err) << std::endl;
    return -1;
  } else {
    std::cout << "cudaMalloc " << num_blocks * block_size << " bytes success. " << std::endl;
  }

  cudaStream_t d2h_stream;
  _err = cudaStreamCreate(&d2h_stream);
  if (_err != cudaSuccess) {
      std::cerr << "Error creating CUDA stream: " << cudaGetErrorString(_err) << std::endl;
      return -1;
  }

  cudaStream_t h2d_stream;
  _err = cudaStreamCreate(&h2d_stream);
  if (_err != cudaSuccess) {
      std::cerr << "Error creating CUDA stream: " << cudaGetErrorString(_err) << std::endl;
      return -1;
  }

  void* tgt_ptr;
  void* src_ptr;
  for (int i = 0; i < num_blocks; i++) {
    std::this_thread::sleep_for(std::chrono::milliseconds(1));
    tgt_ptr = reinterpret_cast<char*>(ptr) + i * block_size;
    src_ptr = reinterpret_cast<char*>(gpu_ptr) + i * block_size;
    // Time only how long the asynchronous launch takes to return,
    // not the transfer itself.
    auto start = std::chrono::high_resolution_clock::now();
    cudaMemcpyAsync(tgt_ptr, src_ptr, block_size, cudaMemcpyDeviceToHost, d2h_stream);
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    std::cout << "D2H launch takes " << duration.count() << " microseconds." << std::endl;
  }

  std::this_thread::sleep_for(std::chrono::milliseconds(100));

  for (int i = 0; i < num_blocks; i++) {
    std::this_thread::sleep_for(std::chrono::milliseconds(1));
    tgt_ptr = reinterpret_cast<char*>(gpu_ptr) + i * block_size;
    src_ptr = reinterpret_cast<char*>(ptr) + i * block_size;
    // Same launch-latency measurement for the H2D direction.
    auto start = std::chrono::high_resolution_clock::now();
    cudaMemcpyAsync(tgt_ptr, src_ptr, block_size, cudaMemcpyHostToDevice, h2d_stream);
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    std::cout << "H2D launch takes " << duration.count() << " microseconds." << std::endl;
  }


  // Wait for all outstanding copies to finish before tearing down.
  cudaDeviceSynchronize();
  cudaStreamDestroy(h2d_stream);
  cudaStreamDestroy(d2h_stream);
  cudaFree(gpu_ptr);
  cudaFreeHost(ptr);
  return 0;
}
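
I compile and run it with plain nvcc (the file name repro.cu is my own):

nvcc -O2 repro.cu -o repro && ./repro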

With block_size = 24576 (host memory pinned with cudaHostAllocWriteCombined):

D2H launch takes 3 microseconds.  
D2H launch takes 2 microseconds.  
D2H launch takes 2 microseconds.  
D2H launch takes 2 microseconds.  
D2H launch takes 3 microseconds.  
D2H launch takes 3 microseconds.  
H2D launch takes 279 microseconds.
H2D launch takes 221 microseconds.
H2D launch takes 264 microseconds.
H2D launch takes 212 microseconds.
H2D launch takes 267 microseconds.
H2D launch takes 219 microseconds.
H2D launch takes 269 microseconds.

With block_size = 24577:

D2H launch takes 2 microseconds.
D2H launch takes 1 microseconds.
D2H launch takes 1 microseconds.
D2H launch takes 1 microseconds.
D2H launch takes 1 microseconds.
D2H launch takes 2 microseconds.
D2H launch takes 1 microseconds.
D2H launch takes 19 microseconds.
D2H launch takes 2 microseconds.
D2H launch takes 1 microseconds.
H2D launch takes 15 microseconds.
H2D launch takes 3 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 2 microseconds.

The performance degradation is much smaller (2 us → 4~5 us) if the memory is pinned with cudaHostAllocDefault instead.
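
For reference, here is a compact sweep around the threshold (the helper launch_us is my own; it times a single launch per size/flag combination, with no warm-up averaging):

#include <cuda_runtime.h>
#include <chrono>
#include <iostream>

// Launch (enqueue) latency of a single H2D cudaMemcpyAsync for a given
// copy size and host-allocation flag. Sketch only: one sample each.
static long launch_us(size_t size, unsigned flag) {
  void* host = nullptr;
  void* dev = nullptr;
  cudaHostAlloc(&host, size, flag);
  cudaMalloc(&dev, size);
  cudaStream_t s;
  cudaStreamCreate(&s);
  auto t0 = std::chrono::high_resolution_clock::now();
  cudaMemcpyAsync(dev, host, size, cudaMemcpyHostToDevice, s);
  auto t1 = std::chrono::high_resolution_clock::now();
  cudaStreamSynchronize(s);
  cudaStreamDestroy(s);
  cudaFree(dev);
  cudaFreeHost(host);
  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}

int main() {
  cudaFree(0);  // create the CUDA context up front so it isn't timed
  for (size_t size : {8192, 16384, 24576, 24577, 32768}) {
    std::cout << size << " bytes: WriteCombined "
              << launch_us(size, cudaHostAllocWriteCombined) << " us, Default "
              << launch_us(size, cudaHostAllocDefault) << " us" << std::endl;
  }
  return 0;
}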

Is this in line with expectations?

Unable to reproduce. All operations take approximately the same time when I run the program.

I would suggest adding information on your GPU and host system hardware, as well as the operating system and NVIDIA driver versions used.
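
For example, something like this minimal sketch (standard CUDA runtime calls; nvidia-smi reports similar information) prints the GPU model and the driver/runtime CUDA versions:

#include <cuda_runtime.h>
#include <iostream>

int main() {
  cudaDeviceProp prop;
  int dev = 0, drv = 0, rt = 0;
  cudaGetDevice(&dev);
  cudaGetDeviceProperties(&prop, dev);  // GPU model name, etc.
  cudaDriverGetVersion(&drv);           // CUDA version supported by the driver
  cudaRuntimeGetVersion(&rt);           // CUDA runtime library version
  std::cout << "GPU: " << prop.name
            << ", driver CUDA " << drv / 1000 << "." << (drv % 100) / 10
            << ", runtime CUDA " << rt / 1000 << "." << (rt % 100) / 10
            << std::endl;
  return 0;
}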

Thanks for the reply!
I'm using an NVIDIA L20 in the nvcr.io/nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04 container, Driver Version: 535.161.07, CUDA Version: 12.5.

I'm able to reproduce the observation on an L4 GPU, CUDA 12.2, Linux. I can't really explain it.

If this is a matter of concern for you, you may wish to confirm that the behavior persists on the latest CUDA version, and if so, file a bug.

Not sure what the link references. A cloud instance of some sort? If so: I have zero experience with those. It is certainly conceivable that any kind of virtualization could create non-intuitive performance artifacts. If you have access to a bare-metal server, you could use that as a comparison point.

The link references a (Docker) container in the NVIDIA NGC repository. Specifically, one of these.
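
Such a container is typically started along these lines (assuming the NVIDIA Container Toolkit is installed on the host):

docker run --gpus all -it --rm nvcr.io/nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04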

Thanks for the reply!
I filed a bug for it: Log in | NVIDIA Developer

This is tracked under ticket ID 4859755. We will post the conclusion here once it is completed.