Hi bros,
I’m writing a program that launch H2D/D2H in a fixed size, and notice H2D launch takes more time (2us → 200us) if memory pinned with cudaHostAllocWriteCombined
flag and size <= 24576 bytes.
A reproduction demo:
#include <cuda_runtime.h>
#include <iostream>
#include <unistd.h>
#include <thread>
#include <chrono>
int main() {
void* ptr = nullptr;
int num_blocks = 100;
// int block_size = 1024;
// int block_size = 2048;
// int block_size = 4096;
// int block_size = 8192;
// int block_size = 16384;
int block_size = 24576;
// int block_size = 24577;
// int block_size = 32768;
// int block_size = 65536;
// int block_size = 131072;
// auto _err = cudaHostAlloc(&ptr, 2 * num_blocks * block_size, cudaHostAllocWriteCombined);
auto _err = cudaHostAlloc(&ptr, 2 * num_blocks * block_size, cudaHostAllocDefault);
if (ptr == nullptr || _err != cudaSuccess) {
std::cout << "cudaHostAlloc " << 2 * num_blocks * block_size << " bytes fail: "
<< cudaGetErrorString(_err) << std::endl;
return -1;
} else {
std::cout << "cudaHostAlloc " << 2 * num_blocks * block_size << " bytes success. " << std::endl;
}
void* gpu_ptr = nullptr;
_err = cudaMalloc((void**)&gpu_ptr, num_blocks * block_size);
if (ptr == nullptr || _err != cudaSuccess) {
std::cout << "cudaMalloc " << num_blocks * block_size << " bytes fail: "
<< cudaGetErrorString(_err) << std::endl;
return -1;
} else {
std::cout << "cudaMalloc " << num_blocks * block_size << " bytes success. " << std::endl;
}
cudaStream_t d2h_stream;
_err = cudaStreamCreate(&d2h_stream);
if (_err != cudaSuccess) {
std::cerr << "Error creating CUDA stream: " << cudaGetErrorString(_err) << std::endl;
return -1;
}
cudaStream_t h2d_stream;
_err = cudaStreamCreate(&h2d_stream);
if (_err != cudaSuccess) {
std::cerr << "Error creating CUDA stream: " << cudaGetErrorString(_err) << std::endl;
return -1;
}
void* tgt_ptr;
void* src_ptr;
for (int i = 0; i < num_blocks; i++) {
// usleep(1);
std::this_thread::sleep_for(std::chrono::milliseconds(1));
tgt_ptr = (void*)(reinterpret_cast<char*>(ptr) + i * block_size);
src_ptr = (void*)(reinterpret_cast<char*>(gpu_ptr) + i * block_size);
auto start = std::chrono::high_resolution_clock::now();
cudaMemcpyAsync(tgt_ptr, src_ptr, block_size, cudaMemcpyDeviceToHost, d2h_stream);
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
std::cout << "D2H launch takes " << duration.count() << " microseconds." << std::endl;
}
// usleep(100);
std::this_thread::sleep_for(std::chrono::milliseconds(100));
for (int i = 0; i < num_blocks; i++) {
// usleep(1);
std::this_thread::sleep_for(std::chrono::milliseconds(1));
tgt_ptr = (void*)(reinterpret_cast<char*>(gpu_ptr) + i * block_size);
src_ptr = (void*)(reinterpret_cast<char*>(ptr) + i * block_size);
auto start = std::chrono::high_resolution_clock::now();
cudaMemcpyAsync(tgt_ptr, src_ptr, block_size, cudaMemcpyHostToDevice, h2d_stream);
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
std::cout << "H2D launch takes " << duration.count() << " microseconds." << std::endl;
}
// // usleep(100);
// std::this_thread::sleep_for(std::chrono::milliseconds(100));
// for (int i = 0; i < num_blocks; i++) {
// // usleep(1);
// std::this_thread::sleep_for(std::chrono::milliseconds(1));
// tgt_ptr = (void*)(reinterpret_cast<char*>(ptr) + 2* i * block_size);
// src_ptr = (void*)(reinterpret_cast<char*>(gpu_ptr) + i * block_size);
// cudaMemcpyAsync(tgt_ptr, src_ptr, block_size, cudaMemcpyDeviceToHost, d2h_stream);
// // usleep(1);
// std::this_thread::sleep_for(std::chrono::milliseconds(1));
// tgt_ptr = (void*)(reinterpret_cast<char*>(gpu_ptr) + i * block_size);
// src_ptr = (void*)(reinterpret_cast<char*>(ptr) + i * block_size);
// cudaMemcpyAsync(tgt_ptr, src_ptr, block_size, cudaMemcpyHostToDevice, h2d_stream);
// }
return 1;
}
with block_size = 24576:
D2H launch takes 3 microseconds.
D2H launch takes 2 microseconds.
D2H launch takes 2 microseconds.
D2H launch takes 2 microseconds.
D2H launch takes 3 microseconds.
D2H launch takes 3 microseconds.
H2D launch takes 279 microseconds.
H2D launch takes 221 microseconds.
H2D launch takes 264 microseconds.
H2D launch takes 212 microseconds.
H2D launch takes 267 microseconds.
H2D launch takes 219 microseconds.
H2D launch takes 269 microseconds.
with int block_size = 24577
D2H launch takes 2 microseconds. [88/1844]
D2H launch takes 1 microseconds.
D2H launch takes 1 microseconds.
D2H launch takes 1 microseconds.
D2H launch takes 1 microseconds.
D2H launch takes 2 microseconds.
D2H launch takes 1 microseconds.
D2H launch takes 19 microseconds.
D2H launch takes 2 microseconds.
D2H launch takes 1 microseconds.
H2D launch takes 15 microseconds.
H2D launch takes 3 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 1 microseconds.
H2D launch takes 2 microseconds.
The performance degradation would be less (2us → 4~5us) if we pin memory with cudaHostAllocDefault
.
Is it in line with expectation?