I try this code on A100 with PCIE-4.0, I want to know the H2D and D2H throughput.
I get 24 GB/s for each run when running them singly. But the throughput gets much lower when running D2H and H2D together. Can you give some advice?
H2D Transfer Time: 0.0783002 seconds
D2H Transfer Time: 0.0808056 seconds
Parallel H2D and D2H Transfer Time: 0.122456 seconds
H2D Bandwidth: 24.944 GB/s
D2H Bandwidth: 24.1707 GB/s
Parallel Bandwidth: 15.9496 GB/s
#include <iostream>
#include <cuda_runtime.h>
#include <chrono>
void checkCudaError(cudaError_t err, const char* msg) {
if (err != cudaSuccess) {
std::cerr << msg << ": " << cudaGetErrorString(err) << std::endl;
exit(-1);
}
}
int main() {
size_t dataSize = 500 * (1 << 20); // 数据大小: 50MB,可以修改为其他大小
// 1. Host and Device
float *h_data = nullptr, *d_data = nullptr;
// pinned memory
checkCudaError(cudaMallocHost(&h_data, dataSize * sizeof(float)), "cudaMallocHost failed");
checkCudaError(cudaMalloc(&d_data, dataSize * sizeof(float)), "cudaMalloc failed");
// init
for (size_t i = 0; i < dataSize; ++i) {
h_data[i] = static_cast<float>(i);
}
checkCudaError(cudaMemcpy(d_data, h_data, dataSize * sizeof(float), cudaMemcpyHostToDevice), "H2D memcpy failed");
checkCudaError(cudaMemcpy(h_data, d_data, dataSize * sizeof(float), cudaMemcpyDeviceToHost), "D2H memcpy failed");
// 2. H2D (Host to Device)
auto start = std::chrono::high_resolution_clock::now();
checkCudaError(cudaMemcpy(d_data, h_data, dataSize * sizeof(float), cudaMemcpyHostToDevice), "H2D memcpy failed");
auto end = std::chrono::high_resolution_clock::now();
auto durationH2D = std::chrono::duration<double>(end - start).count();
std::cout << "H2D Transfer Time: " << durationH2D << " seconds" << std::endl;
// 3. D2H (Device to Host)
start = std::chrono::high_resolution_clock::now();
checkCudaError(cudaMemcpy(h_data, d_data, dataSize * sizeof(float), cudaMemcpyDeviceToHost), "D2H memcpy failed");
end = std::chrono::high_resolution_clock::now();
auto durationD2H = std::chrono::duration<double>(end - start).count();
std::cout << "D2H Transfer Time: " << durationD2H << " seconds" << std::endl;
// 4.
cudaStream_t stream1, stream2;
checkCudaError(cudaStreamCreate(&stream1), "cudaStreamCreate failed");
checkCudaError(cudaStreamCreate(&stream2), "cudaStreamCreate failed");
cudaDeviceSynchronize();
// stream1 H2D
start = std::chrono::high_resolution_clock::now();
checkCudaError(cudaMemcpyAsync(d_data, h_data, dataSize * sizeof(float), cudaMemcpyHostToDevice, stream1), "Async H2D failed");
// stream2 D2H
checkCudaError(cudaMemcpyAsync(h_data, d_data, dataSize * sizeof(float), cudaMemcpyDeviceToHost, stream2), "Async D2H failed");
// wait
checkCudaError(cudaStreamSynchronize(stream1), "Stream1 sync failed");
checkCudaError(cudaStreamSynchronize(stream2), "Stream2 sync failed");
end = std::chrono::high_resolution_clock::now();
auto durationParallel = std::chrono::duration<double>(end - start).count();
std::cout << "Parallel H2D and D2H Transfer Time: " << durationParallel << " seconds" << std::endl;
//(GB/s)
double h2d_bandwidth = dataSize * sizeof(float) / durationH2D / (1 << 30); // GB/s
double d2h_bandwidth = dataSize * sizeof(float) / durationD2H / (1 << 30); // GB/s
double parallel_bandwidth = dataSize * sizeof(float) / durationParallel / (1 << 30); // GB/s
std::cout << "H2D Bandwidth: " << h2d_bandwidth << " GB/s" << std::endl;
std::cout << "D2H Bandwidth: " << d2h_bandwidth << " GB/s" << std::endl;
std::cout << "Parallel Bandwidth: " << parallel_bandwidth << " GB/s" << std::endl;
// clear
checkCudaError(cudaFree(d_data), "cudaFree failed");
checkCudaError(cudaFreeHost(h_data), "cudaFreeHost failed");
return 0;
}