My need is simple: I have 32 MB of data computed on the GPU, and I want to transfer it to the host and store it either in heap memory (`int* data = malloc(...)`) or in a `std::vector` (`data.resize(...)`).
I decided to use pinned memory. Here is my code, tested on my Jetson TX2:
#include <stdio.h>
#include <cstdlib>        // malloc/free/exit — previously pulled in only transitively
#include <cstring>        // memcpy — previously pulled in only transitively
#include <cuda_runtime.h>
#include <iostream>
#include <memory>
#include <string>
#include <chrono>
#include <vector>
using namespace std;
#define MEMCOPY_ITERATIONS 1
// 32 MB. Kept integral (the old `32 * (1e6)` was a double, forcing a
// float->size_t conversion in every size argument).
#define DEFAULT_SIZE ((size_t)32 * 1000 * 1000)

// Abort with file/line context on any CUDA runtime error. Every CUDA call
// below is wrapped; an unchecked failure would make all later timings garbage.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Print one benchmark line: elapsed seconds and effective GB/s for a
// DEFAULT_SIZE-byte copy, given the elapsed time in milliseconds.
static void reportBandwidth(const char* tag, const char* label, double ms) {
    printf("%s %s Time elapsed %f sec, Bandwidth =%.1f GB/s\n",
           tag, label, ms / 1e3, (DEFAULT_SIZE / 1e9) / (ms / 1e3));
}

// Time a plain host-side memcpy of DEFAULT_SIZE bytes with std::chrono.
static void timeHostCopy(const char* label, void* dst, const void* src) {
    auto t1 = std::chrono::steady_clock::now();
    memcpy(dst, src, DEFAULT_SIZE);
    auto t2 = std::chrono::steady_clock::now();
    double ms = std::chrono::duration<double, std::milli>(t2 - t1).count();
    reportBandwidth("cpu memcpy", label, ms);
}

// Time a cudaMemcpy of DEFAULT_SIZE bytes with host wall clock.
// cudaMemcpy on pageable/pinned host memory is blocking, so the extra
// cudaDeviceSynchronize is belt-and-braces, matching the original code.
static void timeCudaCopyWall(const char* label, void* dst, const void* src,
                             cudaMemcpyKind kind) {
    auto t1 = std::chrono::steady_clock::now();
    CUDA_CHECK(cudaMemcpy(dst, src, DEFAULT_SIZE, kind));
    CUDA_CHECK(cudaDeviceSynchronize());
    auto t2 = std::chrono::steady_clock::now();
    double ms = std::chrono::duration<double, std::milli>(t2 - t1).count();
    reportBandwidth("cudaMemcpy", label, ms);
}

// Time a cudaMemcpy of DEFAULT_SIZE bytes with CUDA events (GPU timeline,
// stream 0). Events are always destroyed — the original "device2pinned gpu
// time" block leaked both of its events.
static void timeCudaCopyEvents(const char* label, void* dst, const void* src,
                               cudaMemcpyKind kind) {
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start, 0));
    CUDA_CHECK(cudaMemcpy(dst, src, DEFAULT_SIZE, kind));
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    reportBandwidth("cudaMemcpy", label, ms);
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
}

// Benchmark driver: measures copy bandwidth between malloc'd, std::vector,
// pinned (cudaHostAlloc) and device (cudaMalloc) buffers of DEFAULT_SIZE bytes.
int main(int argc, char **argv) {
    printf("%s Starting...\n", argv[0]);
    // set up device
    int dev = 0;
    CUDA_CHECK(cudaSetDevice(dev));
    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev)); // was hard-coded 0, ignoring dev
    printf("Using Device %d: %s\n", dev, deviceProp.name);

    // Host buffers.
    size_t num = DEFAULT_SIZE / sizeof(float);
    vector<float> h_A_vec(num);   // value-initialized -> pages are touched
    vector<float> h_B_vec(num);
    float* h_A = (float*)malloc(DEFAULT_SIZE);
    float* h_B = (float*)malloc(DEFAULT_SIZE);
    if (h_A == NULL || h_B == NULL) {
        fprintf(stderr, "host malloc failed\n");
        return EXIT_FAILURE;
    }
    // Touch every page up front so the first timed memcpy does not pay
    // first-touch page-fault cost (which would understate bandwidth).
    memset(h_A, 0, DEFAULT_SIZE);
    memset(h_B, 0, DEFAULT_SIZE);

    // cpu bandwidth tests
    timeHostCopy("malloc2malloc", h_A, h_B);
    timeHostCopy("vector2vector", h_A_vec.data(), h_B_vec.data()); // typo "vecotr" fixed
    // Label fixed: the copy goes vector -> malloc (dst is h_A), the old
    // "malloc2vector" label described the opposite direction.
    timeHostCopy("vector2malloc", h_A, h_B_vec.data());

    // Device buffer and page-locked (pinned) host buffer.
    float *d_A = NULL;
    float *h_pinned_A = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_A, DEFAULT_SIZE));
    CUDA_CHECK(cudaHostAlloc((void**)&h_pinned_A, DEFAULT_SIZE, cudaHostAllocDefault));
    CUDA_CHECK(cudaMemset(d_A, 0, DEFAULT_SIZE)); // make source contents defined
    memset(h_pinned_A, 0, DEFAULT_SIZE);

    timeHostCopy("pinned2vector", h_A_vec.data(), h_pinned_A);
    timeCudaCopyWall("device2vector", h_B_vec.data(), d_A, cudaMemcpyDeviceToHost);
    timeCudaCopyWall("device2malloc", h_B, d_A, cudaMemcpyDeviceToHost);
    timeCudaCopyWall("device2pinned", h_pinned_A, d_A, cudaMemcpyDeviceToHost);
    timeCudaCopyEvents("device2pinned gpu", h_pinned_A, d_A, cudaMemcpyDeviceToHost);
    timeHostCopy("pinned2malloc", h_B, h_pinned_A);
    timeCudaCopyEvents("pinned2malloc gpu", h_B, h_pinned_A, cudaMemcpyHostToHost);
    timeCudaCopyEvents("pinned2vector gpu", h_A_vec.data(), h_pinned_A, cudaMemcpyHostToHost);

    // Release everything (the original leaked all four buffers).
    free(h_A);
    free(h_B);
    CUDA_CHECK(cudaFreeHost(h_pinned_A));
    CUDA_CHECK(cudaFree(d_A));
    return 0;
}
As you can see, the pinned2malloc and pinned2vector bandwidth is only about 1.5 GB/s, and it has become my bottleneck. Can anyone suggest a way to make this transfer faster? Thanks in advance.