Why is cudaMemcpy(cudaMemcpyHostToHost) so slow on TX2?

My needs are very simple.
I have 32 MB of data computed by the GPU, and I want to transfer it to the host and store it either in heap memory (int* data = malloc()) or in a std::vector (data.resize()).
I decided to use pinned memory. Here is my code, tested on my TX2:

#include <stdio.h>
#include <cuda_runtime.h>

#include <iostream>
#include <memory>
#include <string>
#include <chrono>
#include <vector>
using namespace std;

#define MEMCOPY_ITERATIONS  1
#define DEFAULT_SIZE        (32 * 1000000)     // 32 MB

int main(int argc, char **argv) {
	printf("%s Starting...\n", argv[0]);

	// set up device
	int dev = 0;
	cudaDeviceProp deviceProp;
	cudaGetDeviceProperties(&deviceProp, dev);
	printf("Using Device %d: %s\n", dev, deviceProp.name);

	//cpu bandwidthTest
	int num = DEFAULT_SIZE / sizeof(float);
	vector<float> h_A_vec;
	vector<float> h_B_vec;
	h_A_vec.resize(num);
	h_B_vec.resize(num);
	float* h_A = (float*)malloc(DEFAULT_SIZE);
	float* h_B = (float*)malloc(DEFAULT_SIZE);
	

	//malloc2malloc
	{
		auto t1 = std::chrono::steady_clock::now();
		memcpy(h_A, h_B, DEFAULT_SIZE);
		auto t2 = std::chrono::steady_clock::now();
		double iElaps = std::chrono::duration<double, std::milli>(t2 - t1).count();
		printf("cpu memcpy malloc2malloc Time elapsed %f sec, Bandwidth =%.1f GB/s\n", iElaps/1e3, (DEFAULT_SIZE/1e9)/(iElaps/1e3));
	}

	//vector2vector
	{
		auto t1 = std::chrono::steady_clock::now();
		memcpy(h_A_vec.data(), h_B_vec.data(), DEFAULT_SIZE);
		auto t2 = std::chrono::steady_clock::now();
		double iElaps = std::chrono::duration<double, std::milli>(t2 - t1).count();
		printf("cpu memcpy vector2vecotr Time elapsed %f sec, Bandwidth =%.1f GB/s\n", iElaps/1e3, (DEFAULT_SIZE/1e9)/(iElaps/1e3));
	}

	//malloc2vector
	{
		auto t1 = std::chrono::steady_clock::now();
		memcpy(h_B_vec.data(), h_A, DEFAULT_SIZE);
		auto t2 = std::chrono::steady_clock::now();
		double iElaps = std::chrono::duration<double, std::milli>(t2 - t1).count();
		printf("cpu memcpy malloc2vector Time elapsed %f sec, Bandwidth =%.1f GB/s\n", iElaps/1e3, (DEFAULT_SIZE/1e9)/(iElaps/1e3));
	}

	float *d_A;
	float *h_pinned_A;
	cudaMalloc((void**)&d_A, DEFAULT_SIZE);
	cudaHostAlloc((void**)&h_pinned_A, DEFAULT_SIZE, cudaHostAllocDefault);	// allocate page-locked (pinned) memory on the host

	//pinned2vector
	{
		auto t1 = std::chrono::steady_clock::now();
		memcpy(h_A_vec.data(), h_pinned_A, DEFAULT_SIZE);
		auto t2 = std::chrono::steady_clock::now();
		double iElaps = std::chrono::duration<double, std::milli>(t2 - t1).count();
		printf("cpu memcpy pinned2vector Time elapsed %f sec, Bandwidth =%.1f GB/s\n", iElaps/1e3, (DEFAULT_SIZE/1e9)/(iElaps/1e3));
	}

	//device2vector
	{
		auto t1 = std::chrono::steady_clock::now();
		cudaMemcpy(h_B_vec.data(), d_A, DEFAULT_SIZE, cudaMemcpyDeviceToHost);
		cudaDeviceSynchronize();
		auto t2 = std::chrono::steady_clock::now();
		double iElaps = std::chrono::duration<double, std::milli>(t2 - t1).count();
		printf("cudaMemcpy device2vector Time elapsed %f sec, Bandwidth =%.1f GB/s\n", iElaps/1e3, (DEFAULT_SIZE/1e9)/(iElaps/1e3));
	}

	//device2malloc
	{
		auto t1 = std::chrono::steady_clock::now();
		cudaMemcpy(h_B, d_A, DEFAULT_SIZE, cudaMemcpyDeviceToHost);
		cudaDeviceSynchronize();
		auto t2 = std::chrono::steady_clock::now();
		double iElaps = std::chrono::duration<double, std::milli>(t2 - t1).count();
		printf("cudaMemcpy device2malloc Time elapsed %f sec, Bandwidth =%.1f GB/s\n", iElaps/1e3, (DEFAULT_SIZE/1e9)/(iElaps/1e3));
	}

	//device2pinned
	{
		auto t1 = std::chrono::steady_clock::now();
		cudaMemcpy(h_pinned_A, d_A, DEFAULT_SIZE, cudaMemcpyDeviceToHost);
		cudaDeviceSynchronize();
		auto t2 = std::chrono::steady_clock::now();
		double iElaps = std::chrono::duration<double, std::milli>(t2 - t1).count();
		printf("cudaMemcpy device2pinned Time elapsed %f sec, Bandwidth =%.1f GB/s\n", iElaps/1e3, (DEFAULT_SIZE/1e9)/(iElaps/1e3));
	}

	//device2pinned gpu time
	{
		// create CUDA events for timing
		cudaEvent_t e_start, e_stop;
		cudaEventCreate(&e_start);
		cudaEventCreate(&e_stop);

		// record the start event on stream 0
		cudaEventRecord(e_start, 0);

		cudaMemcpy(h_pinned_A, d_A, DEFAULT_SIZE, cudaMemcpyDeviceToHost);
		// record the stop event on stream 0
		cudaEventRecord(e_stop, 0);
		// wait until the stop event has completed
		cudaEventSynchronize(e_stop);
		// compute the elapsed time between the two events
		float elapsedTime;
		cudaEventElapsedTime(&elapsedTime, e_start, e_stop);
		printf("cudaMemcpy device2pinned gpu Time elapsed %f sec, Bandwidth =%.1f GB/s\n", elapsedTime / 1e3, (DEFAULT_SIZE / 1e9) / (elapsedTime / 1e3));

		cudaEventDestroy(e_start);
		cudaEventDestroy(e_stop);
	}

	//pinned2malloc
	{
		auto t1 = std::chrono::steady_clock::now();
		memcpy(h_B, h_pinned_A, DEFAULT_SIZE);
		auto t2 = std::chrono::steady_clock::now();
		double iElaps = std::chrono::duration<double, std::milli>(t2 - t1).count();
		printf("cpu memcpy pinned2malloc Time elapsed %f sec, Bandwidth =%.1f GB/s\n", iElaps / 1e3, (DEFAULT_SIZE / 1e9) / (iElaps / 1e3));
	}

	//pinned2malloc gpu time
	{
		// create CUDA events for timing
		cudaEvent_t e_start, e_stop;
		cudaEventCreate(&e_start);
		cudaEventCreate(&e_stop);

		// record the start event on stream 0
		cudaEventRecord(e_start, 0);

		cudaMemcpy(h_B, h_pinned_A, DEFAULT_SIZE, cudaMemcpyHostToHost);
		// record the stop event on stream 0
		cudaEventRecord(e_stop, 0);
		// wait until the stop event has completed
		cudaEventSynchronize(e_stop);
		// compute the elapsed time between the two events
		float elapsedTime;
		cudaEventElapsedTime(&elapsedTime, e_start, e_stop);
		printf("cudaMemcpy pinned2malloc gpu Time elapsed %f sec, Bandwidth =%.1f GB/s\n", elapsedTime / 1e3, (DEFAULT_SIZE / 1e9) / (elapsedTime / 1e3));

		cudaEventDestroy(e_start);
		cudaEventDestroy(e_stop);
	}

	//pinned2vector gpu time
	{

		// create CUDA events for timing
		cudaEvent_t e_start, e_stop;
		cudaEventCreate(&e_start);
		cudaEventCreate(&e_stop);

		// record the start event on stream 0
		cudaEventRecord(e_start, 0);

		cudaMemcpy(h_A_vec.data(), h_pinned_A, DEFAULT_SIZE, cudaMemcpyHostToHost);
		// record the stop event on stream 0
		cudaEventRecord(e_stop, 0);
		// wait until the stop event has completed
		cudaEventSynchronize(e_stop);
		// compute the elapsed time between the two events
		float elapsedTime;
		cudaEventElapsedTime(&elapsedTime, e_start, e_stop);
		printf("cudaMemcpy pinned2vector gpu Time elapsed %f sec, Bandwidth =%.1f GB/s\n", elapsedTime / 1e3, (DEFAULT_SIZE / 1e9) / (elapsedTime / 1e3));

		cudaEventDestroy(e_start);
		cudaEventDestroy(e_stop);
	
	}
	free(h_A);
	free(h_B);
	cudaFree(d_A);
	cudaFreeHost(h_pinned_A);

	return 0;
}

As you can see, the pinned2malloc and pinned2vector bandwidth is only about 1.5 GB/s, and it has become my bottleneck. Can anyone suggest a way to make this transfer faster? Thanks in advance.
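For reference, since the TX2's CPU and GPU share the same physical DRAM, a mapped ("zero-copy") pinned buffer lets a kernel write its results directly into host-visible memory, which can remove both the device-to-host cudaMemcpy and the extra host-to-host memcpy. Below is a minimal sketch of that idea (not code from this post): cudaHostAllocMapped and cudaHostGetDevicePointer are standard CUDA runtime calls, while the fill kernel and the sizes are only placeholders for the real computation.

#include <cstdio>
#include <cuda_runtime.h>

// placeholder kernel standing in for the real GPU computation
__global__ void fill(float *out, int n) {
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n) out[i] = 0.5f * i;
}

int main() {
	const int n = 32 * 1000000 / sizeof(float);

	cudaSetDeviceFlags(cudaDeviceMapHost);            // enable mapped pinned memory

	float *h_out = nullptr;                           // host-visible pinned buffer
	cudaHostAlloc((void**)&h_out, n * sizeof(float), cudaHostAllocMapped);

	float *d_out = nullptr;                           // device alias of the same buffer
	cudaHostGetDevicePointer((void**)&d_out, h_out, 0);

	fill<<<(n + 255) / 256, 256>>>(d_out, n);         // kernel writes straight into host-visible memory
	cudaDeviceSynchronize();                          // after this, h_out already holds the results

	printf("h_out[100] = %f\n", h_out[100]);
	cudaFreeHost(h_out);
	return 0;
}

The kernel's writes still go through the same DRAM either way, so this mainly saves the extra copies rather than raw memory bandwidth, and whether it helps depends on how the host code consumes the buffer afterwards.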

Hi,

In case you don't know: have you maximized the TX2 performance first?

$ sudo nvpmodel -m 0
$ sudo jetson_clocks

Thanks.
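As a side note (assuming the standard JetPack tools), you can confirm that the board is actually running in MAXN / max-clock mode before benchmarking:

$ sudo nvpmodel -q
$ sudo jetson_clocks --show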


Thanks a million. I followed your advice and it works!


I still have one doubt: has 2.2 GB/s reached the limit of the TX2?

Hi,

We don't have a reference bandwidth value for the TX2, but you can run the sample below in performance mode to see if you have already reached the limit.

/usr/local/cuda-10.2/samples/1_Utilities/bandwidthTest

Thanks.

The official bandwidthTest sample does not include a CPU memcpy bandwidth test.
"ainstec@ainstec-desktop:/usr/local/cuda-10.2/samples/1_Utilities/bandwidthTest$ ./bandwidthTest --memory=pinned --mode=quick --csv
[CUDA Bandwidth Test] - Starting…
Running on…

Device 0: NVIDIA Tegra X2
Quick Mode

bandwidthTest-H2D-Pinned, Bandwidth = 18.1 GB/s, Time = 1.68836 s, Size = 32000000 bytes, NumDevsUsed = 1
bandwidthTest-D2H-Pinned, Bandwidth = 17.9 GB/s, Time = 1.70049 s, Size = 32000000 bytes, NumDevsUsed = 1
bandwidthTest-D2D, Bandwidth = 31.2 GB/s, Time = 0.97758 s, Size = 32000000 bytes, NumDevsUsed = 1
Result = PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
"