[solved] strange cuda memcopy time

hi,

I’m benchmarking my GTX 750. I wrote some code to test PCI-e bandwidth, but I am getting unbelievable results.
The data size is about 68 MB, so 6 ms means roughly 11 GB/s, which is the normal bandwidth of PCI-e 3.0.
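(To spell out the arithmetic: the buffer is 10008 × 7092 = 70 976 736 bytes, and 70 976 736 bytes / 5.98 ms ≈ 11.9 GB/s.)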

My program’s output (in milliseconds):

time full1 : 5.98417
time full2 : 5.91689
time 10008 : 5.86158
time 1000 : 0.636335
time 100 : 0.132855
time 50 : 0.32672
time 10 : 4.69325
time 1 : 321.036

Here is my test code. StartCounter and GetCounter work correctly; there is no time-measurement error:

#include <windows.h>
#include <iostream>
#include <cuda_runtime.h>

typedef unsigned char uchar;

void StartCounter();
double GetCounter();

double PCFreq = 0.0;
__int64 CounterStart = 0;

void * cuda_test1(){
	uchar * dst;
	uchar * data;
	int const height = 10008;
	int const width = 7092;
	size_t size = sizeof(uchar) * height * width;
	size_t packet;
	int i, lines;

	data = (uchar*)malloc(size);
	cudaMalloc(&dst, size);
	int err = cudaHostRegister(data, size, cudaHostRegisterDefault);

	if (err != cudaSuccess)
		return NULL;

	StartCounter();
	cudaMemcpy((void*)dst, (void*)data, size, cudaMemcpyHostToDevice);
	std::cout << "time full1 : " << GetCounter() << std::endl;

	StartCounter();
	cudaMemcpy((void*)dst, (void*)data, size, cudaMemcpyHostToDevice);
	std::cout << "time full2 : " << GetCounter() << std::endl;
	
	//-----------
	lines = height;
	packet = (size_t)lines * width;

	StartCounter();
	for (i = 0; i < height; i += lines)
	{
		cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
	}
	std::cout << "time " << lines << " : " << GetCounter() << std::endl;

	//-----------
	lines = 1000;
	packet = (size_t)lines * width;

	StartCounter();
	for (i = 0; i < height; i += lines)
	{
		cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
	}
	std::cout << "time " << lines << " : " << GetCounter() << std::endl;

	//-----------
	lines = 100;
	packet = (size_t)lines * width;

	StartCounter();
	for (i = 0; i < height; i += lines)
	{
		cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
	}
	std::cout << "time " << lines << " : " << GetCounter() << std::endl;

	//-----------
	lines = 50;
	packet = (size_t)lines * width;

	StartCounter();
	for (i = 0; i < height; i += lines)
	{
		cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
	}
	std::cout << "time " << lines << " : " << GetCounter() << std::endl;

	//-----------
	lines = 10;
	packet = (size_t)lines * width;

	StartCounter();
	for (i = 0; i < height; i += lines)
	{
		cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
	}
	std::cout << "time " << lines << " : " << GetCounter() << std::endl;

	//-----------
	lines = 1;
	packet = (size_t)lines * width;

	StartCounter();
	for (i = 0; i < height; i += lines)
	{
		cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
	}
	std::cout << "time " << lines << " : " << GetCounter() << std::endl;

	return dst;
}

void StartCounter()
{
	LARGE_INTEGER li;
	if (!QueryPerformanceFrequency(&li))
		std::cout << "QueryPerformanceFrequency failed!\n";

	PCFreq = double(li.QuadPart) / 1000.0;

	QueryPerformanceCounter(&li);
	CounterStart = li.QuadPart;
}

double GetCounter()
{
	LARGE_INTEGER li;
	QueryPerformanceCounter(&li);
	return double(li.QuadPart - CounterStart) / PCFreq;
}

Since these memory copies are not asynchronous, I don’t understand why I get such short times.
Does anybody have an explanation for these results?

Thanks

For transfers from pageable host memory to device memory, a stream sync is
performed before the copy is initiated. The function will return once the pageable
buffer has been copied to the staging memory for DMA transfer to device memory,
but the DMA to final destination may not have completed.
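In other words, with a pageable source buffer the host timer can stop before the DMA has finished. A device synchronization before reading the timer removes that ambiguity; roughly, using your own StartCounter/GetCounter helpers:

StartCounter();
cudaMemcpy(dst, data, size, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();   // make sure the DMA has actually completed before stopping the timer
std::cout << "time full : " << GetCounter() << std::endl;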

Thank you for your answer,

I use pinned memory; according to http://developer.download.nvidia.com/compute/cuda/4_1/rel/toolkit/docs/online/sync_async.html, the function waits until the end of the copy.
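(As a side note, instead of calling cudaHostRegister on a malloc’d buffer, the buffer could also be allocated pinned from the start with cudaMallocHost; a minimal sketch, not what my test actually does:)

uchar * pinned = NULL;
if (cudaMallocHost((void**)&pinned, size) != cudaSuccess)   // page-locked allocation
	return NULL;
/* ... use it like data ... */
cudaFreeHost(pinned);   // release with cudaFreeHost, not free()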

I tried without pinned memory and added a cudaDeviceSynchronize() after each transfer; I get these times:
time full1 : 15.4636
time full2 : 13.6709
time 10008 : 14.2705
time 1000 : 1.59255
time 100 : 0.335558
time 50 : 0.600413
time 10 : 5.60186
time 1 : 300.684

It’s slower than with pinned memory, but it is still impossible to copy 68 MB in 0.12 ms (that would be 553 GB/s of bandwidth).

New code:

void * cuda_test1(){
	uchar * dst;
	uchar * data;
	int const height = 10008;
	int const width = 7092;
	size_t size = sizeof(uchar) * height * width;
	size_t packet;
	int i, lines;

	data = (uchar*)malloc(size);
	cudaMalloc(&dst, size);
	/*int err = cudaHostRegister(data, size, cudaHostRegisterDefault);

	if (err != cudaSuccess)
		return NULL;*/
	
	StartCounter();
	cudaMemcpy((void*)dst, (void*)data, size, cudaMemcpyHostToDevice);
	cudaDeviceSynchronize();
	std::cout << "time full1 : " << GetCounter() << std::endl;

	StartCounter();
	cudaMemcpy((void*)dst, (void*)data, size, cudaMemcpyHostToDevice);
	cudaDeviceSynchronize();
	std::cout << "time full2 : " << GetCounter() << std::endl;
	
	//-----------
	lines = height;
	packet = (size_t)lines * width;

	StartCounter();
	for (i = 0; i < height; i += lines)
	{
		cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
	}
	cudaDeviceSynchronize();
	std::cout << "time " << lines << " : " << GetCounter() << std::endl;

	//-----------
	lines = 1000;
	packet = (size_t)lines * width;

	StartCounter();
	for (i = 0; i < height; i += lines)
	{
		cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
	}
	cudaDeviceSynchronize();
	std::cout << "time " << lines << " : " << GetCounter() << std::endl;

	//-----------
	lines = 100;
	packet = (size_t)lines * width;

	StartCounter();
	for (i = 0; i < height; i += lines)
	{
		cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
	}
	cudaDeviceSynchronize();
	std::cout << "time " << lines << " : " << GetCounter() << std::endl;

	//-----------
	lines = 50;
	packet = (size_t)lines * width;

	StartCounter();
	for (i = 0; i < height; i += lines)
	{
		cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
	}
	cudaDeviceSynchronize();
	std::cout << "time " << lines << " : " << GetCounter() << std::endl;

	//-----------
	lines = 10;
	packet = (size_t)lines * width;

	StartCounter();
	for (i = 0; i < height; i += lines)
	{
		cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
	}
	cudaDeviceSynchronize();
	std::cout << "time " << lines << " : " << GetCounter() << std::endl;

	//-----------
	lines = 1;
	packet = (size_t)lines * width;

	StartCounter();
	for (i = 0; i < height; i += lines)
	{
		cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
	}
	cudaDeviceSynchronize();
	std::cout << "time " << lines << " : " << GetCounter() << std::endl;

	return dst;
}
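(As an extra sanity check, not something I have run yet, the transfers could also be timed with CUDA events instead of the host counter; a minimal sketch using the same dst, data and size:)

cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
cudaMemcpy(dst, data, size, cudaMemcpyHostToDevice);
cudaEventRecord(stop);
cudaEventSynchronize(stop);               // wait until the copy has really finished
cudaEventElapsedTime(&ms, start, stop);   // elapsed time in milliseconds
std::cout << "event time : " << ms << std::endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);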

lines = height;
packet = (size_t)lines * width;

StartCounter();
for (i = 0; i < height; i += lines)
{
	cudaMemcpy(dst + i*packet, data + i*packet, packet,

is the number of bytes to copy correct here…?

how do you get 0.12 ms? regardless of the overall time, which of the individual copies are wrong, given that you have the individual time measurements?

and perhaps first run it in the debugger, or check for errors, to make sure you don’t incur segmentation faults, etc.
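for example, something along these lines (illustration only, using your existing dst, data, i and packet), so a failing copy is reported instead of being silently ignored:

cudaError_t e = cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
if (e != cudaSuccess)
	std::cout << "copy " << i << " failed: " << cudaGetErrorString(e) << std::endl;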

Like little_jimmy said, your code isn’t copying the right data and is returning errors that you’re not checking, so you’re only measuring the first transfer.

Something more like this should work; the offset has to advance by i*width (not i*packet), and the min() clamps the last chunk because height isn’t an exact multiple of lines:

cudaError_t err = cudaMemcpy(dst + i*width, data + i*width, min((int)packet, (int)(size - i*width)), cudaMemcpyHostToDevice);

Thank you, now the times are more realistic :)

new code:

if ((i + lines) > height)
	packet = size - (size_t)(i * width);   // clamp the last chunk so the copy doesn't run past the end
err = cudaMemcpy(dst + i*width, data + i*width, packet, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
	return NULL;

It works, thanks a lot