Time for Splitting up Memory Transfers?

Why does this code perform like this:

float time_memcpy;

	

	cudaEvent_t start_event, stop_event;

	cudaEventCreateWithFlags(&start_event, cudaEventBlockingSync);

	cudaEventCreateWithFlags(&stop_event, cudaEventBlockingSync);

	

	int size = 128 * 128 * 128 * 128;

	

	float* h_a = new float;

	

	float* a;

	

	cudaMalloc((void**) &a, sizeof(float) * size);

	cudaEventRecord(start_event, 0);

	

	int split = 128;

	

	for (int i = 0; i < split; i++)

	{

		cudaMemcpyAsync(a + i * size / split, h_a + i * size / split, sizeof(float) * size / split, cudaMemcpyHostToDevice);

	}

	

	cudaEventRecord(stop_event, 0);

	cudaEventSynchronize(stop_event);

	cudaEventElapsedTime(&time_memcpy, start_event, stop_event);

	printf("memcopy:\t%.2f\n", time_memcpy);

	

	cudaFree(a);

	

	delete[] h_a;

split = 1: 357.66

split = 4: 359.68

split = 16: 182.08

split = 64: 119.02

split = 256: 117.83

split = 1028: 163.68

Any explanations for what is going on here? This is using a C2050.

Thanks,

Dan

Note using pinned memory:

float* h_a;

	cudaHostAlloc(&h_a, sizeof(float) * size, cudaHostAllocDefault);

I get these times:

split = 1: 178.06

split = 4: 178.37

split = 16: 89.11

split = 64: 89.22

split = 256: 89.54

split = 1028: 91.18

That does look a lot more consistent, but I'm still confused about why splitting these memory transfers into 16 or more separate asynchronous transfers halves the time taken, compared with doing the transfer in one big chunk.

In the programming guide, it mentions that host <-> device memory copies of a memory block of 64KB or less are asynchronous. With the split variable set to 16, we’re transferring exactly 64KB of memory per chunk, but I’m explicitly asking for the memory copies to be asynchronous anyway, so this shouldn’t make a difference?

Note using pinned memory:

float* h_a;

	cudaHostAlloc(&h_a, sizeof(float) * size, cudaHostAllocDefault);

I get these times:

split = 1: 178.06

split = 4: 178.37

split = 16: 89.11

split = 64: 89.22

split = 256: 89.54

split = 1028: 91.18

That does look a lot more consistent, but I'm still confused about why splitting these memory transfers into 16 or more separate asynchronous transfers halves the time taken, compared with doing the transfer in one big chunk.

In the programming guide, it mentions that host <-> device memory copies of a memory block of 64KB or less are asynchronous. With the split variable set to 16, we’re transferring exactly 64KB of memory per chunk, but I’m explicitly asking for the memory copies to be asynchronous anyway, so this shouldn’t make a difference?

You should be checking for returned error conditions. According to the CudaReferenceManual.pdf, cudaMemcpyAsync() fails if you pass it a host pointer that is not page-locked (“pinned”). The timings in the first test are probably bogus.

As for the second test, I don’t think synchronous or asynchronous should make a significant difference since you wait for a cudaEvent at the end.
It is quite curious that many small transfers are significantly faster…

You should be checking for returned error conditions. According to the CudaReferenceManual.pdf, cudaMemcpyAsync() fails if you pass it a host pointer that is not page-locked (“pinned”). The timings in the first test are probably bogus.

As for the second test, I don’t think synchronous or asynchronous should make a significant difference since you wait for a cudaEvent at the end.
It is quite curious that many small transfers are significantly faster…

You’re right, whether the transfer is synchronous or asynchronous makes negligible difference.

I figured it out, it was just a mistake in my code, I was using ints instead of longs, so there was insufficient precision to hold the offsets when splitting by larger values, meaning it wasn’t copying all the data.

Pleased to see there’s very little overhead in performing 1000 small memory transfers as opposed to 1 large memory transfer, which is what I was aiming to find out.

You’re right, whether the transfer is synchronous or asynchronous makes negligible difference.

I figured it out, it was just a mistake in my code, I was using ints instead of longs, so there was insufficient precision to hold the offsets when splitting by larger values, meaning it wasn’t copying all the data.

Pleased to see there’s very little overhead in performing 1000 small memory transfers as opposed to 1 large memory transfer, which is what I was aiming to find out.

Doing the same test with somewhat adapted code:

[codebox]#include <math.h>

#include <stdio.h>

#include <time.h>

#include “cuda_runtime_api.h”

#define DIGITS(a) ((int)log10((double)(a))+1)

#define LOG2SIZE 27 // < 32

#define MAXPARTS 4096

#define MAXSECSPERLOOP 3.0f

unsigned floats = (unsigned)pow(2.,LOG2SIZE);

int main(void)

{

cudaEvent_t start_event, stop_event;

clock_t cstart, cfinish, progstart, progend;

progstart=clock();

cudaEventCreateWithFlags( &start_event, cudaEventBlockingSync );

cudaEventCreateWithFlags( &stop_event, cudaEventBlockingSync );

float* a;

float* h_a = new float[floats];

if (!h_a) { perror("Host-allocation error"); return 1; }		// hardly likely..

if( cudaMalloc( (void**) &a, sizeof(*a) * floats) != cudaSuccess )

	{ printf("Need more memory on GPU to alloc %.2f MB..\n",(floats * sizeof(*h_a))/1024./1024.); return 2; }

for( unsigned n = 0 , parts = 1, chunk = floats / parts * sizeof(float) ; n < LOG2SIZE && parts <= MAXPARTS; n++ , parts*=2, chunk /= 2 )

//for( unsigned n = 0 , parts = MAXPARTS, chunk = floats / parts * sizeof(float) ; n < LOG2SIZE && parts > 0; n++ , parts/=2, chunk *= 2 )

{

	float time_memcpy;

	// warm-up

	//cudaMemcpyAsync( a , h_a , sizeof(float) * floats / MAXPARTS, cudaMemcpyHostToDevice );

	cudaThreadSynchronize();

	cudaEventRecord( start_event, 0 );

	cstart=clock();

	for ( unsigned i = 0; i < parts; i++ )

	{

		cudaMemcpyAsync( a + i * floats / parts, h_a + i * floats / parts, chunk, cudaMemcpyHostToDevice );

		//cudaMemcpy( a + i * floats / parts, h_a + i * floats / parts, chunk, cudaMemcpyHostToDevice );

	}

	cudaThreadSynchronize();

	cudaEventRecord( stop_event, 0 );

	cfinish=clock();

	cudaEventSynchronize( stop_event );

	cudaEventElapsedTime( &time_memcpy, start_event, stop_event );

	double duration = (double)(cfinish - cstart) / CLOCKS_PER_SEC * 1000.0;

	printf( "memcopy %u MB, %*u x %*u kb:\t%7.2f msec (%.0f)\n", (floats * sizeof(*a))/1024/1024, DIGITS(MAXPARTS), parts, DIGITS(floats*sizeof(floats)/1024), chunk/1024, time_memcpy, duration );

	if ( n>1 && time_memcpy > MAXSECSPERLOOP*1000.0) break;

}

progend=clock();

printf("running for %.0f milliseconds\n",(double)(progend - progstart) / CLOCKS_PER_SEC * 1000.0);

cudaFree( a );

delete[] h_a;

return 0;

}[/codebox]

I don’t see your point with the ints, no trouble I can see. Testing on visual C++ gave me the idea that copying in one block is faster in debug mode than in release. That is a strange result. I took care to have the same alignment.

I attached a graph.
memcpygraph.png

Doing the same test with somewhat adapted code:

[codebox]#include <math.h>

#include <stdio.h>

#include <time.h>

#include “cuda_runtime_api.h”

#define DIGITS(a) ((int)log10((double)(a))+1)

#define LOG2SIZE 27 // < 32

#define MAXPARTS 4096

#define MAXSECSPERLOOP 3.0f

unsigned floats = (unsigned)pow(2.,LOG2SIZE);

int main(void)

{

cudaEvent_t start_event, stop_event;

clock_t cstart, cfinish, progstart, progend;

progstart=clock();

cudaEventCreateWithFlags( &start_event, cudaEventBlockingSync );

cudaEventCreateWithFlags( &stop_event, cudaEventBlockingSync );

float* a;

float* h_a = new float[floats];

if (!h_a) { perror("Host-allocation error"); return 1; }		// hardly likely..

if( cudaMalloc( (void**) &a, sizeof(*a) * floats) != cudaSuccess )

	{ printf("Need more memory on GPU to alloc %.2f MB..\n",(floats * sizeof(*h_a))/1024./1024.); return 2; }

for( unsigned n = 0 , parts = 1, chunk = floats / parts * sizeof(float) ; n < LOG2SIZE && parts <= MAXPARTS; n++ , parts*=2, chunk /= 2 )

//for( unsigned n = 0 , parts = MAXPARTS, chunk = floats / parts * sizeof(float) ; n < LOG2SIZE && parts > 0; n++ , parts/=2, chunk *= 2 )

{

	float time_memcpy;

	// warm-up

	//cudaMemcpyAsync( a , h_a , sizeof(float) * floats / MAXPARTS, cudaMemcpyHostToDevice );

	cudaThreadSynchronize();

	cudaEventRecord( start_event, 0 );

	cstart=clock();

	for ( unsigned i = 0; i < parts; i++ )

	{

		cudaMemcpyAsync( a + i * floats / parts, h_a + i * floats / parts, chunk, cudaMemcpyHostToDevice );

		//cudaMemcpy( a + i * floats / parts, h_a + i * floats / parts, chunk, cudaMemcpyHostToDevice );

	}

	cudaThreadSynchronize();

	cudaEventRecord( stop_event, 0 );

	cfinish=clock();

	cudaEventSynchronize( stop_event );

	cudaEventElapsedTime( &time_memcpy, start_event, stop_event );

	double duration = (double)(cfinish - cstart) / CLOCKS_PER_SEC * 1000.0;

	printf( "memcopy %u MB, %*u x %*u kb:\t%7.2f msec (%.0f)\n", (floats * sizeof(*a))/1024/1024, DIGITS(MAXPARTS), parts, DIGITS(floats*sizeof(floats)/1024), chunk/1024, time_memcpy, duration );

	if ( n>1 && time_memcpy > MAXSECSPERLOOP*1000.0) break;

}

progend=clock();

printf("running for %.0f milliseconds\n",(double)(progend - progstart) / CLOCKS_PER_SEC * 1000.0);

cudaFree( a );

delete[] h_a;

return 0;

}[/codebox]

I don’t see your point with the ints, no trouble I can see. Testing on visual C++ gave me the idea that copying in one block is faster in debug mode than in release. That is a strange result. I took care to have the same alignment.

I attached a graph.