Huge differences in transfertime to/from device Please help me to understand CUDA memory

Sandra_Ullrich · January 2, 2010, 3:52am

Hello Forum,

I’m a raw beginner in CUDA programing. When I run the attached program with N=25000000 elements the results are:

upload to device: 73.559486 (ms)

calc. @ GPU: 0.001824 (ms)

download from device: 69.704865 (ms)

BUT! When I run the program with N=30000000 (or more) elements the results are:

upload to device: 0.001888 (ms)

calc. @ GPU: 0.001760 (ms)

download from device: 0.001856 (ms)

I don’t understand the extrem differences in the upload/download time. From my point of view the upload to / dowload from device with N>30000000 should exceed the time with N=25000000 elements.

As well I can’t explain myself the behaviour of the program, when I run it without changing any parameters. The calculation time at the GPU and the upload respectively download time differs everytime I run the program.

Enclosed I attached the source code and the output of the device query.

Thank you very much for your help and your time.

Best regards,

Sandra

#include <cuda.h>

#include <stdio.h>

#include <cutil_inline.h>

// Kernel that executes on the CUDA device

__global__ void square_array(float *a, int N)

{

	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	if (idx<N) a[idx] = a[idx] * a[idx];

}

// main routine that executes on the host

int main(void)

{

	float *a_h, *a_d;  // Pointer to host & device arrays

	const int N = 25000000;  // Number of elements in arrays

	size_t size = N * sizeof(float);

	a_h = (float *)malloc(size);		// Allocate array on host

	cudaMalloc((void **) &a_d, size);   // Allocate array on device

	float time=0;

	int block_size = 128;

	int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);

	// Initialize host array

	cudaEvent_t start1, stop1;

	cudaEventCreate(&start1);

	cudaEventCreate(&stop1);

	cudaEventRecord(start1, 0);

	for (int i=0; i<N; i++) a_h[i] = (float)i;

	

	cudaEventRecord(stop1, 0);

	cudaEventSynchronize(stop1);

	cudaEventElapsedTime(&time, start1, stop1);

	printf( "init array @ host: %f (ms)\n", time);

	cudaEventDestroy(start1);

	cudaEventDestroy(stop1);

	// Copy array to CUDA device

		cudaEvent_t start2, stop2;

	cudaEventCreate(&start2);

	cudaEventCreate(&stop2);

	cudaEventRecord(start2, 0);

	cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

	cudaEventRecord(stop2, 0);

	cudaEventSynchronize(stop2);

	cudaEventElapsedTime(&time, start2, stop2);

	printf( "upload to device: %f (ms)\n", time);

	cudaEventDestroy(start2);

	cudaEventDestroy(stop2);

	// Do calculation on device:

	cudaEvent_t start3, stop3;

	cudaEventCreate(&start3);

	cudaEventCreate(&stop3);

	cudaEventRecord(start3, 0);

	square_array <<< n_blocks, block_size >>> (a_d, N);

	cudaEventRecord(stop3, 0);

	cudaEventSynchronize(stop3);

	cudaEventElapsedTime(&time, start3, stop3);

	printf( "calc. @ GPU: %f (ms)\n", time);

	cudaEventDestroy(start3);

	cudaEventDestroy(stop3);

	// Retrieve result from device and store it in host array

	cudaEvent_t start4, stop4;

	cudaEventCreate(&start4);

	cudaEventCreate(&stop4);

	cudaEventRecord(start4, 0);

	cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);

	cudaEventRecord(stop4, 0);

	cudaEventSynchronize(stop4);

	cudaEventElapsedTime(&time, start4, stop4);

	printf( "download from device: %f (ms)\n", time);

	cudaEventDestroy(start4);

	cudaEventDestroy(stop4);

	

	printf( "------------------------------------\n" );

	printf( "blocksize: %d // elements: %d\n", block_size, N);

	// Cleanup

	free(a_h); 

	cudaFree(a_d);

}

my_hardware.txt (1.44 KB)

SPWorley · January 2, 2010, 4:56am

Kernel launches are asynchronous, so your timing harness is timing launch queuing overhead, not execution times.

Put a cudaThreadSynchronize() before and after each kernel launch if you want to make sure you’re testing just the kernel’s execution time.