Hello Forum,
I’m a raw beginner in CUDA programing. When I run the attached program with N=25000000 elements the results are:
upload to device: 73.559486 (ms)
calc. @ GPU: 0.001824 (ms)
download from device: 69.704865 (ms)
BUT! When I run the program with N=30000000 (or more) elements the results are:
upload to device: 0.001888 (ms)
calc. @ GPU: 0.001760 (ms)
download from device: 0.001856 (ms)
I don’t understand the extrem differences in the upload/download time. From my point of view the upload to / dowload from device with N>30000000 should exceed the time with N=25000000 elements.
As well I can’t explain myself the behaviour of the program, when I run it without changing any parameters. The calculation time at the GPU and the upload respectively download time differs everytime I run the program.
Enclosed I attached the source code and the output of the device query.
Thank you very much for your help and your time.
Best regards,
Sandra
#include <cuda.h>
#include <stdio.h>
#include <cutil_inline.h>
// Kernel that executes on the CUDA device
__global__ void square_array(float *a, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<N) a[idx] = a[idx] * a[idx];
}
// main routine that executes on the host
int main(void)
{
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 25000000; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
float time=0;
int block_size = 128;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
// Initialize host array
cudaEvent_t start1, stop1;
cudaEventCreate(&start1);
cudaEventCreate(&stop1);
cudaEventRecord(start1, 0);
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaEventRecord(stop1, 0);
cudaEventSynchronize(stop1);
cudaEventElapsedTime(&time, start1, stop1);
printf( "init array @ host: %f (ms)\n", time);
cudaEventDestroy(start1);
cudaEventDestroy(stop1);
// Copy array to CUDA device
cudaEvent_t start2, stop2;
cudaEventCreate(&start2);
cudaEventCreate(&stop2);
cudaEventRecord(start2, 0);
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
cudaEventRecord(stop2, 0);
cudaEventSynchronize(stop2);
cudaEventElapsedTime(&time, start2, stop2);
printf( "upload to device: %f (ms)\n", time);
cudaEventDestroy(start2);
cudaEventDestroy(stop2);
// Do calculation on device:
cudaEvent_t start3, stop3;
cudaEventCreate(&start3);
cudaEventCreate(&stop3);
cudaEventRecord(start3, 0);
square_array <<< n_blocks, block_size >>> (a_d, N);
cudaEventRecord(stop3, 0);
cudaEventSynchronize(stop3);
cudaEventElapsedTime(&time, start3, stop3);
printf( "calc. @ GPU: %f (ms)\n", time);
cudaEventDestroy(start3);
cudaEventDestroy(stop3);
// Retrieve result from device and store it in host array
cudaEvent_t start4, stop4;
cudaEventCreate(&start4);
cudaEventCreate(&stop4);
cudaEventRecord(start4, 0);
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaEventRecord(stop4, 0);
cudaEventSynchronize(stop4);
cudaEventElapsedTime(&time, start4, stop4);
printf( "download from device: %f (ms)\n", time);
cudaEventDestroy(start4);
cudaEventDestroy(stop4);
printf( "------------------------------------\n" );
printf( "blocksize: %d // elements: %d\n", block_size, N);
// Cleanup
free(a_h);
cudaFree(a_d);
}
my_hardware.txt (1.44 KB)