Hello All,
I am Deepak and I am very new to CUDA. I started with the first program given in the book CUDA by Example, and I was shocked to see strange results when I tried to measure the execution time.
If I build the program with the g++ compiler, it takes around 20 ms to run, but the CUDA version takes around 90 ms.
Following is my code in both versions.
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#define N 10
/*
 * CPU reference: for every index k in [0, N) stores
 *     c[k] = a[k]/(a[k]*a[k]) + b[k]/(b[k]*b[k])
 * The inputs are read before the output element is written.
 */
void add( float *a, float *b, float *c ) {
    for (int k = 0; k < N; ++k) {
        const float x = a[k];
        const float y = b[k];
        c[k] = (x / (x * x)) + (y / (y * y));
    }
}
/*
 * Driver for the CPU version: fills a and b with k/(k+1), times a single
 * call to add() with clock(), then prints the elapsed seconds followed by
 * each element of c.
 *
 * Note: a[0] and b[0] are 0, so the first result is 0/0 + 0/0 (NaN).
 */
int main( void ) {
    float a[N], b[N], c[N];

    for (int k = 0; k < N; ++k) {
        const float ratio = (float) (k)/(k+1);
        a[k] = ratio;
        b[k] = ratio;
        c[k] = 0;
    }

    const clock_t began = clock();
    add( a, b, c );
    const clock_t ended = clock();

    const float elapsed = (float) ( ended - began ) / CLOCKS_PER_SEC;
    printf( "Time elapsed: %f ", elapsed);

    int k = 0;
    while (k < N) {
        printf(" %f \n",c[k]);
        ++k;
    }
    return 0;
}
My CUDA version is:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include "cutil.h"
#include <time.h>
#define TIMECUDA
//#define TIMECPU
#define N 10
/*
 * Element-wise kernel: for every index i in [0, N) stores
 *     c[i] = a[i]/(a[i]*a[i]) + b[i]/(b[i]*b[i])
 *
 * Launch layout: any 1D grid of 1D blocks. Each thread starts at its global
 * index and advances by the total number of launched threads, so every
 * element is computed exactly once regardless of the <<<blocks, threads>>>
 * configuration. (Indexing by blockIdx.x alone made every thread of a block
 * recompute and rewrite the same element when blockDim.x > 1.)
 *
 * a, b and c must be device pointers to at least N floats each.
 * Uses no shared memory.
 */
__global__ void add( float *a, float *b, float *c ) {
    const int stride = gridDim.x * blockDim.x;   // total threads in the grid
    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < N; tid += stride) {
        c[tid] = (a[tid]/(a[tid]*a[tid])) + (b[tid]/(b[tid]*b[tid]));
    }
}
/* Abort with file/line and the runtime's error string if a CUDA call fails.
   Without this, an early failure silently poisons every later call. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

/*
 * Driver for the CUDA version.
 *
 * Fills a and b with i/(i+1), copies them to the device, runs the add kernel
 * and copies the result back into c. With TIMECUDA defined, each of the three
 * phases (host->device copy, kernel, device->host copy) is timed with CUDA
 * events and printed in milliseconds. With TIMECPU defined, the kernel phase
 * is additionally timed with clock() and printed in seconds.
 *
 * NOTE(review): with N this small the measured times are presumably dominated
 * by fixed per-call overhead (and possibly one-time setup on the first kernel
 * launch) rather than by the arithmetic -- confirm with a much larger N or a
 * warm-up launch before drawing conclusions about GPU speed.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main( void ) {
    float a[N], b[N], c[N];
    float *temp_a, *temp_b, *temp_c;
    /* The buffers hold floats, so size them with sizeof(float), not sizeof(int). */
    const size_t bytes = N * sizeof(float);
    long i;

#ifdef TIMECUDA
    float elapsed_time_cpu_gpu, elapsed_time_add, elapsed_time_gpu_cpu;
    cudaEvent_t start, stop, startadd, stopadd, startback, stopback;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventCreate(&startadd));
    CUDA_CHECK(cudaEventCreate(&stopadd));
    CUDA_CHECK(cudaEventCreate(&startback));
    CUDA_CHECK(cudaEventCreate(&stopback));
#endif
#ifdef TIMECPU
    float elapsed_time;
    clock_t timerStart, timerStop;
#endif

    CUDA_CHECK(cudaMalloc((void**)&temp_a, bytes));
    CUDA_CHECK(cudaMalloc((void**)&temp_b, bytes));
    CUDA_CHECK(cudaMalloc((void**)&temp_c, bytes));

    for (i = 0; i < N; i++) {
        a[i] = (float) (i)/(i+1);
        b[i] = (float) (i)/(i+1);
        c[i] = 0;
    }

    /* ---- Phase 1: host -> device copies ---- */
#ifdef TIMECUDA
    CUDA_CHECK(cudaEventRecord(start, 0));
#endif
    CUDA_CHECK(cudaMemcpy(temp_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(temp_b, b, bytes, cudaMemcpyHostToDevice));
#ifdef TIMECUDA
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_cpu_gpu, start, stop));
    printf("Time taken CUDA : %f \n", elapsed_time_cpu_gpu);
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
#endif

    /* ---- Phase 2: kernel ---- */
#ifdef TIMECPU
    timerStart = clock();
#endif
#ifdef TIMECUDA
    CUDA_CHECK(cudaEventRecord(startadd, 0));
#endif
    /* One thread per block, N blocks: the kernel selects its element by block
       index, so extra threads per block would only repeat the same work. */
    add<<<N,1>>>(temp_a, temp_b, temp_c);
    CUDA_CHECK(cudaGetLastError());   /* launch-configuration errors surface here */
#ifdef TIMECUDA
    CUDA_CHECK(cudaEventRecord(stopadd, 0));
    CUDA_CHECK(cudaEventSynchronize(stopadd));
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_add, startadd, stopadd));
    printf("Time taken CUDA : %f \n", elapsed_time_add);
    CUDA_CHECK(cudaEventDestroy(startadd));
    CUDA_CHECK(cudaEventDestroy(stopadd));
#endif
#ifdef TIMECPU
    /* Kernel launches are asynchronous: wait for completion before stopping
       the host clock, otherwise only the launch call itself is measured. */
    CUDA_CHECK(cudaDeviceSynchronize());
    timerStop = clock();
    /* stop - start (the operands were previously reversed, giving a negative time). */
    elapsed_time = (float) ( timerStop - timerStart ) / CLOCKS_PER_SEC;
    printf("Time taken CPU : %f \n", elapsed_time);
#endif

    /* ---- Phase 3: device -> host copy ---- */
#ifdef TIMECUDA
    CUDA_CHECK(cudaEventRecord(startback, 0));
#endif
    CUDA_CHECK(cudaMemcpy(c, temp_c, bytes, cudaMemcpyDeviceToHost));
#ifdef TIMECUDA
    CUDA_CHECK(cudaEventRecord(stopback, 0));
    CUDA_CHECK(cudaEventSynchronize(stopback));
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_gpu_cpu, startback, stopback));
    printf("Time taken CUDA : %f \n", elapsed_time_gpu_cpu);
    CUDA_CHECK(cudaEventDestroy(startback));
    CUDA_CHECK(cudaEventDestroy(stopback));
#endif

    /* for (i=0; i<N; i++) {
    printf ("%f %f %f\n", a[i], b[i], c[i] );
    }
    */

    CUDA_CHECK(cudaFree(temp_a));
    CUDA_CHECK(cudaFree(temp_b));
    CUDA_CHECK(cudaFree(temp_c));
    return 0;
}
My CPU version takes 20ms to execute.
In the CUDA version, copying the data from host to device takes 26 ms, the add kernel takes 33 ms, and copying the result back from device to host takes 29 ms, so around 90 ms in total.
Can anyone tell me why this is?
It's really strange. I thought CUDA was supposed to be fast.
Please help me, as I really want to work with CUDA.
Thanks