Hello All,
I am Deepak and I am very new to CUDA. I started with the first program given in the book CUDA BY EXAMPLE, and I was shocked to see strange results when I tried to measure the execution time.
If I run the program using the g++ compiler, it takes around 20 ms to run, but if I use CUDA, it takes 90 ms to run the program.
Following is my code in both versions.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define N 10
/* Fills c[i] = a[i]/(a[i]*a[i]) + b[i]/(b[i]*b[i]) for the first N
 * elements — algebraically 1/a[i] + 1/b[i] when both inputs are nonzero.
 * NOTE(review): a zero element still produces 0/0 (NaN), exactly as in
 * the original; callers must supply nonzero inputs to get finite output. */
void add( float *a, float *b, float *c ) {
    for (int i = 0; i < N; i++) {
        c[i] = (a[i] / (a[i] * a[i])) + (b[i] / (b[i] * b[i]));
    }
}
/* Initializes two N-element vectors, times one call to add(), and prints
 * the elapsed time followed by the N results.
 * Fixes vs. original:
 *   - inputs are now (i+1)/(i+2) instead of i/(i+1): the old scheme made
 *     a[0] = b[0] = 0, so add() computed 0/0 and c[0] printed NaN;
 *   - the elapsed-time printf now ends with '\n' so it no longer runs
 *     into the first result line. */
int main( void ) {
    float elapsed;
    float a[N], b[N], c[N];
    int i;
    clock_t timerStart, timerStop;

    /* (i+1)/(i+2) is strictly positive for i >= 0, so every division in
     * add() is well-defined. */
    for (i = 0; i < N; i++) {
        a[i] = (float) (i + 1) / (i + 2);
        b[i] = (float) (i + 1) / (i + 2);
        c[i] = 0;
    }

    timerStart = clock();
    add( a, b, c );
    timerStop = clock();

    /* clock() resolution is typically a millisecond or coarser; a
     * 10-element loop finishes far below that, so this measurement is
     * dominated by timer granularity, not by add() itself. */
    elapsed = (float) ( timerStop - timerStart ) / CLOCKS_PER_SEC;
    printf( "Time elapsed: %f\n", elapsed );

    for (i = 0; i < N; i++)
        printf(" %f \n", c[i]);

    return 0;
}
My CUDA version is:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include "cutil.h"
#define N 10
/* Element-wise kernel: c[i] = a[i]/(a[i]*a[i]) + b[i]/(b[i]*b[i]) for i < N.
 * Uses the standard flat global index (blockIdx.x * blockDim.x + threadIdx.x)
 * so every launched thread maps to a distinct element.  The original used
 * only blockIdx.x; combined with the host's <<<N,N>>> launch, all N threads
 * of each block redundantly wrote the same c[blockIdx.x].  The bounds check
 * keeps any launch configuration safe when the grid overshoots N. */
__global__ void add( float *a, float *b, float *c ) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N) {
        c[tid] = (a[tid]/(a[tid]*a[tid])) + (b[tid]/(b[tid]*b[tid]));
    }
}
/* Error-check every CUDA runtime call; launches and async calls otherwise
 * fail silently and the error only surfaces much later. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

/* Times the H2D copies, the kernel, and the D2H copy separately with CUDA
 * events and prints each elapsed time in milliseconds.
 * Fixes vs. original:
 *   - buffers hold floats, so sizes use sizeof(float) (sizeof(int) only
 *     worked because both are 4 bytes on common platforms);
 *   - every CUDA call is error-checked, and cudaGetLastError() follows
 *     the launch;
 *   - a warm-up call pays the one-time context-initialization cost
 *     (tens of ms) BEFORE any timed section — this startup cost is why
 *     the original measurements showed ~28 ms per tiny memcpy and made
 *     the GPU run look slower than the 20 ms CPU run;
 *   - elapsed_time_gpu_cpu was computed but never printed; it is now. */
int main( void ) {
    float a[N], b[N], c[N];
    float *temp_a, *temp_b, *temp_c;
    long i;
    float elapsed_time_cpu_gpu, elapsed_time_add, elapsed_time_gpu_cpu;
    cudaEvent_t start, stop, startadd, stopadd, startback, stopback;

    /* cudaFree(0) is the conventional no-op warm-up that forces context
     * creation now instead of inside the first timed copy. */
    CUDA_CHECK(cudaFree(0));

    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventCreate(&startadd));
    CUDA_CHECK(cudaEventCreate(&stopadd));
    CUDA_CHECK(cudaEventCreate(&startback));
    CUDA_CHECK(cudaEventCreate(&stopback));

    CUDA_CHECK(cudaMalloc((void**)&temp_a, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&temp_b, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&temp_c, N * sizeof(float)));

    for (i = 0; i < N; i++) {
        /* NOTE(review): a[0] = b[0] = 0, so the kernel computes 0/0 and
         * c[0] is NaN — same as the original; visible if the result
         * printout below is re-enabled. */
        a[i] = (float) (i) / (i + 1);
        b[i] = (float) (i) / (i + 1);
        c[i] = 0;
    }

    /* --- host -> device copies --- */
    CUDA_CHECK(cudaEventRecord(start, 0));
    CUDA_CHECK(cudaMemcpy(temp_a, a, N * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(temp_b, b, N * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_cpu_gpu, start, stop));
    printf("Time taken CUDA : %f \n", elapsed_time_cpu_gpu);
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    /* --- kernel --- */
    /* <<<N,1>>>: one element per block.  The original <<<N,N>>> launched
     * N threads per block that (with the blockIdx-only kernel) all wrote
     * the same element redundantly. */
    CUDA_CHECK(cudaEventRecord(startadd, 0));
    add<<<N,1>>>(temp_a, temp_b, temp_c);
    CUDA_CHECK(cudaGetLastError());  /* catch bad launch configurations */
    CUDA_CHECK(cudaEventRecord(stopadd, 0));
    CUDA_CHECK(cudaEventSynchronize(stopadd));
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_add, startadd, stopadd));
    printf("Time taken CUDA : %f \n", elapsed_time_add);
    CUDA_CHECK(cudaEventDestroy(startadd));
    CUDA_CHECK(cudaEventDestroy(stopadd));

    /* --- device -> host copy of the result --- */
    CUDA_CHECK(cudaEventRecord(startback, 0));
    CUDA_CHECK(cudaMemcpy(c, temp_c, N * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stopback, 0));
    CUDA_CHECK(cudaEventSynchronize(stopback));
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_gpu_cpu, startback, stopback));
    printf("Time taken CUDA : %f \n", elapsed_time_gpu_cpu);
    CUDA_CHECK(cudaEventDestroy(startback));
    CUDA_CHECK(cudaEventDestroy(stopback));

    /* for (i=0; i<N; i++) {
    printf ("%f %f %f\n", a[i], b[i], c[i] );
    }
    */

    CUDA_CHECK(cudaFree(temp_a));
    CUDA_CHECK(cudaFree(temp_b));
    CUDA_CHECK(cudaFree(temp_c));
    return 0;
}
If I measure the timings, processing of the add function in CUDA takes 35 ms, while processing of add in the normal C program takes 20 ms.
Also, when I measure the individual timings of copying from device to host and from host to device, each takes around 28 ms, making a total of around 85 ms for executing the complete program.
Maybe I am making a mistake, but I am not able to figure it out as I am very new to CUDA.
Please help me in this regard.
Thanks
Deepak