more time taken by CUDA rather than reducing time

Hello All,

I am Deepak and I am very new to CUDA. I started with the first program given in book CUDA BY EXAMPLE and i was shocked to see starnge results when i tried to measure the time of execution.

If I run the program using g++ compliler, it takes around 20ms to run the program but if I use CUDA, its takes 90 ms to run the program.

Following is my code in both versions.

include <stdio.h>

#include <time.h>

#include <math.h>

#include <stdlib.h>

#define N   10

void add( float *a, float *b, float *c ) {

    int tid = 0;

    while (tid < N) {

        c[tid] = (a[tid]/(a[tid]*a[tid])) + (b[tid]/(b[tid]*b[tid]));

        tid += 1;

    }

}

int main( void ) {

	float elapsed;

        float a[N], b[N], c[N]; 

        int i;

	clock_t timerStart, timerStop;

	for (i=0; i<N; i++) {

        	a[i] = (float) (i)/(i+1);

       		b[i] = (float) (i)/(i+1);

        	c[i] = 0;

    	}

    	timerStart = clock();

    	add( a, b, c );

	timerStop = clock();

	elapsed = (float) ( timerStop - timerStart ) / CLOCKS_PER_SEC;

	printf( "Time elapsed:  %f ", elapsed);

	for (i=0; i<N;i++)

	printf(" %f \n",c[i]);

	return 0;

}

MY CUDA VERSION is

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <cuda.h>

#include "cutil.h"

#include <time.h>

#define TIMECUDA

//#define TIMECPU

#define N 10

__global__ void add( float *a, float *b, float *c ) {

    int tid = blockIdx.x;   // TID is the block ID

    if (tid < N) {

       c[tid] = (a[tid]/(a[tid]*a[tid])) + (b[tid]/(b[tid]*b[tid]));

    }

}

int main( void ) {

	float a[N], b[N], c[N];

	float *temp_a,*temp_b,*temp_c;

    	long i;

#ifdef TIMECUDA

	float elapsed_time_cpu_gpu,elapsed_time_add,elapsed_time_gpu_cpu;

#else

#endif

#ifdef TIMECPU

	float elapsed_time;

#else

#endif

#ifdef TIMECPU

	clock_t timerStart, timerStop;

#else

#endif

#ifdef TIMECUDA

	cudaEvent_t start,stop,startadd,stopadd,startback,stopback;

	cudaEventCreate(&start);

	cudaEventCreate(&stop);	

	cudaEventCreate(&startadd);

	cudaEventCreate(&stopadd);	

	cudaEventCreate(&startback);

	cudaEventCreate(&stopback);	

#else

#endif

	cudaMalloc((void**)&temp_a,N*sizeof(int));

	cudaMalloc((void**)&temp_b,N*sizeof(int));

	cudaMalloc((void**)&temp_c,N*sizeof(int));

for (i=0; i<N; i++) {

           a[i] = (float) (i)/(i+1);

           b[i] = (float) (i)/(i+1);

           c[i] = 0;

       }

#ifdef TIMECUDA

	cudaEventRecord(start,0);

#else

#endif

 	cudaMemcpy(temp_a,a,N*sizeof(int),cudaMemcpyHostToDevice);

	cudaMemcpy(temp_b,b,N*sizeof(int),cudaMemcpyHostToDevice);

#ifdef TIMECUDA

	cudaEventRecord(stop,0);

	cudaEventSynchronize(stop);

	cudaEventElapsedTime(&elapsed_time_cpu_gpu,start,stop);

	printf("Time taken CUDA : %f \n",elapsed_time_cpu_gpu);

	cudaEventDestroy(start);

	cudaEventDestroy(stop); 

#else

#endif

#ifdef TIMECPU

	timerStart = clock();

#else

#endif

#ifdef TIMECUDA

	cudaEventRecord(startadd,0);

#else

#endif

        add<<<N,N>>>(temp_a,temp_b,temp_c);

#ifdef TIMECUDA

	cudaEventRecord(stopadd,0);

	cudaEventSynchronize(stopadd);

	cudaEventElapsedTime(&elapsed_time_add,startadd,stopadd);

	printf("Time taken CUDA : %f \n",elapsed_time_add);

	cudaEventDestroy(startadd);

	cudaEventDestroy(stopadd); 

#else

#endif

#ifdef TIMECPU

	timerStop = clock();

	elapsed_time = (float) ( timerStart - timerStop ) / CLOCKS_PER_SEC;

	printf("Time taken CPU  : %f \n",elapsed_time);

#else

#endif

#ifdef TIMECUDA

	cudaEventRecord(startback,0);

#else

#endif

	cudaMemcpy(c,temp_c, N*sizeof(int),cudaMemcpyDeviceToHost);

#ifdef TIMECUDA

	cudaEventRecord(stopback,0);

	cudaEventSynchronize(stopback);

	cudaEventElapsedTime(&elapsed_time_gpu_cpu,startback,stopback);

	printf("Time taken CUDA : %f ",elapsed_time_gpu_cpu);

	cudaEventDestroy(startback);

	cudaEventDestroy(stopback); 

#else

#endif

/*        for (i=0; i<N; i++) {

            printf ("%f %f %f\n", a[i], b[i], c[i] );

        }

*/

	cudaFree(temp_a);

	cudaFree(temp_b);

	cudaFree(temp_c);

 	return 0;

}

My CPU version takes 20ms to execute.

In CUDA version, Copying data from device to host only takes 26 ms , add function takes 33 ms and back from host to device , it takes 29 ms. So around 80 ms.

Can anyone tell me the reason why its like this?

Its really strange. I thought CUDA is fast enough.

Please help me as I really want to work on CUDA.

Thanks

Hello All,

I am Deepak and I am very new to CUDA. I started with the first program given in book CUDA BY EXAMPLE and i was shocked to see starnge results when i tried to measure the time of execution.

If I run the program using g++ compliler, it takes around 20ms to run the program but if I use CUDA, its takes 90 ms to run the program.

Following is my code in both versions.

include <stdio.h>

#include <time.h>

#include <math.h>

#include <stdlib.h>

#define N   10

void add( float *a, float *b, float *c ) {

    int tid = 0;

    while (tid < N) {

        c[tid] = (a[tid]/(a[tid]*a[tid])) + (b[tid]/(b[tid]*b[tid]));

        tid += 1;

    }

}

int main( void ) {

	float elapsed;

        float a[N], b[N], c[N]; 

        int i;

	clock_t timerStart, timerStop;

	for (i=0; i<N; i++) {

        	a[i] = (float) (i)/(i+1);

       		b[i] = (float) (i)/(i+1);

        	c[i] = 0;

    	}

    	timerStart = clock();

    	add( a, b, c );

	timerStop = clock();

	elapsed = (float) ( timerStop - timerStart ) / CLOCKS_PER_SEC;

	printf( "Time elapsed:  %f ", elapsed);

	for (i=0; i<N;i++)

	printf(" %f \n",c[i]);

	return 0;

}

MY CUDA VERSION is

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <cuda.h>

#include "cutil.h"

#include <time.h>

#define TIMECUDA

//#define TIMECPU

#define N 10

__global__ void add( float *a, float *b, float *c ) {

    int tid = blockIdx.x;   // TID is the block ID

    if (tid < N) {

       c[tid] = (a[tid]/(a[tid]*a[tid])) + (b[tid]/(b[tid]*b[tid]));

    }

}

int main( void ) {

	float a[N], b[N], c[N];

	float *temp_a,*temp_b,*temp_c;

    	long i;

#ifdef TIMECUDA

	float elapsed_time_cpu_gpu,elapsed_time_add,elapsed_time_gpu_cpu;

#else

#endif

#ifdef TIMECPU

	float elapsed_time;

#else

#endif

#ifdef TIMECPU

	clock_t timerStart, timerStop;

#else

#endif

#ifdef TIMECUDA

	cudaEvent_t start,stop,startadd,stopadd,startback,stopback;

	cudaEventCreate(&start);

	cudaEventCreate(&stop);	

	cudaEventCreate(&startadd);

	cudaEventCreate(&stopadd);	

	cudaEventCreate(&startback);

	cudaEventCreate(&stopback);	

#else

#endif

	cudaMalloc((void**)&temp_a,N*sizeof(int));

	cudaMalloc((void**)&temp_b,N*sizeof(int));

	cudaMalloc((void**)&temp_c,N*sizeof(int));

for (i=0; i<N; i++) {

           a[i] = (float) (i)/(i+1);

           b[i] = (float) (i)/(i+1);

           c[i] = 0;

       }

#ifdef TIMECUDA

	cudaEventRecord(start,0);

#else

#endif

 	cudaMemcpy(temp_a,a,N*sizeof(int),cudaMemcpyHostToDevice);

	cudaMemcpy(temp_b,b,N*sizeof(int),cudaMemcpyHostToDevice);

#ifdef TIMECUDA

	cudaEventRecord(stop,0);

	cudaEventSynchronize(stop);

	cudaEventElapsedTime(&elapsed_time_cpu_gpu,start,stop);

	printf("Time taken CUDA : %f \n",elapsed_time_cpu_gpu);

	cudaEventDestroy(start);

	cudaEventDestroy(stop); 

#else

#endif

#ifdef TIMECPU

	timerStart = clock();

#else

#endif

#ifdef TIMECUDA

	cudaEventRecord(startadd,0);

#else

#endif

        add<<<N,N>>>(temp_a,temp_b,temp_c);

#ifdef TIMECUDA

	cudaEventRecord(stopadd,0);

	cudaEventSynchronize(stopadd);

	cudaEventElapsedTime(&elapsed_time_add,startadd,stopadd);

	printf("Time taken CUDA : %f \n",elapsed_time_add);

	cudaEventDestroy(startadd);

	cudaEventDestroy(stopadd); 

#else

#endif

#ifdef TIMECPU

	timerStop = clock();

	elapsed_time = (float) ( timerStart - timerStop ) / CLOCKS_PER_SEC;

	printf("Time taken CPU  : %f \n",elapsed_time);

#else

#endif

#ifdef TIMECUDA

	cudaEventRecord(startback,0);

#else

#endif

	cudaMemcpy(c,temp_c, N*sizeof(int),cudaMemcpyDeviceToHost);

#ifdef TIMECUDA

	cudaEventRecord(stopback,0);

	cudaEventSynchronize(stopback);

	cudaEventElapsedTime(&elapsed_time_gpu_cpu,startback,stopback);

	printf("Time taken CUDA : %f ",elapsed_time_gpu_cpu);

	cudaEventDestroy(startback);

	cudaEventDestroy(stopback); 

#else

#endif

/*        for (i=0; i<N; i++) {

            printf ("%f %f %f\n", a[i], b[i], c[i] );

        }

*/

	cudaFree(temp_a);

	cudaFree(temp_b);

	cudaFree(temp_c);

 	return 0;

}

My CPU version takes 20ms to execute.

In CUDA version, Copying data from device to host only takes 26 ms , add function takes 33 ms and back from host to device , it takes 29 ms. So around 80 ms.

Can anyone tell me the reason why its like this?

Its really strange. I thought CUDA is fast enough.

Please help me as I really want to work on CUDA.

Thanks

Where to start… You are incorrectly using and interpreting the timing in both of those examples. Your cuda version is nonsensical in several repsects, and It would suggest that it is follow to draw conclusions on the relative performance of either piece of hardware when your code does such a trivial amount of total calculation.

Where to start… You are incorrectly using and interpreting the timing in both of those examples. Your cuda version is nonsensical in several repsects, and It would suggest that it is follow to draw conclusions on the relative performance of either piece of hardware when your code does such a trivial amount of total calculation.

Thanks for the reply, but can you please be specific…

In the first version, I am trying to extract the time taken by add function. In CUDA version, I did the same.Sorry but I am not able to understand the reply of yours.

Please explain as I am new to CUDA.

Thanks

Thanks for the reply, but can you please be specific…

In the first version, I am trying to extract the time taken by add function. In CUDA version, I did the same.Sorry but I am not able to understand the reply of yours.

Please explain as I am new to CUDA.

Thanks

Or if possible can you tell me where am I doing the mistake in CUDA version.

Or if possible can you tell me where am I doing the mistake in CUDA version.