more time taken by CUDA rather than reducing time

Deepakjain · November 18, 2010, 4:10pm

Hello All,

I am Deepak and I am very new to CUDA. I started with the first program given in the book CUDA BY EXAMPLE, and I was shocked to see strange results when I tried to measure the execution time.

If I build the program with the g++ compiler, it takes around 20 ms to run, but if I use CUDA, it takes 90 ms to run the program.

Following is my code in both versions.

include <stdio.h>

#include <time.h>

#include <math.h>

#include <stdlib.h>

#define N   10

/*
 * CPU reference: for every index k in [0, N) stores
 *     a[k]/(a[k]*a[k]) + b[k]/(b[k]*b[k])
 * into c[k].
 *
 * a, b : input arrays, read at indices 0..N-1
 * c    : output array, written at indices 0..N-1
 */
void add( float *a, float *b, float *c ) {
    for (int k = 0; k < N; ++k) {
        c[k] = (a[k]/(a[k]*a[k])) + (b[k]/(b[k]*b[k]));
    }
}

/*
 * Fills a and b with i/(i+1), zeroes c, times a single call to add() with
 * clock(), then prints the elapsed time followed by every element of c.
 */
int main( void ) {
    float a[N], b[N], c[N];
    int idx;

    /* Both inputs receive the same value i/(i+1); the output starts at zero. */
    for (idx = 0; idx < N; ++idx) {
        float ratio = (float) idx / (idx + 1);
        a[idx] = ratio;
        b[idx] = ratio;
        c[idx] = 0;
    }

    clock_t began = clock();
    add( a, b, c );
    clock_t ended = clock();

    /* Clock ticks divided by CLOCKS_PER_SEC: the printed value is in seconds. */
    float seconds = (float) ( ended - began ) / CLOCKS_PER_SEC;
    printf( "Time elapsed:  %f ", seconds);

    idx = 0;
    while (idx < N) {
        printf(" %f \n", c[idx]);
        ++idx;
    }

    return 0;
}

MY CUDA VERSION is

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <cuda.h>

#include "cutil.h"

#include <time.h>

#define TIMECUDA

//#define TIMECPU

#define N 10

/*
 * Element-wise kernel: c[i] = a[i]/(a[i]*a[i]) + b[i]/(b[i]*b[i]).
 *
 * Each thread handles the element at its flat global index
 * (blockIdx.x * blockDim.x + threadIdx.x), so the kernel gives the right
 * answer for any 1D launch that supplies at least N threads in total.
 * Indexing by blockIdx.x alone made every thread of a block redo the same
 * element and only worked when the grid had one block per element.
 *
 * a, b : device pointers to at least N floats (inputs)
 * c    : device pointer to at least N floats (output)
 */
__global__ void add( float *a, float *b, float *c ) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;   // flat global thread index

    // Threads beyond the end of the arrays have nothing to do.
    if (tid < N) {
        c[tid] = (a[tid]/(a[tid]*a[tid])) + (b[tid]/(b[tid]*b[tid]));
    }
}

/* Prints the failing step and the CUDA error text, then exits, when status is not cudaSuccess. */
static void checkCuda( cudaError_t status, const char *what ) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver for the add kernel.
 *
 * Fills a and b with i/(i+1), copies them to the device, launches add, and
 * copies the result back into c.
 *
 * With TIMECUDA defined, each of the three phases (host-to-device copies,
 * kernel, device-to-host copy) is timed with CUDA events;
 * cudaEventElapsedTime reports MILLISECONDS.
 * With TIMECPU defined, the kernel phase is timed with clock(); that value
 * is printed in SECONDS.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main( void ) {
    float a[N], b[N], c[N];
    float *temp_a,*temp_b,*temp_c;
    long i;
    /* The buffers hold floats, so size them with sizeof(float), not sizeof(int). */
    const size_t bytes = N*sizeof(float);

#ifdef TIMECUDA
    float elapsed_time_cpu_gpu,elapsed_time_add,elapsed_time_gpu_cpu;
    cudaEvent_t start,stop,startadd,stopadd,startback,stopback;
#endif

#ifdef TIMECPU
    float elapsed_time;
    clock_t timerStart, timerStop;
#endif

#ifdef TIMECUDA
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    checkCuda(cudaEventCreate(&startadd), "cudaEventCreate(startadd)");
    checkCuda(cudaEventCreate(&stopadd), "cudaEventCreate(stopadd)");
    checkCuda(cudaEventCreate(&startback), "cudaEventCreate(startback)");
    checkCuda(cudaEventCreate(&stopback), "cudaEventCreate(stopback)");
#endif

    checkCuda(cudaMalloc((void**)&temp_a,bytes), "cudaMalloc(temp_a)");
    checkCuda(cudaMalloc((void**)&temp_b,bytes), "cudaMalloc(temp_b)");
    checkCuda(cudaMalloc((void**)&temp_c,bytes), "cudaMalloc(temp_c)");

    /* NOTE: a[0] and b[0] are 0, so element 0 of the result is 0/0. */
    for (i=0; i<N; i++) {
        a[i] = (float) (i)/(i+1);
        b[i] = (float) (i)/(i+1);
        c[i] = 0;
    }

    /* ---- Phase 1: host -> device copies ---- */
#ifdef TIMECUDA
    checkCuda(cudaEventRecord(start,0), "cudaEventRecord(start)");
#endif

    checkCuda(cudaMemcpy(temp_a,a,bytes,cudaMemcpyHostToDevice), "cudaMemcpy(a -> temp_a)");
    checkCuda(cudaMemcpy(temp_b,b,bytes,cudaMemcpyHostToDevice), "cudaMemcpy(b -> temp_b)");

#ifdef TIMECUDA
    checkCuda(cudaEventRecord(stop,0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&elapsed_time_cpu_gpu,start,stop), "cudaEventElapsedTime(copy in)");
    printf("Time taken CUDA : %f \n",elapsed_time_cpu_gpu);   /* milliseconds */
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
#endif

    /* ---- Phase 2: kernel ---- */
#ifdef TIMECPU
    timerStart = clock();
#endif

#ifdef TIMECUDA
    checkCuda(cudaEventRecord(startadd,0), "cudaEventRecord(startadd)");
#endif

    /* N blocks of one thread each: exactly one thread per element.
       The previous <<<N,N>>> launch started N*N threads for N elements. */
    add<<<N,1>>>(temp_a,temp_b,temp_c);
    /* A launch does not return a status; fetch it explicitly. */
    checkCuda(cudaGetLastError(), "add kernel launch");

#ifdef TIMECUDA
    checkCuda(cudaEventRecord(stopadd,0), "cudaEventRecord(stopadd)");
#endif

#ifdef TIMECPU
    /* Kernel launches are asynchronous: wait for the kernel to finish before
       stopping the host clock, otherwise only the launch call is timed. */
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    timerStop = clock();
#endif

#ifdef TIMECUDA
    checkCuda(cudaEventSynchronize(stopadd), "cudaEventSynchronize(stopadd)");
    checkCuda(cudaEventElapsedTime(&elapsed_time_add,startadd,stopadd), "cudaEventElapsedTime(add)");
    printf("Time taken CUDA : %f \n",elapsed_time_add);       /* milliseconds */
    cudaEventDestroy(startadd);
    cudaEventDestroy(stopadd);
#endif

#ifdef TIMECPU
    /* stop - start (the operands were reversed, giving a negative time); seconds. */
    elapsed_time = (float) ( timerStop - timerStart ) / CLOCKS_PER_SEC;
    printf("Time taken CPU  : %f \n",elapsed_time);
#endif

    /* ---- Phase 3: device -> host copy ---- */
#ifdef TIMECUDA
    checkCuda(cudaEventRecord(startback,0), "cudaEventRecord(startback)");
#endif

    checkCuda(cudaMemcpy(c,temp_c,bytes,cudaMemcpyDeviceToHost), "cudaMemcpy(temp_c -> c)");

#ifdef TIMECUDA
    checkCuda(cudaEventRecord(stopback,0), "cudaEventRecord(stopback)");
    checkCuda(cudaEventSynchronize(stopback), "cudaEventSynchronize(stopback)");
    checkCuda(cudaEventElapsedTime(&elapsed_time_gpu_cpu,startback,stopback), "cudaEventElapsedTime(copy out)");
    printf("Time taken CUDA : %f ",elapsed_time_gpu_cpu);     /* milliseconds */
    cudaEventDestroy(startback);
    cudaEventDestroy(stopback);
#endif

/*        for (i=0; i<N; i++) {

            printf ("%f %f %f\n", a[i], b[i], c[i] );

        }

*/

    checkCuda(cudaFree(temp_a), "cudaFree(temp_a)");
    checkCuda(cudaFree(temp_b), "cudaFree(temp_b)");
    checkCuda(cudaFree(temp_c), "cudaFree(temp_c)");

    return 0;
}

My CPU version takes 20ms to execute.

In the CUDA version, copying data from host to device alone takes 26 ms, the add function takes 33 ms, and copying the result back from device to host takes 29 ms. So around 80 ms.

Can anyone tell me the reason why it is like this?

It's really strange. I thought CUDA was fast enough.

Please help me as I really want to work on CUDA.

Thanks

Deepakjain · November 18, 2010, 4:10pm

Hello All,

I am Deepak and I am very new to CUDA. I started with the first program given in the book CUDA BY EXAMPLE, and I was shocked to see strange results when I tried to measure the execution time.

If I build the program with the g++ compiler, it takes around 20 ms to run, but if I use CUDA, it takes 90 ms to run the program.

Following is my code in both versions.

include <stdio.h>

#include <time.h>

#include <math.h>

#include <stdlib.h>

#define N   10

/*
 * CPU reference: for every index k in [0, N) stores
 *     a[k]/(a[k]*a[k]) + b[k]/(b[k]*b[k])
 * into c[k].
 *
 * a, b : input arrays, read at indices 0..N-1
 * c    : output array, written at indices 0..N-1
 */
void add( float *a, float *b, float *c ) {
    for (int k = 0; k < N; ++k) {
        c[k] = (a[k]/(a[k]*a[k])) + (b[k]/(b[k]*b[k]));
    }
}

/*
 * Fills a and b with i/(i+1), zeroes c, times a single call to add() with
 * clock(), then prints the elapsed time followed by every element of c.
 */
int main( void ) {
    float a[N], b[N], c[N];
    int idx;

    /* Both inputs receive the same value i/(i+1); the output starts at zero. */
    for (idx = 0; idx < N; ++idx) {
        float ratio = (float) idx / (idx + 1);
        a[idx] = ratio;
        b[idx] = ratio;
        c[idx] = 0;
    }

    clock_t began = clock();
    add( a, b, c );
    clock_t ended = clock();

    /* Clock ticks divided by CLOCKS_PER_SEC: the printed value is in seconds. */
    float seconds = (float) ( ended - began ) / CLOCKS_PER_SEC;
    printf( "Time elapsed:  %f ", seconds);

    idx = 0;
    while (idx < N) {
        printf(" %f \n", c[idx]);
        ++idx;
    }

    return 0;
}

MY CUDA VERSION is

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <cuda.h>

#include "cutil.h"

#include <time.h>

#define TIMECUDA

//#define TIMECPU

#define N 10

/*
 * Element-wise kernel: c[i] = a[i]/(a[i]*a[i]) + b[i]/(b[i]*b[i]).
 *
 * Each thread handles the element at its flat global index
 * (blockIdx.x * blockDim.x + threadIdx.x), so the kernel gives the right
 * answer for any 1D launch that supplies at least N threads in total.
 * Indexing by blockIdx.x alone made every thread of a block redo the same
 * element and only worked when the grid had one block per element.
 *
 * a, b : device pointers to at least N floats (inputs)
 * c    : device pointer to at least N floats (output)
 */
__global__ void add( float *a, float *b, float *c ) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;   // flat global thread index

    // Threads beyond the end of the arrays have nothing to do.
    if (tid < N) {
        c[tid] = (a[tid]/(a[tid]*a[tid])) + (b[tid]/(b[tid]*b[tid]));
    }
}

/* Prints the failing step and the CUDA error text, then exits, when status is not cudaSuccess. */
static void checkCuda( cudaError_t status, const char *what ) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver for the add kernel.
 *
 * Fills a and b with i/(i+1), copies them to the device, launches add, and
 * copies the result back into c.
 *
 * With TIMECUDA defined, each of the three phases (host-to-device copies,
 * kernel, device-to-host copy) is timed with CUDA events;
 * cudaEventElapsedTime reports MILLISECONDS.
 * With TIMECPU defined, the kernel phase is timed with clock(); that value
 * is printed in SECONDS.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main( void ) {
    float a[N], b[N], c[N];
    float *temp_a,*temp_b,*temp_c;
    long i;
    /* The buffers hold floats, so size them with sizeof(float), not sizeof(int). */
    const size_t bytes = N*sizeof(float);

#ifdef TIMECUDA
    float elapsed_time_cpu_gpu,elapsed_time_add,elapsed_time_gpu_cpu;
    cudaEvent_t start,stop,startadd,stopadd,startback,stopback;
#endif

#ifdef TIMECPU
    float elapsed_time;
    clock_t timerStart, timerStop;
#endif

#ifdef TIMECUDA
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    checkCuda(cudaEventCreate(&startadd), "cudaEventCreate(startadd)");
    checkCuda(cudaEventCreate(&stopadd), "cudaEventCreate(stopadd)");
    checkCuda(cudaEventCreate(&startback), "cudaEventCreate(startback)");
    checkCuda(cudaEventCreate(&stopback), "cudaEventCreate(stopback)");
#endif

    checkCuda(cudaMalloc((void**)&temp_a,bytes), "cudaMalloc(temp_a)");
    checkCuda(cudaMalloc((void**)&temp_b,bytes), "cudaMalloc(temp_b)");
    checkCuda(cudaMalloc((void**)&temp_c,bytes), "cudaMalloc(temp_c)");

    /* NOTE: a[0] and b[0] are 0, so element 0 of the result is 0/0. */
    for (i=0; i<N; i++) {
        a[i] = (float) (i)/(i+1);
        b[i] = (float) (i)/(i+1);
        c[i] = 0;
    }

    /* ---- Phase 1: host -> device copies ---- */
#ifdef TIMECUDA
    checkCuda(cudaEventRecord(start,0), "cudaEventRecord(start)");
#endif

    checkCuda(cudaMemcpy(temp_a,a,bytes,cudaMemcpyHostToDevice), "cudaMemcpy(a -> temp_a)");
    checkCuda(cudaMemcpy(temp_b,b,bytes,cudaMemcpyHostToDevice), "cudaMemcpy(b -> temp_b)");

#ifdef TIMECUDA
    checkCuda(cudaEventRecord(stop,0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&elapsed_time_cpu_gpu,start,stop), "cudaEventElapsedTime(copy in)");
    printf("Time taken CUDA : %f \n",elapsed_time_cpu_gpu);   /* milliseconds */
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
#endif

    /* ---- Phase 2: kernel ---- */
#ifdef TIMECPU
    timerStart = clock();
#endif

#ifdef TIMECUDA
    checkCuda(cudaEventRecord(startadd,0), "cudaEventRecord(startadd)");
#endif

    /* N blocks of one thread each: exactly one thread per element.
       The previous <<<N,N>>> launch started N*N threads for N elements. */
    add<<<N,1>>>(temp_a,temp_b,temp_c);
    /* A launch does not return a status; fetch it explicitly. */
    checkCuda(cudaGetLastError(), "add kernel launch");

#ifdef TIMECUDA
    checkCuda(cudaEventRecord(stopadd,0), "cudaEventRecord(stopadd)");
#endif

#ifdef TIMECPU
    /* Kernel launches are asynchronous: wait for the kernel to finish before
       stopping the host clock, otherwise only the launch call is timed. */
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    timerStop = clock();
#endif

#ifdef TIMECUDA
    checkCuda(cudaEventSynchronize(stopadd), "cudaEventSynchronize(stopadd)");
    checkCuda(cudaEventElapsedTime(&elapsed_time_add,startadd,stopadd), "cudaEventElapsedTime(add)");
    printf("Time taken CUDA : %f \n",elapsed_time_add);       /* milliseconds */
    cudaEventDestroy(startadd);
    cudaEventDestroy(stopadd);
#endif

#ifdef TIMECPU
    /* stop - start (the operands were reversed, giving a negative time); seconds. */
    elapsed_time = (float) ( timerStop - timerStart ) / CLOCKS_PER_SEC;
    printf("Time taken CPU  : %f \n",elapsed_time);
#endif

    /* ---- Phase 3: device -> host copy ---- */
#ifdef TIMECUDA
    checkCuda(cudaEventRecord(startback,0), "cudaEventRecord(startback)");
#endif

    checkCuda(cudaMemcpy(c,temp_c,bytes,cudaMemcpyDeviceToHost), "cudaMemcpy(temp_c -> c)");

#ifdef TIMECUDA
    checkCuda(cudaEventRecord(stopback,0), "cudaEventRecord(stopback)");
    checkCuda(cudaEventSynchronize(stopback), "cudaEventSynchronize(stopback)");
    checkCuda(cudaEventElapsedTime(&elapsed_time_gpu_cpu,startback,stopback), "cudaEventElapsedTime(copy out)");
    printf("Time taken CUDA : %f ",elapsed_time_gpu_cpu);     /* milliseconds */
    cudaEventDestroy(startback);
    cudaEventDestroy(stopback);
#endif

/*        for (i=0; i<N; i++) {

            printf ("%f %f %f\n", a[i], b[i], c[i] );

        }

*/

    checkCuda(cudaFree(temp_a), "cudaFree(temp_a)");
    checkCuda(cudaFree(temp_b), "cudaFree(temp_b)");
    checkCuda(cudaFree(temp_c), "cudaFree(temp_c)");

    return 0;
}

My CPU version takes 20ms to execute.

In the CUDA version, copying data from host to device alone takes 26 ms, the add function takes 33 ms, and copying the result back from device to host takes 29 ms. So around 80 ms.

Can anyone tell me the reason why it is like this?

It's really strange. I thought CUDA was fast enough.

Please help me as I really want to work on CUDA.

Thanks

avidday · November 18, 2010, 6:38pm

Where to start… You are incorrectly using and interpreting the timing in both of those examples. Your CUDA version is nonsensical in several respects, and I would suggest that it is folly to draw conclusions on the relative performance of either piece of hardware when your code does such a trivial amount of total calculation.

avidday · November 18, 2010, 6:38pm

Where to start… You are incorrectly using and interpreting the timing in both of those examples. Your CUDA version is nonsensical in several respects, and I would suggest that it is folly to draw conclusions on the relative performance of either piece of hardware when your code does such a trivial amount of total calculation.

Deepakjain · November 18, 2010, 8:32pm

Thanks for the reply, but can you please be specific…

In the first version, I am trying to measure the time taken by the add function. In the CUDA version, I did the same. Sorry, but I am not able to understand your reply.

Please explain as I am new to CUDA.

Thanks

Deepakjain · November 18, 2010, 8:32pm

Thanks for the reply, but can you please be specific…

In the first version, I am trying to measure the time taken by the add function. In the CUDA version, I did the same. Sorry, but I am not able to understand your reply.

Please explain as I am new to CUDA.

Thanks

Deepakjain · November 18, 2010, 9:41pm

Or, if possible, can you tell me where I am making the mistake in the CUDA version?

Deepakjain · November 18, 2010, 9:41pm

Or, if possible, can you tell me where I am making the mistake in the CUDA version?

Topic		Replies	Views
Cuda program taking more time. CUDA Programming and Performance	15	7165	November 21, 2010
Is CUDA really that fast? CUDA Programming and Performance	17	11875	September 21, 2009
well how do I know if cuda runs on the gpu CUDA Programming and Performance	20	13822	July 9, 2008
need a help from employees or guys who know compiler well CUDA Programming and Performance	22	8788	December 18, 2008
Confused about GPU vs CPU speed in multiplication CUDA Programming and Performance	8	6625	February 19, 2009
Performance in basic algorithm Why isn't faster? CUDA Programming and Performance	4	1711	January 9, 2009
How to explain the performance difference? CUDA Programming and Performance	7	3569	March 26, 2008
Program without CUDA is faster CUDA Programming and Performance	6	10524	December 19, 2008
DATA tranfer from CPU to GPU CUDA Programming and Performance	6	4888	April 23, 2008
Simple proven (timed) example code where GPU beats CPU, anyone? CUDA Programming and Performance	6	1225	November 1, 2013

more time taken by CUDA rather than reducing time

Related topics