time spent for operations in cuda

spiker · August 11, 2009, 8:57am

hello guys,

I have done a simple program to test the performance of a kernel with and without the shared variable.

the program is divided in 3 call to the same kernel and for every call it prints the result of the committed operation.

Even if the called kernel is the same the first call uses 0.036 usec, the second 0.012 usec and the sthird 0.011 (about)

If I change the kernel and use the “enhanced” one using the shared variable the results are the same.

I don’t know what’s the matter with this, can anyone kindly explain me what’s happening?

Thankyou

(code)

[codebox]#include <stdlib.h>

#include <stdio.h>

#include <time.h>

#include <string.h>

#include <cutil_inline.h>

const int NX=1024*4;

const int NY=1024*4;

const int N=NX*NY;

const int BLOCKX=16;

const int BLOCKY=16;

void global sum(float *a1,float *a2) {

uint i,j;

i=blockIdx.x * blockDim.x + threadIdx.x;

j=blockIdx.y * blockDim.y + threadIdx.y;

if (i<NX && j<NY) {

	a2[i+NX*j]+=a1[i+NX*j];

	if (a1[i+NX*j] + a2[i+NX*j] > 10.0f) a2[i+NX*j]*=-2.0f; else a2[i+NX*j]+=1.0f;

}

__syncthreads();

}

void global esum(float *a1,float *a2) {

uint i,j;

uint tx,ty;

__shared__ float b1[BLOCKX][BLOCKY];

__shared__ float b2[BLOCKX][BLOCKY];

tx=threadIdx.x;

ty=threadIdx.y;

i=blockIdx.x * blockDim.x + tx;

j=blockIdx.y * blockDim.y + ty;

if (i<NX && j<NY) {

	b1[tx][ty]=a1[i+NX*j];

	b2[tx][ty]=a2[i+NX*j];

	__syncthreads();

	b2[tx][ty]+=b1[tx][ty];

	if (b1[tx][ty] + b2[tx][ty] > 10.0f) b2[tx][ty]*=-2.0f; else b2[tx][ty]+=1.0f;

	a2[i+NX*j]=b2[tx][ty];

}

__syncthreads();

}

void global fill(float *a,float v) {

uint i,j;

i=blockIdx.x * blockDim.x + threadIdx.x;

j=blockIdx.y * blockDim.y + threadIdx.y;

if (i<NX && j<NY) {

	a[i+j*NX]=v+(float)(i)/3.76f-(float)(j);

}

__syncthreads();

}

int main(int argc,char* argv) {

float *v1,*v2;

uint hTimer=0;

cudaMalloc((void**)&v1,N*sizeof(float));

cudaMalloc((void**)&v2,N*sizeof(float));

dim3 dblock(BLOCKX,BLOCKY);

dim3 dgrid(NX/BLOCKX,NY/BLOCKY);

cutCreateTimer(&hTimer);

cutilSafeCall( cudaThreadSynchronize() );

cutilCheckError(cutResetTimer(hTimer));

cutilCheckError(cutStartTimer(hTimer));

fill<<<dgrid,dblock>>>(v1,18.5);

fill<<<dgrid,dblock>>>(v2,-88.4);

esum<<<dgrid,dblock>>>(v1,v2);

cutilCheckError(cutStopTimer(hTimer));

printf("GPU time: %f msecs.\n", cutGetTimerValue(hTimer));

cutilSafeCall( cudaThreadSynchronize() );

cutilCheckError(cutResetTimer(hTimer));

cutilCheckError(cutStartTimer(hTimer));

fill<<<dgrid,dblock>>>(v1,18.5);

fill<<<dgrid,dblock>>>(v2,-88.4);

esum<<<dgrid,dblock>>>(v1,v2);

cutilCheckError(cutStopTimer(hTimer));

printf("GPU time: %f msecs.\n", cutGetTimerValue(hTimer));

cutilSafeCall( cudaThreadSynchronize() );

cutilCheckError(cutResetTimer(hTimer));

cutilCheckError(cutStartTimer(hTimer));

fill<<<dgrid,dblock>>>(v1,18.5);

fill<<<dgrid,dblock>>>(v2,-88.4);

esum<<<dgrid,dblock>>>(v1,v2);

cutilCheckError(cutStopTimer(hTimer));

printf("GPU time: %f msecs.\n", cutGetTimerValue(hTimer));

cutilCheckError(cutDeleteTimer(hTimer));

return 0;

}

[/codebox]

avidday · August 11, 2009, 10:13am

You are only measuring the kernel launch times, not the execution times. Kernel launching is asynchronous. For timing purposes, you should add a cudaThreadsSynchronize() before you stop each timer.

spiker · August 11, 2009, 7:10pm

Ok, now it makes sense…

thankyou very much

still one thing… why the execution of the kernel “sum” is faster than the execution of “esum” by your opinion?

esum uses shared memory… it should run faster… am I wrong?

Thankyou