Efficiency problem with shared memory

Hi,

I have a problem when using shared memory.

#include <stdlib.h>   // for RAND_MAX

#define SIZE 46
#define LOOP 100

__global__ void test0()
{
	__shared__ float pro[SIZE];   // pro: shared array with 46 elements
	float sum = 0.0f;             // sum: local (register) variable
	int l = 0, v = 0;

#ifdef DEBUG1
	for (l = 0; l < SIZE; l++)
		pro[l] = 0.0f;
#endif

	for (v = 1; v < LOOP; v++)
	{
#ifdef DEBUG2
		for (l = 0; l < SIZE; l++)
			pro[l] = 0.0f;
#endif

		for (l = 0; l < SIZE; l++)
			if (sum >= RAND_MAX)
				sum += pro[l];
	}

#ifdef DEBUG3
	for (l = 0; l < SIZE; l++)
		pro[l] = 0.0f;
#endif
}

__global__ void test1()
{
	__shared__ float pro[SIZE];   // pro: shared array with 46 elements
	__shared__ float sum;         // sum: now in shared memory
	int l = 0, v = 0;

	sum = 0.0f;

#ifdef DEBUG1
	for (l = 0; l < SIZE; l++)
		pro[l] = 0.0f;
#endif

	for (v = 1; v < LOOP; v++)
	{
#ifdef DEBUG2
		for (l = 0; l < SIZE; l++)
			pro[l] = 0.0f;
#endif

		for (l = 0; l < SIZE; l++)
			if (sum >= RAND_MAX)
				sum += pro[l];
	}

#ifdef DEBUG3
	for (l = 0; l < SIZE; l++)
		pro[l] = 0.0f;
#endif
}

__global__ void test2()
{
	__shared__ float sum;           // sum: in shared memory
	__shared__ float pro[SIZE-1];   // pro: shared array with 45 elements
	int l = 0, v = 0;

	sum = 0.0f;

#ifdef DEBUG1
	for (l = 0; l < SIZE-1; l++)
		pro[l] = 0.0f;
#endif

	for (v = 1; v < LOOP; v++)
	{
#ifdef DEBUG2
		for (l = 0; l < SIZE-1; l++)
			pro[l] = 0.0f;
#endif

		for (l = 0; l < SIZE-1; l++)
			if (sum >= RAND_MAX)
				sum += pro[l];
	}

#ifdef DEBUG3
	for (l = 0; l < SIZE-1; l++)
		pro[l] = 0.0f;
#endif
}

The differences between these 3 kernels are:

    [*]test0 : sum is a local variable

    [*]test1 : sum is in shared memory

    [*]test2 : sum is in shared memory

    [*]test0 : pro is an array in shared memory with 46 elements

    [*]test1 : pro is an array in shared memory with 46 elements

    [*]test2 : pro is an array in shared memory with 45 elements

I have run this code on a Tesla S1070 (CUDA 3.0) and a Tesla C2050 (CUDA 3.1). For 10000 calls of each kernel (one block with one thread only), I obtained the following timings:

[timing results image]

If I remove the “if(sum >= RAND_MAX)” test, I obtain the following timings:

[timing results image]

I don’t understand these huge time differences.
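For reference, the host-side measurement loop is roughly the following (a simplified sketch: error checking is omitted, and NB_CALLS is just the name I use here for the 10000 launches):

#include <stdio.h>

#define NB_CALLS 10000   // number of launches per kernel

int main()
{
	cudaEvent_t start, stop;
	float ms = 0.0f;

	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaEventRecord(start, 0);
	for (int i = 0; i < NB_CALLS; i++)
		test0<<<1, 1>>>();   // one block, one thread; same loop for test1 and test2
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&ms, start, stop);
	printf("test0: %f ms\n", ms);

	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	return 0;
}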

Thanks.

ps : the compile line is “nvcc -O3 -Xptxas -v -o pb_mp pb_mp.cu -lcuda -lm -lpthread -lrt -lcublas -lshrutil_x86_64 -lcutil_x86_64”