Hi,
I have a problem when using shared memory.
#define SIZE 46
#define LOOP 100
// Timing microbenchmark: `sum` is a per-thread local (register) variable,
// `pro` is a 46-element array in shared memory.
// NOTE(review): `sum` starts at 0.0f and is only ever modified inside
// `if (sum >= RAND_MAX)`, so that condition can never become true; the
// inner loop body is dead code the compiler may eliminate entirely, which
// likely explains the large timing differences being measured.
__global__ void test0()
{
    // Fix: `pro` is indexed as pro[l] below, so it must be declared as an
    // array — the scalar declaration (brackets lost in the post) would not
    // compile against the accesses below.
    __shared__ float pro[SIZE];
    float sum = 0.0f;       // per-thread accumulator, kept in a register
    int l = 0, v = 0;
#ifdef DEBUG1
    // Optional one-time initialization of the shared array.
    for (l = 0; l < SIZE; l++)
        pro[l] = 0.0f;
#endif
    for (v = 1; v < LOOP; v++)
    {
#ifdef DEBUG2
        // Optional re-initialization on every outer iteration.
        for (l = 0; l < SIZE; l++)
            pro[l] = 0.0f;
#endif
        for (l = 0; l < SIZE; l++)
            if (sum >= RAND_MAX)   // never true: sum stays 0.0f
                sum += pro[l];
    }
#ifdef DEBUG3
    // Optional final reset of the shared array.
    for (l = 0; l < SIZE; l++)
        pro[l] = 0.0f;
#endif
}
// Same microbenchmark as test0, except `sum` also lives in shared memory
// instead of a register. `pro` is a 46-element shared-memory array.
// NOTE(review): as in test0, `sum` is initialized to 0.0f and only modified
// under `if (sum >= RAND_MAX)`, so the branch never fires; timings compare
// the cost of shared-memory vs. register access on effectively dead code.
__global__ void test1()
{
    // Fix: restore the array declaration — `pro` is indexed as pro[l]
    // below; the `[SIZE]` brackets were stripped in the post.
    __shared__ float pro[SIZE];
    __shared__ float sum;   // accumulator in shared memory (single thread)
    int l = 0, v = 0;
    sum = 0.0f;
#ifdef DEBUG1
    // Optional one-time initialization of the shared array.
    for (l = 0; l < SIZE; l++)
        pro[l] = 0.0f;
#endif
    for (v = 1; v < LOOP; v++)
    {
#ifdef DEBUG2
        // Optional re-initialization on every outer iteration.
        for (l = 0; l < SIZE; l++)
            pro[l] = 0.0f;
#endif
        for (l = 0; l < SIZE; l++)
            if (sum >= RAND_MAX)   // never true: sum stays 0.0f
                sum += pro[l];
    }
#ifdef DEBUG3
    // Optional final reset of the shared array.
    for (l = 0; l < SIZE; l++)
        pro[l] = 0.0f;
#endif
}
// Same microbenchmark as test1 (both `sum` and `pro` in shared memory),
// but with `pro` one element smaller (45 = SIZE-1 elements) and all loops
// bounded by SIZE-1, to probe the effect of the array size on timing.
// NOTE(review): the `if (sum >= RAND_MAX)` branch never fires here either.
__global__ void test2()
{
    __shared__ float sum;        // accumulator in shared memory
    // Fix: restore the array declaration — the author states pro has 45
    // elements and every loop below runs to SIZE-1; brackets were stripped.
    __shared__ float pro[SIZE - 1];
    int l = 0, v = 0;
    sum = 0.0f;
#ifdef DEBUG1
    // Optional one-time initialization of the shared array.
    for (l = 0; l < SIZE - 1; l++)
        pro[l] = 0.0f;
#endif
    for (v = 1; v < LOOP; v++)
    {
#ifdef DEBUG2
        // Optional re-initialization on every outer iteration.
        for (l = 0; l < SIZE - 1; l++)
            pro[l] = 0.0f;
#endif
        for (l = 0; l < SIZE - 1; l++)
            if (sum >= RAND_MAX)   // never true: sum stays 0.0f
                sum += pro[l];
    }
#ifdef DEBUG3
    // Optional final reset of the shared array.
    for (l = 0; l < SIZE - 1; l++)
        pro[l] = 0.0f;
#endif
}
The difference between these 3 functions is :
[*]test0 : sum is a local variable
[*]test1 : sum is in shared memory
[*]test2 : sum is in shared memory
[*]test0 : pro is an array in shared memory with 46 elements
[*]test1 : pro is an array in shared memory with 46 elements
[*]test2 : pro is an array in shared memory with 45 elements
I have run this code on a Tesla S1070 (CUDA 3.0) and a Tesla C2050 (CUDA 3.1). For 10000 calls of each kernel (one block with a single thread), I obtained the following timing results:
If I remove the “if(sum >= RAND_MAX)” test, I obtain the following timing results:
I don’t understand these huge time differences.
Thanks.
ps : the compilation command line is “nvcc -O3 -Xptxas -v -o pb_mp pb_mp.cu -lcuda -lm -lpthread -lrt -lcublas -lshrutil_x86_64 -lcutil_x86_64”