Hi gys,
I am trying to use atomicadd instruction on shared memory to speed up my code, but it is having the opposite effect.
Atomic instruction on global memory is as follows:
__global__ void calcCentroidKernel( int *gpu_labels, int *gpu_nRegions, int *gpu_regionOff, int *gpu_regionSize, int *gpu_centroid, int *i, int pitch)
{
int x = (blockIdx.x*blockDim.x)+threadIdx.x;
int y = (blockIdx.y*blockDim.y)+threadIdx.y;
int index = x+y*pitch;
int j = 0;
if( gpu_labels[index] == index)
{
atomicAdd( i, 1);
atomicAdd( gpu_regionOff + (*i), index);
}
for( j=0; j < *gpu_nRegions; j++)
{
if( gpu_labels[index] == gpu_regionOff[j])
break;
}
if( gpu_labels[index] != -1)
{
atomicAdd( gpu_centroid+(2*j), x);
atomicAdd( gpu_centroid+(2*j)+1, y);
atomicAdd( gpu_regionSize+j, 1);
}
}
Atomic instruction on shared memory is as follows:
__global__ void calcCentroidSharedKernel(int *gpu_labels,int *gpu_nRegions,int *gpu_regionOff,int *gpu_regionsSize,int *gpu_centroid,int *i,int pitch)
{
extern __shared__ int sMem[];
int x = (blockIdx.x*blockDim.x)+threadIdx.x;
int y = (blockIdx.y*blockDim.y)+threadIdx.y;
int index = x+y*pitch;
int j = 0;
int shSize_Offset = 0;
int shCentroid_Offset = *gpu_nRegions;
if( gpu_labels[index] == index)
{
atomicAdd( i, 1);
atomicAdd( gpu_regionOff + (*i), index);
}
for( j=0; j < *gpu_nRegions; j++)
{
if( gpu_labels[index] == gpu_regionOff[j])
break;
}
if(gpu_labels[index] != -1)
{
atomicAdd( sMem+shCentroid_Offset+(2*j), x);
atomicAdd( sMem+shCentroid_Offset+(2*j)+1, y);
atomicAdd( sMem+shSize_Offset+j, 1);
}
__syncthreads();
atomicAdd(gpu_centroid+(2*j), sMem[shCentroid_Offset+(2*j)]);
atomicAdd(gpu_centroid+(2*j)+1, sMem[shCentroid_Offset+(2*j)+1]);
atomicAdd(gpu_regionsSize+j, sMem[shSize_Offset+j]);
}
I am unable to understand why the timing of my code increased after using shared memory. Though it looks like shared memory usage is just a over head but considering that shared memory is used >200 times, the usage of shared memory should have provided considerable timing improvement…