Using shared memory

Hi everyone,

I have a doubt in my program about using shared memory. I am using a single thread in each block to transfer data from global to shared memory, then processing the values in shared memory and doing some calculations. After these calculations I have to transfer one shared-memory array back to global memory, so it can be copied to the host and its values accessed there. So my doubt is: am I using shared memory the correct way or doing something wrong, and am I calculating the thread and block IDs correctly? Please help me with this.

Program description: I have a total of 128x64 pixels in global memory, and each block accesses a 16x16 tile of them, giving each thread one value and accessing the values around it. If the average of those values is greater than a threshold, I have to set the corresponding value to 1 in the pxl_res array and transfer that array back to global memory.

#define N 16
#define w 128
#define h 64
#define sbimg_sz 32
#define tres_val 45

__global__ void neighbours(unsigned char *dev_pxlr, unsigned char *dev_pxlg, unsigned char *dev_pxlb, bool *dev_res)
{
    unsigned int bid = blockIdx.x + blockIdx.y;

    __shared__ unsigned char pxlr[N][N];
    __shared__ unsigned char pxlg[N][N];
    __shared__ unsigned char pxlb[N][N];
    __shared__ bool pxlres[N][N];

    unsigned int nsimgx = w / N,
                 nsimgy = h / N;

    if (threadIdx.x == 0 && threadIdx.y == 0)
    {
        int i = 0, j = 0;
        unsigned int st_indx = (bid / nsimgx) * N * sbimg_sz + (bid % nsimgy) * N;
        for (; i < N; i++)
        {
            for (; j < N; j++)
            {
                pxlr[i][j] = dev_pxlr[st_indx + j];
                pxlg[i][j] = dev_pxlg[st_indx + j];
                pxlb[i][j] = dev_pxlb[st_indx + j];
                pxlres[i][j] = 0;
            }
            st_indx = st_indx + sbimg_sz;
        }
    }

    unsigned int tx = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int ty = threadIdx.y + blockIdx.y * blockDim.y;

    int nx, ny, gnx, gny, count, rsum, gsum, bsum, tot_sum;
    rsum = gsum = bsum = count = tot_sum = 0;

    for (nx = -1; nx <= 1; nx++)
    {
        for (ny = -1; ny <= 1; ny++)
        {
            gnx = nx + tx;
            gny = ny + ty;
            if ((gnx < N && gnx > 0) && (gny < N && gny > 0))
            {
                rsum = rsum + abs(pxlr[tx][ty] - pxlr[gnx][gny]);
                gsum = gsum + abs(pxlg[tx][ty] - pxlg[gnx][gny]);
                bsum = bsum + abs(pxlb[tx][ty] - pxlb[gnx][gny]);
                count++;
            }
        }
    }

    tot_sum = (rsum + gsum + bsum) / 3;

    if (tot_sum / (count - 1) > tres_val)
        pxlres[tx][ty] = 1;

    __syncthreads();

    if (threadIdx.x == 0 && threadIdx.y == 0)
    {
        int i = 0, j = 0;
        unsigned int st_indx = (bid / nsimgx) * N * sbimg_sz + (bid % nsimgy) * N;
        for (i = 0; i < N; i++)
        {
            for (j = 0; j < N; j++)
            {
                dev_res[st_indx] = pxlres[i][j];
            }
            st_indx = st_indx + sbimg_sz;
        }
    }
}

Use at least all the threads of one warp to load the data from global to shared memory, so that the loads are fully coalesced. And you are missing a __syncthreads() between the initialization of the shared-memory contents and their processing.
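Roughly, instead of copying the whole 16x16 tile with thread (0,0), let every thread of the block fetch its own pixel. An untested sketch, assuming the kernel is launched with dim3 block(N, N) and dim3 grid(w/N, h/N) and using the array names from your code (the index arithmetic is illustrative, not your exact addressing):

    // Sketch: every thread of the 16x16 block loads one pixel of the tile, so
    // threads with consecutive threadIdx.x read consecutive bytes (coalesced).
    __global__ void neighbours(unsigned char *dev_pxlr, unsigned char *dev_pxlg,
                               unsigned char *dev_pxlb, bool *dev_res)
    {
        __shared__ unsigned char pxlr[N][N];
        __shared__ unsigned char pxlg[N][N];
        __shared__ unsigned char pxlb[N][N];
        __shared__ bool pxlres[N][N];

        // Global pixel coordinates of this thread (image stored row-major, w pixels wide).
        unsigned int gx   = blockIdx.x * N + threadIdx.x;
        unsigned int gy   = blockIdx.y * N + threadIdx.y;
        unsigned int gidx = gy * w + gx;

        // Cooperative load: no single-thread copy loop.
        pxlr[threadIdx.y][threadIdx.x] = dev_pxlr[gidx];
        pxlg[threadIdx.y][threadIdx.x] = dev_pxlg[gidx];
        pxlb[threadIdx.y][threadIdx.x] = dev_pxlb[gidx];
        pxlres[threadIdx.y][threadIdx.x] = 0;

        // Make sure the whole tile is in shared memory before any thread reads it.
        __syncthreads();

        // ... neighbourhood processing on pxlr/pxlg/pxlb as in your kernel,
        //     storing the flag in pxlres[threadIdx.y][threadIdx.x] ...

        // Each thread writes back its own result; this is coalesced too, and needs
        // no second sync since a thread only reads the element it wrote itself.
        dev_res[gidx] = pxlres[threadIdx.y][threadIdx.x];
    }

With this layout the thread and block IDs also fall out naturally: threadIdx gives the position inside the tile and blockIdx gives the position of the tile in the image.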

Thanks for the suggestion. Can you give me an example of how to use the threads in a warp? I'm a complete newbie to shared memory and CUDA.

E.g. the transpose example in the SDK.
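The core idea there is the same cooperative staging of a tile in shared memory. A stripped-down sketch of that pattern (not the actual SDK code, names and sizes are my own):

    #define TILE_DIM 16

    // Simplified transpose sketch in the spirit of the SDK sample:
    // each thread of a TILE_DIM x TILE_DIM block handles one element.
    __global__ void transpose(float *out, const float *in, int width, int height)
    {
        __shared__ float tile[TILE_DIM][TILE_DIM];

        unsigned int x = blockIdx.x * TILE_DIM + threadIdx.x;
        unsigned int y = blockIdx.y * TILE_DIM + threadIdx.y;

        // Coalesced read: consecutive threadIdx.x reads consecutive addresses.
        if (x < width && y < height)
            tile[threadIdx.y][threadIdx.x] = in[y * width + x];

        // All threads must finish loading before anyone reads the tile.
        __syncthreads();

        // Swap block coordinates and read the tile transposed, so the
        // global write is coalesced as well.
        x = blockIdx.y * TILE_DIM + threadIdx.x;
        y = blockIdx.x * TILE_DIM + threadIdx.y;
        if (x < height && y < width)
            out[y * height + x] = tile[threadIdx.x][threadIdx.y];
    }

Some versions of the sample also pad the shared array (TILE_DIM+1 columns) to avoid shared-memory bank conflicts, but the load / __syncthreads() / store pattern above is the part that answers your question.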