Hi evryone…
I have a doubt in my program about using shared memory…iam using a single thread in a block to transfer the memory from global to shared …and then processing the values in shared memory and doing sm calculations…aft these calculations i have to transfer one shared memory array back to global memory which can be transferred to host and access those values…So my doubt is am i using the shared memory in the correct way or doing smthing wrong…and am i correctly calculating the thread and block ID’s …please help me in this…
Program Description : Here i hv total of 128x64 in global memory and iam accessing values of 16x16 in each block and giving each thread a value and accessing values around them…and if the avg of those values are greater than a value i have to make value as 1 in pxl_res array…and transfer that array back to global memory …
# define N 16
# define w 128
# define h 64
# define sbimg_sz 32
# define tres_val 45
__global__ void neighbours(unsigned char *dev_pxlr,unsigned char *dev_pxlg,unsigned char *dev_pxlb,bool *dev_res)
{
unsigned int bid = blockIdx.x + blockIdx.y;
__shared__ unsigned char pxlr[N][N];
__shared__ unsigned char pxlg[N][N];
__shared__ unsigned char pxlb[N][N];
__shared__ bool pxlres[N][N];
unsigned int nsimgx=w/N,
nsimgy=h/N;
if( threadIdx.x==0 && threadIdx.y==0)
{
int i=0,j=0;
unsigned int st_indx=(bid/nsimgx)*N*sbimg_sz+(bid%nsimgy)*N;
for (; i < N ; i++)
{
for (; j < N ; j++)
{
pxlr[i][j]=dev_pxlr[st_indx+j];
pxlg[i][j]=dev_pxlg[st_indx+j];
pxlb[i][j]=dev_pxlb[st_indx+j];
pxlres[i][j]=0;
}
st_indx=st_indx+sbimg_sz;
}
}
unsigned int tx=threadIdx.x+blockIdx.x*blockDim.x;
unsigned int ty=threadIdx.y+blockIdx.y*blockDim.y;
int nx,ny,gnx,gny,count,rsum,gsum,bsum,tot_sum;
rsum=gsum=bsum=count=tot_sum=0;
for(nx=-1;nx<=1;nx++)
{
for(ny=-1;ny<=1;ny++)
{
gnx=nx+tx;
gny=ny+ty;
if((gnx<N&&gnx>0)&&(gny<N&&gny>0))
{
rsum=rsum+abs(pxlr[tx][ty]-pxlr[gnx][gny]);
gsum=gsum+abs(pxlg[tx][ty]-pxlg[gnx][gny]);
bsum=bsum+abs(pxlb[tx][ty]-pxlb[gnx][gny]);
count++;
}
}
}
tot_sum=(rsum+gsum+bsum)/3;
if(tot_sum/(count-1)>tres_val)
pxlres[tx][ty]=1;
__syncthreads();
if( threadIdx.x==0 && threadIdx.y==0)
{
int i=0,j=0;
unsigned int st_indx=(bid/nsimgx)*N*sbimg_sz+(bid%nsimgy)*N;
for (i=0; i < N ; i++)
{
for (j=0; j < N ; j++)
{
dev_res[st_indx]=pxlres[i][j];
}
st_indx=st_indx+sbimg_sz;
}
}
}