speed up multiple image erosion algorithm (convolution over multiple images)

i’ve implemented two version of morphological image erosion with multiple images algorithm in two kernels.

The first use global memory instead the second use shared memory.

When i test the speed of two kernels the first that use Global memory is faster than the second that use shared memory.

I think that there is bank conflict or uncoalescending access.

but how i can avoid that? how i can modify the SM version for speed up?

GM version:

// no use shared memory
__global__ void erode_multiple_img(unsigned char * buffer_in,
								unsigned char * buffer_out,
								int w,int h ){

	int col = blockIdx.x * blockDim.x + threadIdx.x;
	int row = blockIdx.y * blockDim.y + threadIdx.y;
	int plane = blockIdx.z * blockDim.z +threadIdx.z;

	int index = (h * w * plane) + (row ) * w + col ;

	int min_value = 255;
	if( (col >= R) && (col < w-R) && (row >= R) && (row < h-R)){
		// pixels <= the boder-R ---> che if them must be eroded
		for(int dy=-STREL_H/2; dy<=STREL_H/2; dy++){
			for (int dx = -STREL_W/2 ; dx <= STREL_W/2; dx++) {
				min_value = min( buffer_in[index + (dy * w ) + (dx)], min_value);
			}
		}
		buffer_out[index]= min_value;
	}else{
		//pixels > border-R ---> (must be eroded)
		buffer_out[index] = 0;
	}

}

SM version:

__global__ void erode_multiple_img_SM_v2(unsigned char * buffer_in,
                            unsigned char * buffer_out,
                            int w,int h ){

// Data cache: threadIdx.x , threadIdx.y
__shared__ unsigned char data[TILE_W + STREL_SIZE-1 ][TILE_H + STREL_SIZE ];

// coord over img
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;

int plane = blockIdx.z * blockDim.z + threadIdx.z;

int plane_offset = plane*h*w;

int gLoc =plane_offset + row*w +col;

// each threads loads four values from global memory into shared mem
int x, y;   // image based coordinate

if((col<w)&&(row<h)) {

    //read px from (0:15,0:15) --> THE_TILE (to do this i use all Thread in thread Block)
    data[threadIdx.x][threadIdx.y] = buffer_in[gLoc];

    //read px from (0:15, 16:19) --> the Lower part of tile (to do this i use thread (0:15,12:15)
    if ( threadIdx.y > (TILE_H - STREL_SIZE))
            //if my bouds are in the image i add the patch of image that exceeds the TILE_HEIGHT else i add ficticious pixels (that not exists
            // because i exceed the height of image
          data[threadIdx.x][threadIdx.y + STREL_SIZE-1] = row + STREL_SIZE-1 < h ? buffer_in[plane_offset + (row + STREL_SIZE-1)*w + col] : 255;

    //read px from (16:19,0:15) --> the Lower part of tile (to do this i use thread (12:15,0:15)
    if (threadIdx.x > (TILE_W-STREL_SIZE))
        //if my bouds are in the image i add the patch of image that exceeds the TILE_WIDTH else i add ficticious pixels (that not exists
        // because i exceed the width of image
          data[threadIdx.x + STREL_SIZE-1][threadIdx.y] = col + STREL_SIZE-1 < w ? buffer_in[plane_offset + row*w+col + STREL_SIZE-1] : 255;

    //read px from (16:19,16:19) --> the Lower part of tile (to do this i use thread (12:15,12:15)
     if ((threadIdx.x > (TILE_W-STREL_SIZE)) && (threadIdx.y > (TILE_H-STREL_SIZE)))
         //if my bouds are in the image i add the patch of image that exceeds the TILE_WIDTH else i add ficticious pixels (that not exists
         // because i exceed the width of image
          data[threadIdx.x + STREL_SIZE-1][threadIdx.y + STREL_SIZE-1] = (row + STREL_SIZE-1<h && col + STREL_SIZE-1<w) ? buffer_in[plane_offset+(row + STREL_SIZE-1)*w + col + STREL_SIZE-1] : 255;

     //wait for all threads to finish read
     __syncthreads();

unsigned char min_value = 255;
      for(x=0;x<STREL_SIZE;x++){
          for(y=0;y<STREL_SIZE;y++){
              min_value = min( (data[threadIdx.x+x][threadIdx.y+y]) , min_value);
              }

          }
      buffer_out[gLoc]= min_value;
      }

}

this are my costant:

#define STREL_W 5
#define STREL_H 5

#define STREL_SIZE 5

#define TILE_W 16
#define TILE_H 16

#define R (STREL_SIZE/2)
#define KERNEL_RADIUS (STREL_SIZE/2)

#define BLOCK_W (TILE_W+(2*R))
#define BLOCK_H (TILE_H+(2*R))

here’s my kernel call:

dim3 block(16,16);
dim3 grid(img_width/(TILE_H),img_height/(TILE_W),nImg);

erode_multiple_img_SM_v2<<<grid,block>>>(dimage_src,dimage_dst,img_width,img_height);