Using threads to copy data from global to shared memory

I have a 512x512 image and I want to apply an effect to it. To apply the effect to a given region, I need a part of the image that is larger than the region itself in order to do the calculations. So I want to use the thread indices to copy the data from global memory into shared memory, do the calculation using the shared memory, and then copy back only the part I am interested in.

This is my kernel

/*
 * Stages one 16x16 tile of a 3-byte-per-pixel image into shared memory.
 *
 * Launch layout: 2D grid of 2D blocks. Threads whose threadIdx.x or
 * threadIdx.y is 16 or more take no part in the load, so a block should be
 * at most 16x16. The row stride of the image is taken to be
 * gridDim.x * blockDim.x, so the grid must span exactly one image row in x
 * (for example 32 blocks of 16 threads for a 512-pixel-wide image).
 * Shared memory: 16 * 16 * 3 = 768 bytes, statically allocated.
 *
 * image  device pointer to the input, read as 3 consecutive bytes per pixel
 * out    device pointer for the result; not written by the code shown here
 * n, m   image dimensions; only their product is used, as the pixel count
 *        (NOTE(review): which one is the width is not visible here - confirm)
 *
 * Each thread copies only its own pixel, so the tile holds exactly the
 * block's 16x16 pixels. No surrounding border (halo) is loaded: pixels
 * just outside the block are NOT available in shared memory after this load.
 */
__global__ void filter(unsigned char *image, unsigned char *out, int n, int m)
{
    const int TILE = 16;      // tile edge length in pixels
    const int CHANNELS = 3;   // bytes per pixel

    // Global pixel coordinates of this thread.
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;
    // Linear pixel index; the row stride is the total number of threads in x.
    int index = i + j * blockDim.x * gridDim.x;

    __shared__ unsigned char shared[TILE][TILE * CHANNELS];

    // Guard both tile coordinates individually: bounding only the flattened
    // thread id would still let a non-16x16 block index past the tile.
    if (threadIdx.x < TILE && threadIdx.y < TILE && index < n * m)
    {
        // Widen before scaling so the byte offset cannot overflow int.
        const size_t src = (size_t)index * CHANNELS;
        const int dst = threadIdx.y * CHANNELS;

        // Tile is addressed as [threadIdx.x][threadIdx.y * 3 + channel].
        shared[threadIdx.x][dst + 0] = image[src + 0];
        shared[threadIdx.x][dst + 1] = image[src + 1];
        shared[threadIdx.x][dst + 2] = image[src + 2];
    }

    // Every thread of the block must reach this barrier (hence it sits
    // outside the guard) before any thread reads a neighbour's tile entry.
    __syncthreads();
}



and I am using it like this:

// Host-side driver: allocate device buffers, upload the input, run the
// kernel, and download the result. Every buffer is n*m*3 bytes.
// NOTE(review): none of the CUDA return codes below are checked - a failed
// allocation or copy would go unnoticed; confirm error handling with the caller.
cudaMalloc( (void**)&dev_image, n*m*3);
	cudaMalloc( (void**)&dev_out, n*m*3);
	// Upload the host image into the device input buffer.
	cudaMemcpy( dev_image, image, n*m*3, cudaMemcpyHostToDevice);
	// 16x16 threads per block and 32x32 blocks: 32 * 16 = 512 threads along
	// each grid dimension. The grid size is hard-coded rather than derived
	// from n and m.
	dim3 threads( 16, 16 );
	dim3 blocks( 32, 32 );
        // NOTE(review): a kernel launch returns no status; launch errors are
        // only visible through cudaGetLastError(), which is not called here.
        filter<<<blocks, threads>>>(dev_image, dev_out, n, m);
        // Download the device output buffer into the host array `out`.
        // NOTE(review): no cudaFree for dev_image / dev_out is visible in this
        // fragment - presumably released later; verify.
        cudaMemcpy( out, dev_out, n*m*3, cudaMemcpyDeviceToHost );

Does this copy the intended region or not?