Processing pictures, low load efficiency?

Hello, I'm working on an image-processing assignment for university: I have to perform some operations on an image, such as noise reduction, etc. I already have a CPU version of the code, and I have to implement it in CUDA using shared memory; it should be faster than the CPU. My problem: with small pictures the CPU is slightly faster than the GPU, while with bigger pictures the GPU is faster than the CPU. I don't know whether I can do better or whether I just have to accept this. I have 3 kernels; I'm posting the code of only one kernel because the other kernels are similar.
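(For reference, part of the gap on small images may simply be launch and transfer overhead, so it is worth timing the kernel body separately from the cudaMemcpy calls. A minimal sketch with CUDA events; d_im and d_NR are hypothetical device buffers, and the grid mapping is a placeholder that must match however the kernel maps i and j:)

	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	dim3 block(BLOCK_SIZE, BLOCK_SIZE);
	dim3 grid((width  + BLOCK_SIZE - 1) / BLOCK_SIZE,
	          (height + BLOCK_SIZE - 1) / BLOCK_SIZE);

	cudaEventRecord(start);
	noise_reduction<<<grid, block>>>(d_im, d_NR, height, width);
	cudaEventRecord(stop);
	cudaEventSynchronize(stop);

	float ms = 0.0f;
	cudaEventElapsedTime(&ms, start, stop);   // kernel time only, no memcpy
	printf("noise_reduction: %.3f ms\n", ms);

	cudaEventDestroy(start);
	cudaEventDestroy(stop);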

CPU

void cannyCPU(float *im, float *image_out, 
	float *NR, float *G, float *phi, float *Gx, float *Gy, int *pedge,
	float level,
	int height, int width)
{
	int i, j;
	int ii, jj;
	float PI = 3.141593;

	float lowthres, hithres;	// PI and these thresholds serve the omitted gradient/hysteresis stages

	for(i=2; i<height-2; i++)
		for(j=2; j<width-2; j++)
		{
			// Noise reduction: 5x5 Gaussian blur (the weights sum to 159, hence the normalization)
			NR[i*width+j] =
				 (2.0*im[(i-2)*width+(j-2)] +  4.0*im[(i-2)*width+(j-1)] +  5.0*im[(i-2)*width+(j)] +  4.0*im[(i-2)*width+(j+1)] + 2.0*im[(i-2)*width+(j+2)]
				+ 4.0*im[(i-1)*width+(j-2)] +  9.0*im[(i-1)*width+(j-1)] + 12.0*im[(i-1)*width+(j)] +  9.0*im[(i-1)*width+(j+1)] + 4.0*im[(i-1)*width+(j+2)]
				+ 5.0*im[(i  )*width+(j-2)] + 12.0*im[(i  )*width+(j-1)] + 15.0*im[(i  )*width+(j)] + 12.0*im[(i  )*width+(j+1)] + 5.0*im[(i  )*width+(j+2)]
				+ 4.0*im[(i+1)*width+(j-2)] +  9.0*im[(i+1)*width+(j-1)] + 12.0*im[(i+1)*width+(j)] +  9.0*im[(i+1)*width+(j+1)] + 4.0*im[(i+1)*width+(j+2)]
				+ 2.0*im[(i+2)*width+(j-2)] +  4.0*im[(i+2)*width+(j-1)] +  5.0*im[(i+2)*width+(j)] +  4.0*im[(i+2)*width+(j+1)] + 2.0*im[(i+2)*width+(j+2)])
				/159.0;
		}
}

GPU

__global__ void noise_reduction(float* im, float* NR, int height, int width) {

	int i = blockIdx.x * blockDim.x + threadIdx.x;
	int j = blockIdx.y * blockDim.y + threadIdx.y;

	int tx = threadIdx.x;
	int ty = threadIdx.y;
	// BLOCK_SIZE x BLOCK_SIZE tile plus a 2-pixel border; the border cells
	// are never filled, so edge threads fall back to global memory below
	__shared__ float imsh [BLOCK_SIZE+4][BLOCK_SIZE+5];

	// load the tile into shared memory
	// note: i comes from the x dimension but is tested against height here,
	// while it indexes columns below; this only lines up for square images
	if((i >= 2 && i<(height-2)) && (j>=2 && j<(width-2))) {
		imsh[ty+2][tx+2] = im[j*width+i];
		__syncthreads();

		// interior threads: the whole 5x5 neighbourhood is in the shared tile
		if((tx>=2 && tx<BLOCK_SIZE-2) && (ty>=2 && ty<BLOCK_SIZE-2)){
			tx+=2;
			ty+=2;
			NR[j*width+i] =
				 (2.0*imsh[ty-2][tx-2] +  4.0*imsh[ty-2][tx-1] +  5.0*imsh[ty-2][tx] +  4.0*imsh[ty-2][tx+1] + 2.0*imsh[ty-2][tx+2]
				+ 4.0*imsh[ty-1][tx-2] +  9.0*imsh[ty-1][tx-1] + 12.0*imsh[ty-1][tx] +  9.0*imsh[ty-1][tx+1] + 4.0*imsh[ty-1][tx+2]
				+ 5.0*imsh[ty  ][tx-2] + 12.0*imsh[ty  ][tx-1] + 15.0*imsh[ty  ][tx] + 12.0*imsh[ty  ][tx+1] + 5.0*imsh[ty  ][tx+2]
				+ 4.0*imsh[ty+1][tx-2] +  9.0*imsh[ty+1][tx-1] + 12.0*imsh[ty+1][tx] +  9.0*imsh[ty+1][tx+1] + 4.0*imsh[ty+1][tx+2]
				+ 2.0*imsh[ty+2][tx-2] +  4.0*imsh[ty+2][tx-1] +  5.0*imsh[ty+2][tx] +  4.0*imsh[ty+2][tx+1] + 2.0*imsh[ty+2][tx+2])
				/159.0;
		}
		else {
			// edge-of-tile threads: part of the neighbourhood lies outside the
			// shared tile, so read the whole 5x5 window from global memory
			NR[j*width+i] =
				 (2.0*im[(j-2)*width+(i-2)] +  4.0*im[(j-2)*width+(i-1)] +  5.0*im[(j-2)*width+(i)] +  4.0*im[(j-2)*width+(i+1)] + 2.0*im[(j-2)*width+(i+2)]
				+ 4.0*im[(j-1)*width+(i-2)] +  9.0*im[(j-1)*width+(i-1)] + 12.0*im[(j-1)*width+(i)] +  9.0*im[(j-1)*width+(i+1)] + 4.0*im[(j-1)*width+(i+2)]
				+ 5.0*im[(j  )*width+(i-2)] + 12.0*im[(j  )*width+(i-1)] + 15.0*im[(j  )*width+(i)] + 12.0*im[(j  )*width+(i+1)] + 5.0*im[(j  )*width+(i+2)]
				+ 4.0*im[(j+1)*width+(i-2)] +  9.0*im[(j+1)*width+(i-1)] + 12.0*im[(j+1)*width+(i)] +  9.0*im[(j+1)*width+(i+1)] + 4.0*im[(j+1)*width+(i+2)]
				+ 2.0*im[(j+2)*width+(i-2)] +  4.0*im[(j+2)*width+(i-1)] +  5.0*im[(j+2)*width+(i)] +  4.0*im[(j+2)*width+(i+1)] + 2.0*im[(j+2)*width+(i+2)])
				/159.0;
		}

	}

}

Moreover, the NVIDIA Visual Profiler tells me that global load efficiency is only 25.6%, while store efficiency is 72.8%, and I can't understand why. Can I optimize this code?

Isn't it wrong to have __syncthreads() inside a branching if-else? Every thread of the block has to reach the barrier; calling it in divergent code is undefined behaviour.
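(For illustration, a minimal sketch of the safe pattern, with a hypothetical BLOCK_SIZE: only the load is guarded, and the barrier sits where every thread of the block reaches it.)

#define BLOCK_SIZE 16	// hypothetical block edge

__global__ void load_then_sync(const float* im, float* out, int height, int width)
{
	__shared__ float tile[BLOCK_SIZE][BLOCK_SIZE];
	int i = blockIdx.x * blockDim.x + threadIdx.x;	// column
	int j = blockIdx.y * blockDim.y + threadIdx.y;	// row

	if (j < height && i < width)			// only the load is guarded...
		tile[threadIdx.y][threadIdx.x] = im[j*width + i];

	__syncthreads();	// ...the barrier is reached by all threads

	if (j < height && i < width)
		out[j*width + i] = tile[threadIdx.y][threadIdx.x];
}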

Consider using texture memory as it has a cache optimized for 2D spatial locality.
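(A sketch of what that could look like with the texture-object API, assuming CUDA 5.0 or later. noise_reduction_tex is a made-up kernel name, and d_im_pitched / pitch are hypothetical names for a cudaMallocPitch'd copy of the image.)

// Device side: every neighbour fetch goes through the texture cache,
// which is optimized for 2D spatial locality.
__global__ void noise_reduction_tex(cudaTextureObject_t texIm, float* NR,
					int height, int width)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;	// column
	int j = blockIdx.y * blockDim.y + threadIdx.y;	// row

	if (i >= 2 && i < width-2 && j >= 2 && j < height-2) {
		const float w[5][5] = { {2, 4, 5, 4, 2},
					{4, 9,12, 9, 4},
					{5,12,15,12, 5},
					{4, 9,12, 9, 4},
					{2, 4, 5, 4, 2} };
		float sum = 0.0f;
		for (int dy = -2; dy <= 2; dy++)
			for (int dx = -2; dx <= 2; dx++)
				sum += w[dy+2][dx+2] * tex2D<float>(texIm, i+dx, j+dy);
		NR[j*width+i] = sum / 159.0f;
	}
}

// Host side: describe the image as a 2D pitched resource and create the object.
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = d_im_pitched;		// hypothetical pitched copy of im
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
resDesc.res.pitch2D.width = width;
resDesc.res.pitch2D.height = height;
resDesc.res.pitch2D.pitchInBytes = pitch;		// from cudaMallocPitch

cudaTextureDesc texDesc = {};
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.filterMode = cudaFilterModePoint;		// raw texels, no interpolation
texDesc.readMode = cudaReadModeElementType;

cudaTextureObject_t texIm = 0;
cudaCreateTextureObject(&texIm, &resDesc, &texDesc, NULL);
// ... launch, then: cudaDestroyTextureObject(texIm);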

Yes, I've changed the code and moved the synchronization out of the if-else branch. I could use texture memory, but does my load/store access pattern look correct? Can't I do better without texture memory?
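(One direction that stays with plain shared memory, as a sketch rather than a drop-in replacement: let the block cooperatively load the whole tile, halo included, with row-contiguous reads. Every thread then computes from shared memory, the divergent else branch disappears, and each warp's global loads coalesce. noise_reduction_smem is a made-up name; here i is the column and j the row.)

__global__ void noise_reduction_smem(const float* im, float* NR, int height, int width)
{
	__shared__ float imsh[BLOCK_SIZE+4][BLOCK_SIZE+4];

	int i = blockIdx.x * blockDim.x + threadIdx.x;	// output column
	int j = blockIdx.y * blockDim.y + threadIdx.y;	// output row

	// top-left corner of the tile (halo included) in the image
	int i0 = blockIdx.x * blockDim.x - 2;
	int j0 = blockIdx.y * blockDim.y - 2;

	// cooperative load: threads stride over the (BLOCK_SIZE+4)^2 tile, so each
	// row of reads is contiguous in global memory and coalesces per warp
	for (int y = threadIdx.y; y < BLOCK_SIZE+4; y += blockDim.y)
		for (int x = threadIdx.x; x < BLOCK_SIZE+4; x += blockDim.x) {
			int gi = min(max(i0 + x, 0), width  - 1);	// clamp at the image border;
			int gj = min(max(j0 + y, 0), height - 1);	// clamped cells feed no valid output
			imsh[y][x] = im[gj*width + gi];
		}

	__syncthreads();	// outside any divergent branch

	if (i >= 2 && i < width-2 && j >= 2 && j < height-2) {
		int tx = threadIdx.x + 2;	// this pixel's position in the tile
		int ty = threadIdx.y + 2;
		const float w[5][5] = { {2, 4, 5, 4, 2},
					{4, 9,12, 9, 4},
					{5,12,15,12, 5},
					{4, 9,12, 9, 4},
					{2, 4, 5, 4, 2} };
		float sum = 0.0f;
		for (int dy = -2; dy <= 2; dy++)
			for (int dx = -2; dx <= 2; dx++)
				sum += w[dy+2][dx+2] * imsh[ty+dy][tx+dx];
		NR[j*width+i] = sum / 159.0f;
	}
}

With BLOCK_SIZE = 16 each block issues 20x20 = 400 coalesced tile reads for 256 threads, roughly 1.6 per thread, instead of the 25 scattered global reads the old else branch did per edge thread; that kind of change should show up directly in the load-efficiency metric.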