convolution using shared memory slowdown instead of speedup...

i’m implementing a convolution kernel for a filter with length 49.

this is my first approach without using shared memory:

__constant__ float g_hTrans[H_TRANS_LEN];

__global__ void dirFilterTrans_Kernel(float* srcImg, float* dstImg, dim2dp_t dim)

{

	unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;

	unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;

	float sum = 0;

	float value = 0;

	for (int i=-H_TRANS_RADIUS; i<=H_TRANS_RADIUS; i++)

	{

		if ((((int)idx+i)<0) || (((int)idx+i)>=dim.wp))

		{

			value = 0;

		}

		else

		{

			value = srcImg[idy*dim.wp + idx + i];

		}

		sum += value * g_hTrans[H_TRANS_RADIUS + i];

	}

	dstImg[idy*dim.wp + idx] = sum;

}

where H_TRANS_LEN is 49, H_TRANS_RADIUS is 24 and the block size is 16x16

then i wanted to improve this code using shared memory. since (BLOCK_SIZE + 2*H_TRANS_RADIUS) is 64, every thread can load 4 values to shared memory first and then compute one output value:

__global__ void dirFilterTrans_Kernel(float* srcImg, float* dstImg, dim2dp_t dim)

{

	__shared__ float data[BLOCK_SIZE + 2*H_TRANS_RADIUS][BLOCK_SIZE];

	unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;

	unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;

	unsigned int gloc = idy*dim.wp + idx;

	// load 4 values from global memory to shared memory

	if ((int)idx-H_TRANS_RADIUS < 0)

	{

		data[threadIdx.x			   ][threadIdx.y] = 0;

	}

	else

	{

		data[threadIdx.x			   ][threadIdx.y] = srcImg[gloc - H_TRANS_RADIUS];

	}

	if ((int)idx-H_TRANS_RADIUS+BLOCK_SIZE < 0)

	{

		data[threadIdx.x +   BLOCK_SIZE][threadIdx.y] = 0;

	}

	else

	{

		data[threadIdx.x +   BLOCK_SIZE][threadIdx.y] = srcImg[gloc - H_TRANS_RADIUS +   BLOCK_SIZE];

	}

	if (idx+H_TRANS_RADIUS-BLOCK_SIZE > dim.wp-1)

	{

		data[threadIdx.x + 2*BLOCK_SIZE][threadIdx.y] = 0;

	}

	else

	{

		data[threadIdx.x + 2*BLOCK_SIZE][threadIdx.y] = srcImg[gloc - H_TRANS_RADIUS + 2*BLOCK_SIZE];

	}

	if (idx+H_TRANS_RADIUS > dim.wp-1)

	{

		data[threadIdx.x + 3*BLOCK_SIZE][threadIdx.y] = 0;

	}

	else

	{

		data[threadIdx.x + 3*BLOCK_SIZE][threadIdx.y] = srcImg[gloc - H_TRANS_RADIUS + 3*BLOCK_SIZE];

	}

	__syncthreads();

	// perform convolution for current pixel

	float sum = 0;

	for (int i=-H_TRANS_RADIUS; i<= H_TRANS_RADIUS; i++)

	{

		sum += data[H_TRANS_RADIUS + threadIdx.x + i][threadIdx.y] * g_hTrans[H_TRANS_RADIUS + i];

	}

	dstImg[gloc] = sum;

}

i expected a huge speedup because every pixel has to be read only 4 times from global memory (instead of 49 times with the original implementation). however, the execution time of this kernel inceased by factor 2!!!

why is this code so slow? is it because of bank conflicts? if yes, how can i avoid them in this example?

thanks,

robert

problem solved; it was a very basic c-programming mistake…

what i wanted to do in my second kernel is declaring a 2d array with width (BLOCK_SIZE + 2*H_TRANS_RADIUS) and height BLOCK_SIZE. so i had to swap the dimensions.

now i have no more bank conflicts and a speedup of 2 compared to the implementation without shared memory.

however, i hoped that the speedup would be bigger. but i guess i would have to use a fft approach to make it faster.