i’m implementing a convolution kernel for a filter with length 49.
this is my first approach without using shared memory:
__constant__ float g_hTrans[H_TRANS_LEN];
__global__ void dirFilterTrans_Kernel(float* srcImg, float* dstImg, dim2dp_t dim)
{
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;
float sum = 0;
float value = 0;
for (int i=-H_TRANS_RADIUS; i<=H_TRANS_RADIUS; i++)
{
if ((((int)idx+i)<0) || (((int)idx+i)>=dim.wp))
{
value = 0;
}
else
{
value = srcImg[idy*dim.wp + idx + i];
}
sum += value * g_hTrans[H_TRANS_RADIUS + i];
}
dstImg[idy*dim.wp + idx] = sum;
}
where H_TRANS_LEN is 49, H_TRANS_RADIUS is 24 and the block size is 16x16
then i wanted to improve this code using shared memory. since (BLOCK_SIZE + 2*H_TRANS_RADIUS) is 64, every thread can load 4 values to shared memory first and then compute one output value:
__global__ void dirFilterTrans_Kernel(float* srcImg, float* dstImg, dim2dp_t dim)
{
__shared__ float data[BLOCK_SIZE + 2*H_TRANS_RADIUS][BLOCK_SIZE];
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int gloc = idy*dim.wp + idx;
// load 4 values from global memory to shared memory
if ((int)idx-H_TRANS_RADIUS < 0)
{
data[threadIdx.x ][threadIdx.y] = 0;
}
else
{
data[threadIdx.x ][threadIdx.y] = srcImg[gloc - H_TRANS_RADIUS];
}
if ((int)idx-H_TRANS_RADIUS+BLOCK_SIZE < 0)
{
data[threadIdx.x + BLOCK_SIZE][threadIdx.y] = 0;
}
else
{
data[threadIdx.x + BLOCK_SIZE][threadIdx.y] = srcImg[gloc - H_TRANS_RADIUS + BLOCK_SIZE];
}
if (idx+H_TRANS_RADIUS-BLOCK_SIZE > dim.wp-1)
{
data[threadIdx.x + 2*BLOCK_SIZE][threadIdx.y] = 0;
}
else
{
data[threadIdx.x + 2*BLOCK_SIZE][threadIdx.y] = srcImg[gloc - H_TRANS_RADIUS + 2*BLOCK_SIZE];
}
if (idx+H_TRANS_RADIUS > dim.wp-1)
{
data[threadIdx.x + 3*BLOCK_SIZE][threadIdx.y] = 0;
}
else
{
data[threadIdx.x + 3*BLOCK_SIZE][threadIdx.y] = srcImg[gloc - H_TRANS_RADIUS + 3*BLOCK_SIZE];
}
__syncthreads();
// perform convolution for current pixel
float sum = 0;
for (int i=-H_TRANS_RADIUS; i<= H_TRANS_RADIUS; i++)
{
sum += data[H_TRANS_RADIUS + threadIdx.x + i][threadIdx.y] * g_hTrans[H_TRANS_RADIUS + i];
}
dstImg[gloc] = sum;
}
i expected a huge speedup because every pixel has to be read only 4 times from global memory (instead of 49 times with the original implementation). however, the execution time of this kernel inceased by factor 2!!!
why is this code so slow? is it because of bank conflicts? if yes, how can i avoid them in this example?
thanks,
robert