I am relatively new to CUDA programming. I have two functions that implement 3D median filtering (the third dimension is time, i.e. a stack of consecutive frames). The first function uses naive parallelism without any shared memory, and the second one uses shared memory. However, both of them take almost the same time. My implementation of shared memory is based on this thread of Stack Overflow. The layout of the stacked input images in the device pointer is depth-major (pixels along the depth are contiguous, then row, then column). Each image is a single channel of dimension 384x384.
For Quadro RTX 6000 with kernel size 15x15,block size of 16 x 16 and 10 frames (384x384x10), I get the following timing statistics:
Naive Parallelism: 3237.9 milliseconds
Shared memory: 3444.0 milliseconds.
However, when I increase the tile size to 32:
Naive Parallelism: 4555.8 milliseconds
Shared memory: 2447.8 milliseconds.
As 32x32 is the maximum limit (1024 threads per block), I want to avoid using it. Can I improve the shared memory usage while keeping a 16x16 block size?
#define TILE_SIZE 16 // block size for each dimension (blockDim.x == blockDim.y == TILE_SIZE)
#define CHANNELS 10 // number of stacked frames (temporal depth of the 3D filter)
#define KERNEL_SIZE 15 // spatial window size per frame (KERNEL_SIZE x KERNEL_SIZE, must be odd)
// Forward declaration: medianBubble is defined later in this file; C++ requires
// a declaration before the call site below.
__device__ unsigned char medianBubble(unsigned char* dataQueue, int lengthQueue);

// 3D median filter over a stack of CHANNELS consecutive frames using a shared
// memory tile per block.
//
// Expected launch: 2D grid of 2D blocks with blockDim == (TILE_SIZE, TILE_SIZE).
// Input layout is depth-major: the CHANNELS values of one pixel are contiguous,
// i.e. input[row * colorWidthStep + col * CHANNELS + chan].
// Output is a single-channel image: output[row * grayWidthStep + col].
// Border pixels whose window would need padding are skipped (left untouched
// in the output buffer).
//
// Static shared memory used:
//   CHANNELS * (TILE_SIZE+KERNEL_SIZE-1)^2 bytes.
__global__ void ThreeDMedianFilterShared(unsigned char* input,
                                         unsigned char* output,
                                         const int width,
                                         const int height,
                                         const int colorWidthStep, // input row stride = number of channels * number of columns
                                         const int grayWidthStep)  // output row stride = number of columns only
{
    const int halfWindow = KERNEL_SIZE / 2;
    const int col_index = blockIdx.x * blockDim.x + threadIdx.x;
    const int row_index = blockIdx.y * blockDim.y + threadIdx.y;

    // Tile plus a halo of halfWindow pixels on each side, one plane per frame.
    __shared__ unsigned char sharedmem[CHANNELS][TILE_SIZE + KERNEL_SIZE - 1][TILE_SIZE + KERNEL_SIZE - 1];

    const bool is_x_left   = (threadIdx.x == 0);
    const bool is_x_right  = (threadIdx.x == TILE_SIZE - 1);
    const bool is_y_top    = (threadIdx.y == 0);
    const bool is_y_bottom = (threadIdx.y == TILE_SIZE - 1);

    if (row_index < height && col_index < width)
    {
        // Each in-bounds thread loads its own pixel (all frames) into the tile.
        for (int chan = 0; chan < CHANNELS; chan++)
            sharedmem[chan][threadIdx.x + halfWindow][threadIdx.y + halfWindow] =
                input[row_index * colorWidthStep + col_index * CHANNELS + chan];

        // Edge threads additionally load the halo columns/rows; corner threads
        // also load the diagonal halo regions. Halo cells that would fall
        // outside the image are left unwritten — safe because the compute
        // phase below skips any output pixel whose window touches the border.
        if (is_x_left && (col_index > halfWindow - 1))
        {
            for (int j = 1; j <= halfWindow; j++)
                for (int chan = 0; chan < CHANNELS; chan++)
                    sharedmem[chan][threadIdx.x + halfWindow - j][threadIdx.y + halfWindow] =
                        input[row_index * colorWidthStep + (col_index - j) * CHANNELS + chan];
        }
        else if (is_x_right && (col_index < width - halfWindow))
        {
            for (int j = 1; j <= halfWindow; j++)
                for (int chan = 0; chan < CHANNELS; chan++)
                    sharedmem[chan][threadIdx.x + halfWindow + j][threadIdx.y + halfWindow] =
                        input[row_index * colorWidthStep + (col_index + j) * CHANNELS + chan];
        }

        if (is_y_top && (row_index > halfWindow - 1))
        {
            for (int j = 1; j <= halfWindow; j++)
                for (int chan = 0; chan < CHANNELS; chan++)
                    sharedmem[chan][threadIdx.x + halfWindow][threadIdx.y + halfWindow - j] =
                        input[(row_index - j) * colorWidthStep + col_index * CHANNELS + chan];
            if (is_x_left)
            {
                for (int j = 1; j <= halfWindow; j++)
                    for (int i = 1; i <= halfWindow; i++)
                        for (int chan = 0; chan < CHANNELS; chan++)
                            sharedmem[chan][threadIdx.x + halfWindow - i][threadIdx.y + halfWindow - j] =
                                input[(row_index - j) * colorWidthStep + (col_index - i) * CHANNELS + chan];
            }
            else if (is_x_right)
            {
                for (int j = 1; j <= halfWindow; j++)
                    for (int i = 1; i <= halfWindow; i++)
                        for (int chan = 0; chan < CHANNELS; chan++)
                            sharedmem[chan][threadIdx.x + halfWindow + i][threadIdx.y + halfWindow - j] =
                                input[(row_index - j) * colorWidthStep + (col_index + i) * CHANNELS + chan];
            }
        }
        else if (is_y_bottom && (row_index < height - halfWindow))
        {
            for (int j = 1; j <= halfWindow; j++)
                for (int chan = 0; chan < CHANNELS; chan++)
                    sharedmem[chan][threadIdx.x + halfWindow][threadIdx.y + halfWindow + j] =
                        input[(row_index + j) * colorWidthStep + col_index * CHANNELS + chan];
            if (is_x_right)
            {
                for (int j = 1; j <= halfWindow; j++)
                    for (int i = 1; i <= halfWindow; i++)
                        for (int chan = 0; chan < CHANNELS; chan++)
                            sharedmem[chan][threadIdx.x + halfWindow + i][threadIdx.y + halfWindow + j] =
                                input[(row_index + j) * colorWidthStep + (col_index + i) * CHANNELS + chan];
            }
            else if (is_x_left)
            {
                for (int j = 1; j <= halfWindow; j++)
                    for (int i = 1; i <= halfWindow; i++)
                        for (int chan = 0; chan < CHANNELS; chan++)
                            sharedmem[chan][threadIdx.x + halfWindow - i][threadIdx.y + halfWindow + j] =
                                input[(row_index + j) * colorWidthStep + (col_index - i) * CHANNELS + chan];
            }
        }
    }

    // BUG FIX: the barrier must be reached by every thread of the block.
    // The original kernel called __syncthreads() inside the bounds guard
    // above, which is undefined behavior whenever width/height is not a
    // multiple of blockDim (it only happened to work for 384x384 with
    // 16x16 blocks because no thread was ever out of bounds).
    __syncthreads();

    // Compute the median only for interior pixels whose full window fits
    // inside the image; border pixels are skipped (their halo cells may be
    // uninitialized).
    if ((row_index >= halfWindow) && (row_index < height - halfWindow) &&
        (col_index >= halfWindow) && (col_index < width - halfWindow))
    {
        // Location of this pixel in the single-channel output.
        const int median_pixel_index = row_index * grayWidthStep + col_index;

        unsigned char median_array[KERNEL_SIZE * KERNEL_SIZE * CHANNELS];
        int median_array_index = 0;
        // Gather the full 3D window; the loop bounds guarantee exactly
        // KERNEL_SIZE*KERNEL_SIZE*CHANNELS writes, so no index guard needed.
        for (int chan = 0; chan < CHANNELS; chan++)
            for (int i = 0; i < KERNEL_SIZE; i++)
                for (int j = 0; j < KERNEL_SIZE; j++)
                    median_array[median_array_index++] = sharedmem[chan][threadIdx.x + i][threadIdx.y + j];

        output[median_pixel_index] = medianBubble(median_array, KERNEL_SIZE * KERNEL_SIZE * CHANNELS);
    }
}
// Returns the median of the first lengthQueue bytes of dataQueue.
// Works by partially selection-sorting the buffer in place: positions
// 0..(lengthQueue-1)/2 are filled with the smallest values in order, which is
// enough to pin the median at the middle index without sorting the whole array.
// Note: the caller's buffer is reordered as a side effect.
__device__ unsigned char medianBubble(unsigned char* dataQueue, int lengthQueue)
{
    const int middle = (lengthQueue - 1) / 2;
    for (int pos = 0; pos <= middle; pos++)
    {
        // Find the smallest remaining element...
        int smallest = pos;
        for (int probe = pos + 1; probe < lengthQueue; probe++)
        {
            if (dataQueue[probe] < dataQueue[smallest])
                smallest = probe;
        }
        // ...and swap it into position pos.
        const unsigned char tmp = dataQueue[pos];
        dataQueue[pos] = dataQueue[smallest];
        dataQueue[smallest] = tmp;
    }
    return dataQueue[middle];
}