I am relatively new to CUDA programming. I have two functions that implement 3D median filtering (the third dimension is time, i.e. a stack of consecutive frames). The first function uses naive parallelism without any shared memory, and the second one uses shared memory. However, both of them take almost the same time. My implementation of shared memory is based on this thread of Stack Overflow. The layout of the stacked input images in the device pointer is depth-major (pixels along the depth are contiguous, then row, then column). Each image is a single channel of dimension 384x384.
For Quadro RTX 6000 with kernel size 15x15,block size of 16 x 16 and 10 frames (384x384x10), I get the following timing statistics:
Naive Parallelism: 3237.9 milliseconds
Shared memory: 3444.0 milliseconds.
However, when I increase the tile size to 32:
Naive Parallelism: 4555.8 milliseconds
Shared memory: 2447.8 milliseconds.
As 32x32 is the maximum limit (1024 threads per block), I want to avoid using it. Can I improve the shared memory usage while keeping a 16x16 block size?
#define TILE_SIZE 16 // block size for each dimension (blockDim.x == blockDim.y == TILE_SIZE)
#define CHANNELS 10 // number of stacked frames (temporal depth of the 3D filter)
#define KERNEL_SIZE 15 // spatial window size per frame (KERNEL_SIZE x KERNEL_SIZE, must be odd)
// Forward declaration: medianBubble is defined later in this file; C++ requires
// a declaration before the call site below.
__device__ unsigned char medianBubble(unsigned char* dataQueue, int lengthQueue);

// 3D median filter over a stack of CHANNELS consecutive frames using a shared
// memory tile per block.
//
// Expected launch: 2D grid of 2D blocks with blockDim == (TILE_SIZE, TILE_SIZE).
// Input layout is depth-major: the CHANNELS values of one pixel are contiguous,
// i.e. input[row * colorWidthStep + col * CHANNELS + chan].
// Output is a single-channel image: output[row * grayWidthStep + col].
// Border pixels whose window would need padding are skipped (left untouched
// in the output buffer).
//
// Static shared memory used:
//   CHANNELS * (TILE_SIZE+KERNEL_SIZE-1)^2 bytes.
__global__ void ThreeDMedianFilterShared(unsigned char* input,
                                         unsigned char* output,
                                         const int width,
                                         const int height,
                                         const int colorWidthStep, // input row stride = number of channels * number of columns
                                         const int grayWidthStep)  // output row stride = number of columns only
{
    const int halfWindow = KERNEL_SIZE / 2;
    const int col_index = blockIdx.x * blockDim.x + threadIdx.x;
    const int row_index = blockIdx.y * blockDim.y + threadIdx.y;

    // Tile plus a halo of halfWindow pixels on each side, one plane per frame.
    __shared__ unsigned char sharedmem[CHANNELS][TILE_SIZE + KERNEL_SIZE - 1][TILE_SIZE + KERNEL_SIZE - 1];

    const bool is_x_left   = (threadIdx.x == 0);
    const bool is_x_right  = (threadIdx.x == TILE_SIZE - 1);
    const bool is_y_top    = (threadIdx.y == 0);
    const bool is_y_bottom = (threadIdx.y == TILE_SIZE - 1);

    if (row_index < height && col_index < width)
    {
        // Each in-bounds thread loads its own pixel (all frames) into the tile.
        for (int chan = 0; chan < CHANNELS; chan++)
            sharedmem[chan][threadIdx.x + halfWindow][threadIdx.y + halfWindow] =
                input[row_index * colorWidthStep + col_index * CHANNELS + chan];

        // Edge threads additionally load the halo columns/rows; corner threads
        // also load the diagonal halo regions. Halo cells that would fall
        // outside the image are left unwritten — safe because the compute
        // phase below skips any output pixel whose window touches the border.
        if (is_x_left && (col_index > halfWindow - 1))
        {
            for (int j = 1; j <= halfWindow; j++)
                for (int chan = 0; chan < CHANNELS; chan++)
                    sharedmem[chan][threadIdx.x + halfWindow - j][threadIdx.y + halfWindow] =
                        input[row_index * colorWidthStep + (col_index - j) * CHANNELS + chan];
        }
        else if (is_x_right && (col_index < width - halfWindow))
        {
            for (int j = 1; j <= halfWindow; j++)
                for (int chan = 0; chan < CHANNELS; chan++)
                    sharedmem[chan][threadIdx.x + halfWindow + j][threadIdx.y + halfWindow] =
                        input[row_index * colorWidthStep + (col_index + j) * CHANNELS + chan];
        }

        if (is_y_top && (row_index > halfWindow - 1))
        {
            for (int j = 1; j <= halfWindow; j++)
                for (int chan = 0; chan < CHANNELS; chan++)
                    sharedmem[chan][threadIdx.x + halfWindow][threadIdx.y + halfWindow - j] =
                        input[(row_index - j) * colorWidthStep + col_index * CHANNELS + chan];
            if (is_x_left)
            {
                for (int j = 1; j <= halfWindow; j++)
                    for (int i = 1; i <= halfWindow; i++)
                        for (int chan = 0; chan < CHANNELS; chan++)
                            sharedmem[chan][threadIdx.x + halfWindow - i][threadIdx.y + halfWindow - j] =
                                input[(row_index - j) * colorWidthStep + (col_index - i) * CHANNELS + chan];
            }
            else if (is_x_right)
            {
                for (int j = 1; j <= halfWindow; j++)
                    for (int i = 1; i <= halfWindow; i++)
                        for (int chan = 0; chan < CHANNELS; chan++)
                            sharedmem[chan][threadIdx.x + halfWindow + i][threadIdx.y + halfWindow - j] =
                                input[(row_index - j) * colorWidthStep + (col_index + i) * CHANNELS + chan];
            }
        }
        else if (is_y_bottom && (row_index < height - halfWindow))
        {
            for (int j = 1; j <= halfWindow; j++)
                for (int chan = 0; chan < CHANNELS; chan++)
                    sharedmem[chan][threadIdx.x + halfWindow][threadIdx.y + halfWindow + j] =
                        input[(row_index + j) * colorWidthStep + col_index * CHANNELS + chan];
            if (is_x_right)
            {
                for (int j = 1; j <= halfWindow; j++)
                    for (int i = 1; i <= halfWindow; i++)
                        for (int chan = 0; chan < CHANNELS; chan++)
                            sharedmem[chan][threadIdx.x + halfWindow + i][threadIdx.y + halfWindow + j] =
                                input[(row_index + j) * colorWidthStep + (col_index + i) * CHANNELS + chan];
            }
            else if (is_x_left)
            {
                for (int j = 1; j <= halfWindow; j++)
                    for (int i = 1; i <= halfWindow; i++)
                        for (int chan = 0; chan < CHANNELS; chan++)
                            sharedmem[chan][threadIdx.x + halfWindow - i][threadIdx.y + halfWindow + j] =
                                input[(row_index + j) * colorWidthStep + (col_index - i) * CHANNELS + chan];
            }
        }
    }

    // BUG FIX: the barrier must be reached by every thread of the block.
    // The original kernel called __syncthreads() inside the bounds guard
    // above, which is undefined behavior whenever width/height is not a
    // multiple of blockDim (it only happened to work for 384x384 with
    // 16x16 blocks because no thread was ever out of bounds).
    __syncthreads();

    // Compute the median only for interior pixels whose full window fits
    // inside the image; border pixels are skipped (their halo cells may be
    // uninitialized).
    if ((row_index >= halfWindow) && (row_index < height - halfWindow) &&
        (col_index >= halfWindow) && (col_index < width - halfWindow))
    {
        // Location of this pixel in the single-channel output.
        const int median_pixel_index = row_index * grayWidthStep + col_index;

        unsigned char median_array[KERNEL_SIZE * KERNEL_SIZE * CHANNELS];
        int median_array_index = 0;
        // Gather the full 3D window; the loop bounds guarantee exactly
        // KERNEL_SIZE*KERNEL_SIZE*CHANNELS writes, so no index guard needed.
        for (int chan = 0; chan < CHANNELS; chan++)
            for (int i = 0; i < KERNEL_SIZE; i++)
                for (int j = 0; j < KERNEL_SIZE; j++)
                    median_array[median_array_index++] = sharedmem[chan][threadIdx.x + i][threadIdx.y + j];

        output[median_pixel_index] = medianBubble(median_array, KERNEL_SIZE * KERNEL_SIZE * CHANNELS);
    }
}
// Returns the median of the first lengthQueue bytes of dataQueue.
// Works by partially selection-sorting the buffer in place: positions
// 0..(lengthQueue-1)/2 are filled with the smallest values in order, which is
// enough to pin the median at the middle index without sorting the whole array.
// Note: the caller's buffer is reordered as a side effect.
__device__ unsigned char medianBubble(unsigned char* dataQueue, int lengthQueue)
{
    const int middle = (lengthQueue - 1) / 2;
    for (int pos = 0; pos <= middle; pos++)
    {
        // Find the smallest remaining element...
        int smallest = pos;
        for (int probe = pos + 1; probe < lengthQueue; probe++)
        {
            if (dataQueue[probe] < dataQueue[smallest])
                smallest = probe;
        }
        // ...and swap it into position pos.
        const unsigned char tmp = dataQueue[pos];
        dataQueue[pos] = dataQueue[smallest];
        dataQueue[smallest] = tmp;
    }
    return dataQueue[middle];
}