Hi, I have been trying to compare the performance of a non-maximal suppression filter using global and shared memory with the profiler.
The code is similar to the one given in the sample projects for the Canny filter, except that here the neighbourhood radius is variable (= 4 in the tests below), thus requiring an apron of variable size, and a copy of the data c resides in texture memory (texData).
Here are the two versions of the algorithm:
- Global memory version
[codebox]#define NONMAXSUPP_RADIUS 4
#define BLOCK_WIDTH 16
#define BLOCK_HEIGHT 16
global void nonMaximalSuppression(bool *features, float *c, int width, int height) {
const int ix = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
const int iy = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
const int idx = IMUL(iy, width) + ix;
const float x = (float)ix + 0.5f;
const float y = (float)iy + 0.5f;
bool isMax = true;
for (int i = -NONMAXSUPP_RADIUS; i <= NONMAXSUPP_RADIUS; i++) {
for (int j = -NONMAXSUPP_RADIUS; j <= NONMAXSUPP_RADIUS; j++) {
if (tex2D(texData, x + i, y + j) > c[idx]) {
isMax = false;
break;
}
}
}
features[idx] = (isMax) ? true : false;
}[/codebox]
- Shared memory version
[codebox] #define NONMAXSUPP_RADIUS 4
#define BLOCK_WIDTH 16
#define BLOCK_HEIGHT 16
#define NONMAXSUPP_WIDTH (BLOCK_WIDTH+NONMAXSUPP_RADIUS*2)
#define NONMAXSUPP_HEIGHT (BLOCK_HEIGHT+NONMAXSUPP_RADIUS*2)
global void nonMaximalSuppression(bool *features, float *c, int width, int height) {
const int ix = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
const int iy = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
const int idx = IMUL(iy, width) + ix;
float x = (float)ix + 0.5f;
float y = (float)iy + 0.5f;
__shared__ float cornerData[NONMAXSUPP_HEIGHT][NONMAXSUPP_WIDTH];
//load block data into shared memory block - coalesced
cornerData[threadIdx.y + NONMAXSUPP_RADIUS][threadIdx.x + NONMAXSUPP_RADIUS] = tex2D(texData,x,y);
//load apron
if (threadIdx.y == 0) {
//top row
if (threadIdx.x == 0) {
//top left corner
#pragma unroll NONMAXSUPP_RADIUS
for (int i = 0; i < NONMAXSUPP_RADIUS; i++)
#pragma unroll NONMAXSUPP_RADIUS
for (int j = 0; j < NONMAXSUPP_RADIUS; j++)
cornerData[i][j] = tex2D(texData,x - NONMAXSUPP_RADIUS +j, y-NONMAXSUPP_RADIUS+i);
} else if (threadIdx.x == (BLOCK_WIDTH -1)) {
//top right corner
#pragma unroll NONMAXSUPP_RADIUS
for (int i = 0; i < NONMAXSUPP_RADIUS; i++)
#pragma unroll NONMAXSUPP_RADIUS
for (int j = 0; j < NONMAXSUPP_RADIUS; j++)
cornerData[i][NONMAXSUPP_WIDTH -1 -j] = tex2D(texData,x + NONMAXSUPP_RADIUS -1 -j, y-NONMAXSUPP_RADIUS+i);
} else {
#pragma unroll NONMAXSUPP_RADIUS
for (int i = 0; i < NONMAXSUPP_RADIUS; i++)
cornerData[i][threadIdx.x + NONMAXSUPP_RADIUS] = tex2D(texData,x,y-NONMAXSUPP_RADIUS+i);
}
} else if (threadIdx.y == (BLOCK_HEIGHT -1)) {
//bottom row
if (threadIdx.x == 0) {
//bottom left corner
#pragma unroll NONMAXSUPP_RADIUS
for (int i = 0; i < NONMAXSUPP_RADIUS; i++)
#pragma unroll NONMAXSUPP_RADIUS
for (int j = 0; j < NONMAXSUPP_RADIUS; j++)
cornerData[NONMAXSUPP_HEIGHT -1 - i][j] = tex2D(texData,x - NONMAXSUPP_RADIUS +j, y+NONMAXSUPP_RADIUS-1-i);
} else if (threadIdx.x == (BLOCK_WIDTH -1)) {
//bottom right corner
#pragma unroll NONMAXSUPP_RADIUS
for (int i = 0; i < NONMAXSUPP_RADIUS; i++)
#pragma unroll NONMAXSUPP_RADIUS
for (int j = 0; j < NONMAXSUPP_RADIUS; j++)
cornerData[NONMAXSUPP_HEIGHT -1 - i][NONMAXSUPP_WIDTH -1 -j] = tex2D(texData,x + NONMAXSUPP_RADIUS -1 -j, y+NONMAXSUPP_RADIUS-1-i);
} else {
#pragma unroll NONMAXSUPP_RADIUS
for (int i = 0; i < NONMAXSUPP_RADIUS; i++)
cornerData[NONMAXSUPP_HEIGHT -1 -i][threadIdx.x + NONMAXSUPP_RADIUS] = tex2D(texData,x,y+NONMAXSUPP_RADIUS-1 -i);
}
} else if (threadIdx.x == 0) {
//left column
#pragma unroll NONMAXSUPP_RADIUS
for (int i = 0; i < NONMAXSUPP_RADIUS; i++)
cornerData[threadIdx.y + NONMAXSUPP_RADIUS][i] = tex2D(texData,x-NONMAXSUPP_RADIUS +i,y);
} else if (threadIdx.x == (BLOCK_WIDTH -1)) {
//right column
#pragma unroll NONMAXSUPP_RADIUS
for (int i = 0; i < NONMAXSUPP_RADIUS; i++)
cornerData[threadIdx.y + NONMAXSUPP_RADIUS][NONMAXSUPP_WIDTH -1 -i] = tex2D(texData,x+NONMAXSUPP_RADIUS -1 -i,y);
}
__syncthreads(); //make sure everything is loaded correctly
bool isMax = true;
for (int i = -NONMAXSUPP_RADIUS; i <= NONMAXSUPP_RADIUS; i++) {
for (int j = -NONMAXSUPP_RADIUS; j <= NONMAXSUPP_RADIUS; j++) {
if (cornerData[threadIdx.y + NONMAXSUPP_RADIUS + i][threadIdx.x + NONMAXSUPP_RADIUS + j] > cornerData[threadIdx.y+NONMAXSUPP_RADIUS][threadIdx.x+NONMAX
SUPP_RADIUS]) {
isMax = false;
break;
}
}
}
features[idx] = (isMax) ? true : false;
} [/codebox]
The results I obtained with the profiler suggested that the smem implementation would be about 80% slower than the one using gmem. Is this because of the way I am loading the apron or because looking for the maximum in the neighbourhood leads to bank conflicts?
A few additional notes:
-
#pragma unroll did not seem to have much of an effect, possibly because the bottleneck lies elsewhere
-
I am using textures so that values queries for pixels outside the image boundaries will automatically return zero.