Hello all!
I am studying CUDA and implemented in fact a test task, image Gaussian filtering. I’m not sure whether performance is good, but I suppose that it’s not. On GT 520M (1 multiprocessor, 32 execution units) it takes about 150 ms for 2500*2000 image. I tried NVidia Visual Profiler, but don’t understand well what it says.
I suppose many if checks and numerous “c_tileSize - params.gaussRadius” calculations may decrease performance. I understand that ifs can be removed for inner blocks by writing separated function. But caching of calculation results would require additional variables. I also understand that 1 block with 1024 threads is bad, but for such a simple implementation and radius 2-5 16*16 block looks too uneffective. Visual Profiler reports good achieved occupancy.
Please tell me how the code can be speeded up (and how to understand this from Visual Profiler) or show any principial mistakes.
static const int c_tileSize = 32;
struct TCudaGaussFilterParams
{
int imageW, imageH;
uchar1 *pSourceImageData, *pDestImageData;
int gaussRadius; // Hardcoded 2
float *gaussTex; // {0.06136f, 0.24477f, 0.38774f, 0.24477f, 0.06136f}
};
struct TCudaGaussFilterParams_Inner : TCudaGaussFilterParams
{
int blockShift;
};
__global__ void cudaGaussFilter(const TCudaGaussFilterParams_Inner params)
{
// Block of threads generates resulting 28 * 28 block but reads 32 * 32 pixels
__shared__ uchar4 aTile[c_tileSize][c_tileSize];
__shared__ uchar4 aTile2[c_tileSize][c_tileSize];
int x = blockIdx.x * params.blockShift + threadIdx.x;
int y = blockIdx.y * params.blockShift + threadIdx.y;
if (x >= params.imageW || y >= params.imageH)
return;
// Loading the entire tile into shared memory
aTile[threadIdx.x][threadIdx.y] = ((uchar4 *)params.pSourceImageData)[y * params.imageW + x];
__syncthreads();
if (x < params.gaussRadius ||
x >= params.imageW - params.gaussRadius ||
y < params.gaussRadius ||
y >= params.imageH - params.gaussRadius)
((uchar4 *)params.pDestImageData)[y * params.imageW + x] = aTile[threadIdx.x][threadIdx.y];
// Keeping the whole image's 2-pixel border unchanged
else if ((threadIdx.x >= params.gaussRadius) &&
(threadIdx.x < c_tileSize - params.gaussRadius) &&
(threadIdx.y >= params.gaussRadius) &&
(threadIdx.y < c_tileSize - params.gaussRadius))
{
// 2-pixel border's pixels are only read. The rest are smoothed in this block
float4 pixelSum = {0, 0, 0, 0};
int i;
// Smoothing horizontally
for (i = 0; i < params.gaussRadius * 2 + 1; i++)
{
uchar4 pixel = aTile[threadIdx.x - params.gaussRadius + i][threadIdx.y];
pixelSum.x += pixel.x * params.gaussTex[i]; // I didn't find vector multiplication in standard types
pixelSum.y += pixel.y * params.gaussTex[i];
pixelSum.z += pixel.z * params.gaussTex[i];
pixelSum.w += pixel.w * params.gaussTex[i];
}
aTile2[threadIdx.x][threadIdx.y].x = unsigned char(pixelSum.x); // Nor copying
aTile2[threadIdx.x][threadIdx.y].y = unsigned char(pixelSum.y);
aTile2[threadIdx.x][threadIdx.y].z = unsigned char(pixelSum.z);
aTile2[threadIdx.x][threadIdx.y].w = unsigned char(pixelSum.w);
__syncthreads();
aTile[threadIdx.x][threadIdx.y] = aTile2[threadIdx.x][threadIdx.y];
__syncthreads();
// Smoothing vertically
pixelSum.x = 0;
pixelSum.y = 0;
pixelSum.z = 0;
pixelSum.w = 0;
for (i = 0; i < params.gaussRadius * 2 + 1; i++)
{
uchar4 pixel = aTile[threadIdx.x][threadIdx.y - params.gaussRadius + i];
pixelSum.x += float(pixel.x) * params.gaussTex[i];
pixelSum.y += float(pixel.y) * params.gaussTex[i];
pixelSum.z += float(pixel.z) * params.gaussTex[i];
pixelSum.w += float(pixel.w) * params.gaussTex[i];
}
((uchar4 *)params.pDestImageData)[y * params.imageW + x].x = unsigned char(pixelSum.x);
((uchar4 *)params.pDestImageData)[y * params.imageW + x].y = unsigned char(pixelSum.y);
((uchar4 *)params.pDestImageData)[y * params.imageW + x].z = unsigned char(pixelSum.z);
((uchar4 *)params.pDestImageData)[y * params.imageW + x].w = unsigned char(pixelSum.w);
}
}
void FilterImageWithGauss_CudaOuter(const TCudaGaussFilterParams ¶ms)
{
assert(c_tileSize > params.gaussRadius * 2 + 1);
int blockShift = c_tileSize - (params.gaussRadius * 2 + 1);
const dim3 grid((params.imageW - c_tileSize - 1 + blockShift * 2) / blockShift,
(params.imageH - c_tileSize - 1 + blockShift * 2) / blockShift, 1);
// number of blocks
const dim3 threads(c_tileSize, c_tileSize, 1); // block width: number of threads per block
TCudaGaussFilterParams_Inner innerParams;
reinterpret_cast<TCudaGaussFilterParams &>(innerParams) = params;
innerParams.blockShift = blockShift;
cudaGaussFilter<<<grid, threads>>>(innerParams);
}