Hello, i’m doing a work on a picture for University; I have to do some operations on an image, as noise reduction ecc.; I have the code with CPU, and I have to do it with CUDA using shared memory, and it should be faster then CPU. I have some problems: with small pictures, CPU it’s a little faster then GPU; instead, with bigger pictures GPU it’s faster then CPU…Then I don’t know if I can do it better or if I have to accept this. I have 3 kernels, I write only code of 1 kernel because other kernels are similar
CPU
void cannyCPU(float *im, float *image_out,
float *NR, float *G, float *phi, float *Gx, float *Gy, int *pedge,
float level,
int height, int width)
{
int i, j;
int ii, jj;
float PI = 3.141593;
float lowthres, hithres;
for(i=2; i<height-2; i++)
for(j=2; j<width-2; j++)
{
// Noise reduction
NR[i*width+j] =
(2.0*im[(i-2)*width+(j-2)] + 4.0*im[(i-2)*width+(j-1)] + 5.0*im[(i-2)*width+(j)] + 4.0*im[(i-2)*width+(j+1)] + 2.0*im[(i-2)*width+(j+2)]
+ 4.0*im[(i-1)*width+(j-2)] + 9.0*im[(i-1)*width+(j-1)] + 12.0*im[(i-1)*width+(j)] + 9.0*im[(i-1)*width+(j+1)] + 4.0*im[(i-1)*width+(j+2)]
+ 5.0*im[(i )*width+(j-2)] + 12.0*im[(i )*width+(j-1)] + 15.0*im[(i )*width+(j)] + 12.0*im[(i )*width+(j+1)] + 5.0*im[(i )*width+(j+2)]
+ 4.0*im[(i+1)*width+(j-2)] + 9.0*im[(i+1)*width+(j-1)] + 12.0*im[(i+1)*width+(j)] + 9.0*im[(i+1)*width+(j+1)] + 4.0*im[(i+1)*width+(j+2)]
+ 2.0*im[(i+2)*width+(j-2)] + 4.0*im[(i+2)*width+(j-1)] + 5.0*im[(i+2)*width+(j)] + 4.0*im[(i+2)*width+(j+1)] + 2.0*im[(i+2)*width+(j+2)])
/159.0;
}
}
GPU
_global__ void noise_reduction(float* im, float* NR, int height, int width) {
int i, j;
i = blockIdx.x * blockDim.x + threadIdx.x;
j = blockIdx.y * blockDim.y + threadIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
__shared__ float imsh [BLOCK_SIZE+4][BLOCK_SIZE+5];
//load image
if((i >= 2 && i<(height-2)) && (j>=2 && j<(width-2))) {
imsh[ty+2][tx+2] = im[j*width+i];
__syncthreads();
if((tx>=2 && tx<BLOCK_SIZE-2) && (ty>=2 && ty<BLOCK_SIZE-2)){
tx+=2;
ty+=2;
NR[j*width+i] = (2.0*imsh[ty-2][tx-2] + 4.0*imsh[ty-2][tx-1] + 5.0*imsh[ty-2][tx] + 4.0*imsh[ty-2][tx+1] + 2.0*imsh[ty-2][tx+2]
+ 4.0*imsh[ty-1][tx-2] + 9.0*imsh[ty-1][tx-1] + 12.0*imsh[ty-1][tx] + 9.0*imsh[ty-1][tx+1] + 4.0*imsh[ty-1][tx+2]
+ 5.0*imsh[ty][tx-2] + 12.0*imsh[ty][tx-1] + 15.0*imsh[ty][tx] + 12.0*imsh[ty][tx+1] + 5.0*imsh[ty][tx+2]
+ 4.0*imsh[ty+1][tx-2] + 9.0*imsh[ty+1][tx-1] + 12.0*imsh[ty+1][tx] + 9.0*imsh[ty+1][tx+1] + 4.0*imsh[ty+1][tx+2] +
+ 2.0*imsh[ty+2][tx-2] + 4.0*imsh[ty+2][tx-1] + 5.0*imsh[ty+2][tx] + 4.0*imsh[ty+2][tx+1] + 2.0*imsh[ty+2][tx+2])
/159.0;
}
else {
NR[j*width+i] =
(2.0*im[(j-2)*width+(i-2)] + 4.0*im[(j-2)*width+(i-1)] + 5.0*im[(j-2)*width+(i)] + 4.0*im[(j-2)*width+(i+1)] + 2.0*im[(j-2)*width+(i+2)]
+ 4.0*im[(j-1)*width+(i-2)] + 9.0*im[(j-1)*width+(i-1)] + 12.0*im[(j-1)*width+(i)] + 9.0*im[(j-1)*width+(i+1)] + 4.0*im[(j-1)*width+(i+2)]
+ 5.0*im[(j )*width+(i-2)] + 12.0*im[(j )*width+(i-1)] + 15.0*im[(j )*width+(i)] + 12.0*im[(j )*width+(i+1)] + 5.0*im[(j )*width+(i+2)]
+ 4.0*im[(j+1)*width+(i-2)] + 9.0*im[(j+1)*width+(i-1)] + 12.0*im[(j+1)*width+(i)] + 9.0*im[(j+1)*width+(i+1)] + 4.0*im[(j+1)*width+(i+2)]
+ 2.0*im[(j+2)*width+(i-2)] + 4.0*im[(j+2)*width+(i-1)] + 5.0*im[(j+2)*width+(i)] + 4.0*im[(j+2)*width+(i+1)] + 2.0*im[(j+2)*width+(i+2)])
/159.0;
}
}
}
Moreover, NVIDIA Visual Profiler tells me that load efficiency is only 25.6%, instead store efficiency is 72.8%, I can’t understand why…Can I optimize this code?