Hi,

so I implemented the demosaicing with the 5x5 kernel according to the paper by Malvar, He and Cutler. First of all, here is the Kernel code in case anyone needs it.

GetPixel and SetColorPixel are device functions to read the raw source data and write to the output image. They are currently specific to my images of a fixed size and therefore not posted here.

```
__global__ void gpu_MalvarHeCutler(unsigned short *d_raw, unsigned char *out, size_t rgbPitch, size_t pitch)
{
__shared__ unsigned short subImage[20][20];
int x = 16*blockIdx.x + threadIdx.x;
int y = 16*blockIdx.y + threadIdx.y;
int bx = threadIdx.x;
int by = threadIdx.y;
int col = threadIdx.z;
unsigned short val;
if (col == 0) // Get inner image
{
subImage[bx+2][by+2] = GetPixel(d_raw, x, y, pitch);
}
else if (col == 1) // Get sides
{
if (bx < 2) subImage[bx][by+2] = GetPixel(d_raw, x-2, y, pitch); // left side
if (bx > 13) subImage[bx+4][by+2] = GetPixel(d_raw, x+2, y, pitch); // right side
if (by < 2) subImage[bx+2][by] = GetPixel(d_raw, x, y-2, pitch); // lower side
if (by > 13) subImage[bx+2][by+4] = GetPixel(d_raw, x, y+2, pitch); // upper side
}
else if (col == 2) // Get corners
{
if (bx == 0 && by == 0) subImage[1][1] = GetPixel(d_raw, x-1,y-1, pitch);
if (bx == 15 && by == 15) subImage[18][18] = GetPixel(d_raw, x+1,y+1, pitch);
if (bx == 0 && by == 15) subImage[1][18] = GetPixel(d_raw, x-1,y+1, pitch);
if (bx == 15 && by == 0) subImage[18][1] = GetPixel(d_raw, x+1,y-1, pitch);
}
__syncthreads();
switch (col)
{
case _RED_:
if (x % 2 == 0 && y % 2 == 0)
{
val = subImage[bx+2][by+2];
SetColorPixel(out, x, y, _RED_, val, rgbPitch);
}
else if (x % 2 == 0 && y % 2 != 0)
{
val = 5*subImage[bx+2][by+2] + 4*subImage[bx+2][by+3] + 4*subImage[bx+2][by+1] - subImage[bx+2][by+4] - subImage[bx+2][by] - subImage[bx+3][by+3] - subImage[bx+3][by+1]
- subImage[bx+1][by+3] - subImage[bx+1][by+1] + 0.5*subImage[bx+4][by+2] + 0.5*subImage[bx][by+2];
SetColorPixel(out, x, y, _RED_, 0.125*val, rgbPitch);
}
else if (x % 2 != 0 && y % 2 == 0)
{
val = 5*subImage[bx+2][by+2] + 4*subImage[bx+3][by+2] +4*subImage[bx+1][by+2] - subImage[bx+3][by+3] - subImage[bx+3][by+1] - subImage[bx+1][by+3] - subImage[bx+1][by+1]
- subImage[bx+4][by+2] - subImage[bx][by+2] + 0.5*subImage[bx+2][by+4] + 0.5*subImage[bx+2][by];
SetColorPixel(out, x, y, _RED_, 0.125*val, rgbPitch);
}
else
{
val = 6*subImage[bx+2][by+2] + 2*subImage[bx+3][by+3] + 2*subImage[bx+3][by+1] + 2*subImage[bx+1][by+3] + 2*subImage[bx+1][by+1]
- 1.5*subImage[bx+4][by+4] - 1.5*subImage[bx+4][by] - 1.5*subImage[bx][by+4] - 1.5*subImage[bx][by];
SetColorPixel(out, x, y, _RED_, 0.125*val, rgbPitch);
}
break;
case _GREEN_:
if (x % 2 == 0 && y % 2 == 0)
{
val = 4*subImage[bx+2][by+2] + 2*subImage[bx+3][by+2] + 2*subImage[bx+1][by+2] + 2*subImage[bx+2][by+3] + 2*subImage[bx+2][by+1]
- subImage[bx+4][by+2] - subImage[bx][by+2] - subImage[bx+2][by+4] - subImage[bx+2][by];
SetColorPixel(out, x, y, _GREEN_, 0.125*val, rgbPitch);
}
else if (x % 2 != 0 && y % 2 != 0)
{
val = 4*subImage[bx+2][by+2] + 2*subImage[bx+3][by+2] + 2*subImage[bx+1][by+2] + 2*subImage[bx+2][by+3] + 2*subImage[bx+2][by+1]
- subImage[bx+4][by+2] - subImage[bx][by+2] - subImage[bx+2][by+4] - subImage[bx+2][by];
SetColorPixel(out, x, y, _GREEN_, 0.125*val, rgbPitch);
}
else
{
val = subImage[bx+2][by+2];
SetColorPixel(out, x, y, _GREEN_, val, rgbPitch);
}
break;
case _BLUE_:
if (x % 2 == 0 && y % 2 == 0)
{
val = 6*subImage[bx+2][by+2] + 2*subImage[bx+3][by+3] + 2*subImage[bx+3][by+1] + 2*subImage[bx+1][by+3] + 2*subImage[bx+1][by+1]
- 1.5*subImage[bx+4][by+4] - 1.5*subImage[bx+4][by] - 1.5*subImage[bx][by+4] - 1.5*subImage[bx][by];
SetColorPixel(out, x, y, _BLUE_, 0.125*val, rgbPitch);
}
else if (x % 2 != 0 && y % 2 == 0)
{
val = 5*subImage[bx+2][by+2] + 4*subImage[bx+2][by+3] + 4*subImage[bx+2][by+1] - subImage[bx+2][by+4] - subImage[bx+2][by] - subImage[bx+3][by+3] - subImage[bx+3][by+1]
- subImage[bx+1][by+3] - subImage[bx+1][by+1] + 0.5*subImage[bx+4][by+2] + 0.5*subImage[bx][by+2];
SetColorPixel(out, x, y, _BLUE_, 0.125*val, rgbPitch);
}
else if (x % 2 == 0 && y % 2 != 0)
{
val = 5*subImage[bx+2][by+2] + 4*subImage[bx+3][by+2] +4*subImage[bx+1][by+2] - subImage[bx+3][by+3] - subImage[bx+3][by+1] - subImage[bx+1][by+3] - subImage[bx+1][by+1]
- subImage[bx+4][by+2] - subImage[bx][by+2] + 0.5*subImage[bx+2][by+4] + 0.5*subImage[bx+2][by];
SetColorPixel(out, x, y, _BLUE_, 0.125*val, rgbPitch);
}
else
{
val = subImage[bx+2][by+2];
SetColorPixel(out, x, y, _BLUE_, val, rgbPitch);
}
break;
}
}
```

So the approach is to have 16x16x3 threads in a block, to work on a pixel area of 16x16 and the 3 colors. Currently my image size always is a multiple of 16, maybe some adaptions are necessary for different cases.

First, a tile of 20x20 pixel is loaded into shared memory. With this, the 5x5 kernels can be computed for all pixel and written to the output. Input and output memory is always allocated and used with cudaMallocPitch and cudaMemcpy2D.

The way I assign threads to fetch certain parts of the 20x20 subimage might be ugly, but I couldn’t figure out a better method.

Maybe someone has ideas how to improve this code, as it’s my first major project with CUDA. Do I have issues with warp divergence here, and what can be done about it? Are the modulo operations bad / slow? Could I make better use of shared memory?

Thanks for any input, and maybe someone might even find my code useful.

Edit: Oh, and *RED*, *GREEN* and *BLUE* are simply defined as 0, 1 and 2 to match a threadIdx.z value.