Hi Forum!
I programmed a simple nearest-neighbor algorithm for demosaicing (debayering) raw images. These images are stored in an OpenCV IplImage structure. Unfortunately, I used too many IF statements, which makes my CUDA version three times slower than the equivalent CPU version. So I wonder whether someone can help me accelerate my algorithm, or whether someone has already programmed another demosaicing algorithm (e.g. bilinear) in CUDA.
I would be grateful for your help. Thanks for any helpful advice.
Sandra
PS: Enclosed you’ll find my code.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <cutil_inline.h>
#include <cv.h>
texture<unsigned char, 2, cudaReadModeElementType> imgTex;
//nearest neighbor kernel using texture memory
//
// Reads the single-channel mosaic bound to imgTex and writes one interleaved
// 3-byte pixel (blue, green, red - in that order) per input pixel into res.
//
// Mosaic layout used by the lookups below: red samples sit at (even x, even y),
// blue samples at (odd x, odd y), green samples everywhere else. For every
// output pixel each channel is taken from the nearest sample of that colour
// inside the 2x2 cell formed by the pixel and its left/upper neighbours.
//
// Parameters:
//   res    - device buffer of at least width*height*3 bytes, tightly packed rows
//   width  - image width in pixels
//   height - image height in pixels
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel. The grid may be
// larger than the image; surplus threads return without touching memory.
__global__ void debayer_TM( unsigned char *res, int width, int height )
{
    const int x = blockIdx.x*blockDim.x + threadIdx.x;
    const int y = blockIdx.y*blockDim.y + threadIdx.y;

    // Strict upper bounds: a thread at x == width or y == height must not write
    // (the previous "<=" test let such threads run past the row/buffer end).
    if ( x >= width || y >= height )
        return;

    // Left / upper neighbour. On the first column/row there is no such
    // neighbour, so the right / lower one is used instead: it has the same
    // parity and therefore carries the same colour in the mosaic. This lets
    // the border pixels be written too instead of being left uninitialised.
    // (For a 1-pixel-wide/high image the mirrored coordinate is out of range;
    // the fetch then depends on the texture's address mode, which this file
    // never changes from the runtime default.)
    const int px = ( x > 0 ) ? x-1 : x+1;
    const int py = ( y > 0 ) ? y-1 : y+1;

    const bool oddX = ( x & 1 ) != 0;
    const bool oddY = ( y & 1 ) != 0;

    // Per-channel source coordinates, expressed as selects so that all threads
    // of a warp follow the same instruction stream regardless of parity.
    const int blueX  = oddX ? x  : px;
    const int blueY  = oddY ? y  : py;
    const int redX   = oddX ? px : x;
    const int redY   = oddY ? py : y;
    // The pixel itself is green exactly when x and y differ in parity;
    // otherwise the horizontal neighbour is the green sample.
    const int greenX = ( oddX != oddY ) ? x : px;

    // size_t keeps the byte offset from wrapping on very large images.
    const size_t o = ( (size_t)y*width + x ) * 3;
    res[o]   = tex2D(imgTex, (float)blueX,  (float)blueY); //blue
    res[o+1] = tex2D(imgTex, (float)greenX, (float)y);     //green
    res[o+2] = tex2D(imgTex, (float)redX,   (float)redY);  //red
}

// Host entry point: demosaics iplIn on the GPU and stores the 3-byte-per-pixel
// result in iplOut.
//
// Parameters:
//   iplIn  - source image; width bytes are read from each of its rows
//            (rows are widthStep bytes apart)
//   iplOut - destination image; must have the same width/height as iplIn and
//            room for width*3 bytes per row
//
// Invalid arguments are reported on stderr and the function returns without
// touching iplOut. CUDA failures are routed through CUDA_SAFE_CALL, the macro
// this file already uses.
//
// NOTE(review): every call allocates and frees the device buffers and copies
// the image both ways; for repeated use this overhead is likely to outweigh
// the kernel itself - consider keeping the buffers alive between calls.
extern "C" { void CudaDeBayerTM( IplImage *iplIn, IplImage *iplOut )
{
    if ( iplIn == NULL || iplOut == NULL ||
         iplIn->imageData == NULL || iplOut->imageData == NULL )
    {
        fprintf(stderr, "CudaDeBayerTM: null image or image data\n");
        return;
    }

    const int width  = iplIn->width;
    const int height = iplIn->height;
    if ( width <= 0 || height <= 0 )
        return; // nothing to convert

    // bytes per tightly packed row on the device (input / output)
    const size_t inRowBytes  = sizeof(unsigned char) * (size_t)width;
    const size_t outRowBytes = inRowBytes * 3;

    // Make sure neither host copy can run past the end of an image row/buffer.
    if ( iplOut->width != width || iplOut->height != height ||
         iplIn->widthStep < 0 || iplOut->widthStep < 0 ||
         (size_t)iplIn->widthStep < inRowBytes ||
         (size_t)iplOut->widthStep < outRowBytes )
    {
        fprintf(stderr, "CudaDeBayerTM: input/output image sizes do not match\n");
        return;
    }

    //declare device pointer
    unsigned char *DEVres = NULL;
    cudaArray *imgArray = NULL;

    // create channel descriptor for 2D cuda array
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();

    //malloc device memory
    CUDA_SAFE_CALL(cudaMallocArray(&imgArray, &channelDesc, width, height));
    CUDA_SAFE_CALL(cudaMalloc((void**)&DEVres, outRowBytes * (size_t)height));

    //copy host2device (source rows are widthStep bytes apart)
    CUDA_SAFE_CALL(cudaMemcpy2DToArray(imgArray, 0, 0,
                                       (unsigned char*) iplIn->imageData,
                                       (size_t)iplIn->widthStep,
                                       inRowBytes, height,
                                       cudaMemcpyHostToDevice));

    // bind the array to the texture
    CUDA_SAFE_CALL(cudaBindTextureToArray(imgTex, imgArray, channelDesc));

    //launch kernel
    // Round the grid up so images whose size is not a multiple of the block
    // size are fully covered (and small images do not yield an empty grid);
    // the kernel discards the surplus threads.
    dim3 block(16, 8, 1);
    dim3 grid((width  + block.x - 1)/block.x,
              (height + block.y - 1)/block.y, 1);
    debayer_TM <<< grid,block,0 >>> ( DEVres, width, height );
    CUDA_SAFE_CALL(cudaGetLastError());       // launch-configuration errors
    CUDA_SAFE_CALL(cudaThreadSynchronize());  // errors raised while the kernel ran

    //copy device2host
    // Row-by-row copy: destination rows are widthStep bytes apart, which can be
    // more than the width*3 bytes the kernel produces per row.
    CUDA_SAFE_CALL(cudaMemcpy2D(iplOut->imageData, (size_t)iplOut->widthStep,
                                DEVres, outRowBytes,
                                outRowBytes, height,
                                cudaMemcpyDeviceToHost));

    //free memory on device
    // Unbind first so the texture never refers to an already freed array.
    CUDA_SAFE_CALL(cudaUnbindTexture(imgTex));
    CUDA_SAFE_CALL(cudaFreeArray(imgArray));
    CUDA_SAFE_CALL(cudaFree(DEVres));
}}