I’m trying to convert a raw Bayer camera image to RGB using Cuda 4 on a 16core GTS250

Based on the SDK sample code in “bilinear” Blocksize is 16x16, image is a multiple of 16 pixels.

But it’s taking 10ms to run, longer than the similar code takes single-threaded on the CPU!

Am I doing something obviously stupid?

```
texture<uchar, 2, cudaReadModeElementType> tex;
cudaArray *d_imageArray = 0;
__global__ void convertGRBG(uchar4 *d_output, uint width, uint height)
{
uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
uint i = __umul24(y, width) + x;
// input is GR/BG output is BGRA
if ((x < width) && (y < height)) {
if ( y & 0x01 ) {
if ( x & 0x01 ) {
d_output[i].x = (tex2D(tex,x+1,y)+tex2D(tex,x-1,y))/2; // B
d_output[i].y = (tex2D(tex,x,y)); // G in B
d_output[i].z = (tex2D(tex,x,y+1)+tex2D(tex,x,y-1))/2; // R
} else {
d_output[i].x = (tex2D(tex,x,y)); //B
d_output[i].y = (tex2D(tex,x+1,y) + tex2D(tex,x-1,y)+tex2D(tex,x,y+1)+tex2D(tex,x,y-1))/4; // G
d_output[i].z = (tex2D(tex,x+1,y+1) + tex2D(tex,x+1,y-1)+tex2D(tex,x-1,y+1)+tex2D(tex,x-1,y-1))/4; // R
}
} else {
if ( x & 0x01 ) {
// odd col = R
d_output[i].y = (tex2D(tex,x+1,y+1) + tex2D(tex,x+1,y-1)+tex2D(tex,x-1,y+1)+tex2D(tex,x-1,y-1))/4; // B
d_output[i].z = (tex2D(tex,x,y)); //R
d_output[i].y = (tex2D(tex,x+1,y) + tex2D(tex,x-1,y)+tex2D(tex,x,y+1)+tex2D(tex,x,y-1))/4; // G
} else {
d_output[i].x = (tex2D(tex,x,y+1)+tex2D(tex,x,y-1))/2; // B
d_output[i].y = (tex2D(tex,x,y)); // G in R
d_output[i].z = (tex2D(tex,x+1,y)+tex2D(tex,x-1,y))/2; // R
}
}
}
}
void initTexture(int imageWidth, int imageHeight, uchar *imagedata)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
cutilSafeCall( cudaMallocArray(&d_imageArray, &channelDesc, imageWidth, imageHeight) );
uint size = imageWidth * imageHeight * sizeof(uchar);
cutilSafeCall( cudaMemcpyToArray(d_imageArray, 0, 0, imagedata, size, cudaMemcpyHostToDevice) );
cutFree(imagedata);
// bind array to texture reference with point sampling
tex.addressMode[0] = cudaAddressModeClamp;
tex.addressMode[1] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModePoint;
tex.normalized = false;
cutilSafeCall( cudaBindTextureToArray(tex, d_imageArray) );
}
```