I’m trying to use CUDA and OpenGL to write a viewer for large (say 12k x 12k) images. I’m storing the images in video memory as an array of 16-bit indices and a palette of 32-bit RGB values. I’m then displaying (part of) this image in a display window with pan and zoom controls. I do this by processing the data into an OpenGL PBO which I then copy into an OpenGL texture and use to draw a quad into the backbuffer of my window.

I have a couple of problems. Firstly, it doesn’t seem to be very fast. For a 1600x1200 window the whole process takes around 30ms on a GeForce 8600GT. Secondly, it seems to be using 100% of the host CPU. Can anyone help?

My kernel looks like this:

```
// Renders one output pixel per thread into pDst by bilinearly blending the
// palette colours of the four source texels surrounding the sample point.
//
// Launch layout: 2D grid of 2D blocks; thread (ix, iy) produces window pixel
// (ix, iy). The grid may overshoot the window: out-of-range threads exit early.
// Shared memory: none.
//
// Parameters:
//   pDst          output buffer of nWindowSizeX * nWindowSizeY packed 32-bit
//                 pixels; rows are written bottom-up (window row iy lands in
//                 buffer row nWindowSizeY - iy - 1).
//   nWindowSizeX  output width in pixels.
//   nWindowSizeY  output height in pixels.
//   fPanX, fPanY  pan offset, added to the window coordinate before scaling.
//   fZoom         zoom factor; window coordinates are divided by it to obtain
//                 source-image coordinates.
//
// Reads the file-level texture references texImage (2D, index per texel) and
// texImageLUT (1D, uchar4 palette entry per index), both declared outside this
// block.
// NOTE(review): the +0.5f texel-centre arithmetic assumes texImage uses
// unnormalized coordinates with point sampling -- confirm against the texture
// setup code.
__global__ void render_kernel(unsigned int *pDst, int nWindowSizeX, int nWindowSizeY, float fPanX, float fPanY, float fZoom)
{
    const int ix = blockDim.x * blockIdx.x + threadIdx.x;
    const int iy = blockDim.y * blockIdx.y + threadIdx.y;

    // Threads past the window edge have nothing to do. Returning early is safe
    // because the kernel contains no block-wide barrier.
    if (ix >= nWindowSizeX || iy >= nWindowSizeY)
    {
        return;
    }

    // Sample position in source-image coordinates (pixel centre of the output).
    const float x = ((float)ix + fPanX + 0.5f) / fZoom;
    const float y = ((float)iy + fPanY + 0.5f) / fZoom;

    // Coordinates of the 2x2 texel neighbourhood around the sample position.
    const float x0 = floorf(x - 0.5f) + 0.5f;
    const float y0 = floorf(y - 0.5f) + 0.5f;
    const float x1 = x0 + 1.0f;
    const float y1 = y0 + 1.0f;

    // Palette indices of the four neighbours.
    const unsigned short lx0y0 = tex2D(texImage, x0, y0).x;
    const unsigned short lx1y0 = tex2D(texImage, x1, y0).x;
    const unsigned short lx0y1 = tex2D(texImage, x0, y1).x;
    const unsigned short lx1y1 = tex2D(texImage, x1, y1).x;

    // No __syncthreads() here: threads never exchange data, and a barrier
    // placed after the bounds check would be reached by only part of an edge
    // block whenever the window size is not a multiple of the block size,
    // which is undefined behaviour (and can hang the kernel).
    const uchar4 rgbx0y0 = tex1Dfetch(texImageLUT, lx0y0);
    const uchar4 rgbx1y0 = tex1Dfetch(texImageLUT, lx1y0);
    const uchar4 rgbx0y1 = tex1Dfetch(texImageLUT, lx0y1);
    const uchar4 rgbx1y1 = tex1Dfetch(texImageLUT, lx1y1);

    // Bilinear weights: distance from the (x0, y0) texel and its complement.
    const float wx1 = x - x0;
    const float wx0 = 1.0f - wx1;
    const float wy1 = y - y0;
    const float wy0 = 1.0f - wy1;

    // Blend each channel in float; the conversion to unsigned char truncates.
    const unsigned char b = (rgbx0y0.x * wx0 + rgbx1y0.x * wx1) * wy0 + (rgbx0y1.x * wx0 + rgbx1y1.x * wx1) * wy1;
    const unsigned char g = (rgbx0y0.y * wx0 + rgbx1y0.y * wx1) * wy0 + (rgbx0y1.y * wx0 + rgbx1y1.y * wx1) * wy1;
    const unsigned char r = (rgbx0y0.z * wx0 + rgbx1y0.z * wx1) * wy0 + (rgbx0y1.z * wx0 + rgbx1y1.z * wx1) * wy1;

    // Pack as 0x00BBGGRR (.x channel in bits 16-23, .z channel in bits 0-7)
    // and store into the vertically flipped row.
    pDst[nWindowSizeX * (nWindowSizeY - iy - 1) + ix] = (b << 16) + (g << 8) + r;
}
```