I’m trying to use CUDA to resample a 16-bit indexed-color image, but I just can’t seem to get good performance. I’m guessing that the problem has to do with memory latency, because I don’t think this is a very computationally intensive task. I’ve been able to do the same thing in Direct3D about three times faster.
By my estimate, I’m getting about 1.5 GB/s of memory read throughput.
Can anyone suggest what I’m doing wrong, please?
// 2D texture reference over the source image: each texel is a single
// unsigned short, returned unconverted (cudaReadModeElementType).
// render() binds it to a cudaArray before launching render_kernel.
texture<ushort1, 2, cudaReadModeElementType> texImage;
// 1D texture reference over the lookup table: each entry is a uchar4 that
// render_kernel fetches by integer index with tex1Dfetch.
// render() binds it to 65536 * 4 bytes starting at pImageLUT.
texture<uchar4, 1, cudaReadModeElementType> texImageLUT;
// Blends the four neighbouring values of one colour channel with the given
// bilinear weights and quantises the result to the range 0..255.
//
// The result is rounded to nearest rather than truncated: because the weights
// are computed in floating point, blending four identical values can land a
// hair below the exact value (e.g. 254.99998), which truncation would turn
// into a visible off-by-one and a systematic darkening bias.
static __device__ __forceinline__ unsigned int blendChannel(
    unsigned char c00, unsigned char c10, unsigned char c01, unsigned char c11,
    float wx0, float wx1, float wy0, float wy1)
{
    float v = (c00 * wx0 + c10 * wx1) * wy0 + (c01 * wx0 + c11 * wx1) * wy1;
    unsigned int q = __float2uint_rn(v);
    // Clamp so a rounding overshoot can never spill into the next channel's bits.
    return q > 255u ? 255u : q;
}

// Resamples the indexed image bound to texImage into pDst, one thread per
// output pixel, interpolating bilinearly between the looked-up colours (the
// indices themselves cannot be interpolated, so the four neighbours are
// fetched and mapped through texImageLUT individually).
//
// Launch layout: 2D grid of 2D blocks covering at least
// nWindowSizeX x nWindowSizeY threads; surplus threads exit immediately.
// No shared memory is used.
//
//   pDst          output buffer of nWindowSizeX * nWindowSizeY pixels
//   nWindowSizeX  output width in pixels
//   nWindowSizeY  output height in pixels
//   fPanX, fPanY  offset subtracted from the scaled pixel position
//   fZoom         scale factor; the pixel position is divided by it
//
// NOTE(review): the +0.5 sample positions assume texImage uses unnormalized
// coordinates with point filtering; neither is set in the visible code.
__global__ void render_kernel(unsigned int *pDst, int nWindowSizeX, int nWindowSizeY, float fPanX, float fPanY, float fZoom)
{
    int ix = blockDim.x * blockIdx.x + threadIdx.x;
    int iy = blockDim.y * blockIdx.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so edge threads may fall outside.
    if (ix >= nWindowSizeX || iy >= nWindowSizeY)
        return;

    // Position of this output pixel's centre in source-image coordinates.
    float x = ((float)ix + 0.5f) / fZoom - fPanX;
    float y = ((float)iy + 0.5f) / fZoom - fPanY;

    // Half-integer coordinates of the 2x2 neighbourhood surrounding (x, y).
    float x0 = floorf(x - 0.5f) + 0.5f;
    float y0 = floorf(y - 0.5f) + 0.5f;
    float x1 = x0 + 1.0f;
    float y1 = y0 + 1.0f;

    // Palette indices of the four neighbours.
    unsigned short lx0y0 = tex2D(texImage, x0, y0).x;
    unsigned short lx1y0 = tex2D(texImage, x1, y0).x;
    unsigned short lx0y1 = tex2D(texImage, x0, y1).x;
    unsigned short lx1y1 = tex2D(texImage, x1, y1).x;

    // Colours of the four neighbours.
    uchar4 rgbx0y0 = tex1Dfetch(texImageLUT, lx0y0);
    uchar4 rgbx1y0 = tex1Dfetch(texImageLUT, lx1y0);
    uchar4 rgbx0y1 = tex1Dfetch(texImageLUT, lx0y1);
    uchar4 rgbx1y1 = tex1Dfetch(texImageLUT, lx1y1);

    // Bilinear weights: fractional distance from the (x0, y0) sample.
    float wx1 = x - x0;
    float wx0 = 1.0f - wx1;
    float wy1 = y - y0;
    float wy0 = 1.0f - wy1;

    unsigned int b = blendChannel(rgbx0y0.x, rgbx1y0.x, rgbx0y1.x, rgbx1y1.x, wx0, wx1, wy0, wy1);
    unsigned int g = blendChannel(rgbx0y0.y, rgbx1y0.y, rgbx0y1.y, rgbx1y1.y, wx0, wx1, wy0, wy1);
    unsigned int r = blendChannel(rgbx0y0.z, rgbx1y0.z, rgbx0y1.z, rgbx1y1.z, wx0, wx1, wy0, wy1);

    // Rows are written bottom-up (output is flipped vertically). The index is
    // computed in size_t so very large windows cannot overflow int.
    size_t dstRow = (size_t)(nWindowSizeY - iy - 1);
    // Packed as .x in bits 16-23, .y in bits 8-15, .z in bits 0-7; top byte 0.
    pDst[dstRow * (size_t)nWindowSizeX + (size_t)ix] = (b << 16) + (g << 8) + r;
}
// Host entry point: binds the source image and its lookup table to the
// texture references, launches render_kernel over the whole output window,
// then unbinds both textures again.
//
//   pDst          output buffer handed to render_kernel, which writes
//                 nWindowSizeX * nWindowSizeY unsigned ints into it
//   pImage        cudaArray holding the indexed source image
//   pImageLUT     lookup table; 65536 * 4 bytes of it are bound
//   nWindowSizeX  output width in pixels
//   nWindowSizeY  output height in pixels
//   fPanX, fPanY, fZoom  view parameters forwarded unchanged to the kernel
//
// Does nothing when the window is empty. If a texture bind fails the kernel
// is not launched; the function has no return value, so the caller can
// retrieve the failure with cudaGetLastError().
// The launch is asynchronous and this function does not synchronize.
extern "C" void render(unsigned int *pDst, cudaArray *pImage, unsigned int *pImageLUT, int nWindowSizeX, int nWindowSizeY, float fPanX, float fPanY, float fZoom)
{
    // A zero-sized grid is an invalid launch configuration.
    if (nWindowSizeX <= 0 || nWindowSizeY <= 0)
        return;

    // 32 threads wide so that each warp covers one contiguous run of pixels
    // in a single output row and its stores coalesce; with the previous 8x8
    // block every warp wrote four separate 8-pixel row fragments. 256 threads
    // per block also gives the scheduler more warps to hide texture latency.
    const int kBlockX = 32;
    const int kBlockY = 8;
    dim3 threads(kBlockX, kBlockY);
    dim3 grid((nWindowSizeX + kBlockX - 1) / kBlockX,
              (nWindowSizeY + kBlockY - 1) / kBlockY);

    // Never launch with an unbound texture: the kernel would read garbage.
    if (cudaBindTextureToArray(texImage, pImage) != cudaSuccess)
        return;
    if (cudaBindTexture(0, texImageLUT, pImageLUT, 65536 * 4) != cudaSuccess)
    {
        cudaUnbindTexture(texImage);
        return;
    }

    render_kernel<<<grid, threads>>>(pDst, nWindowSizeX, nWindowSizeY, fPanX, fPanY, fZoom);

    cudaUnbindTexture(texImageLUT);
    cudaUnbindTexture(texImage);
}