I’m seeing a serious performance problem with the following code on a GTX 460. I’m starting to suspect it is related to the fact that this chip has only 2 warp schedulers per SM where it really needs 3.
[codebox]
#define BLOCK_SIZE_X 32
#define BLOCK_SIZE_Y 16
#define BLOCKS_X 32
#define BLOCKS_Y 64
#define VOXELS_X 1024
#define VOXELS_Y 1024
#define VOXELS_Z 128
#define RCP_VOXELS_X 0.0009765625f // 1/1024
#define RCP_VOXELS_Y 0.0009765625f // 1/1024
#define RCP_VOXELS_Z 0.0078125f // 1/128
texture<float, 2, cudaReadModeElementType> tex1;
texture<float, 2, cudaReadModeElementType> tex2;
texture<float, 2, cudaReadModeElementType> tex3;
texture<float, 2, cudaReadModeElementType> tex4;
__global__ void kernelSample4(float *sptr, float *eptr)
{
unsigned int x = blockIdx.x * BLOCK_SIZE_X + threadIdx.x;
unsigned int y = blockIdx.y * BLOCK_SIZE_Y + threadIdx.y;
unsigned int i = (blockIdx.y * BLOCKS_X + blockIdx.x) * BLOCK_SIZE_X * BLOCK_SIZE_Y + (threadIdx.y * BLOCK_SIZE_X) + threadIdx.x;
float u = (x + 0.5f) * RCP_VOXELS_X;
float v = 0.5f * RCP_VOXELS_Z;
float u1 = (x + 0.2f) * RCP_VOXELS_X;
float v1 = 0.2f;
float u2 = (x + 0.4f) * RCP_VOXELS_X;
float v2 = 0.4f;
float u3 = (x + 0.6f) * RCP_VOXELS_X;
float v3 = 0.6f;
float u4 = (x + 0.8f) * RCP_VOXELS_X;
float v4 = 0.8f;
volatile float *ptr = sptr + i; // per-thread element; volatile forces the read-modify-write each iteration
while(ptr < eptr)
{
float accum = *ptr;
// This code performs at 37.6 GT/s on GTX 460
//accum += tex2D(tex1, u1, v1);
//accum += tex2D(tex2, u2, v2);
//accum += tex2D(tex3, u3, v3);
//accum += tex2D(tex4, u4, v4);
// This code performs at 18.8 GT/s on GTX 460
accum += tex2D(tex1, u1, v);
accum += tex2D(tex2, u2, v);
accum += tex2D(tex3, u3, v);
accum += tex2D(tex4, u4, v);
*ptr = accum;
v += RCP_VOXELS_Z;
ptr += VOXELS_X * VOXELS_Y;
}
}
extern "C" void cudaSample4(float *slices, cudaArray *ptr1, cudaArray *ptr2, cudaArray *ptr3, cudaArray *ptr4)
{
cudaChannelFormatDesc channelDesc1 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
tex1.addressMode[0] = cudaAddressModeClamp;
tex1.addressMode[1] = cudaAddressModeClamp;
tex1.filterMode = cudaFilterModeLinear;
tex1.normalized = true;
cudaBindTextureToArray(tex1, ptr1, channelDesc1);
cudaChannelFormatDesc channelDesc2 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
tex2.addressMode[0] = cudaAddressModeClamp;
tex2.addressMode[1] = cudaAddressModeClamp;
tex2.filterMode = cudaFilterModeLinear;
tex2.normalized = true;
cudaBindTextureToArray(tex2, ptr2, channelDesc2);
cudaChannelFormatDesc channelDesc3 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
tex3.addressMode[0] = cudaAddressModeClamp;
tex3.addressMode[1] = cudaAddressModeClamp;
tex3.filterMode = cudaFilterModeLinear;
tex3.normalized = true;
cudaBindTextureToArray(tex3, ptr3, channelDesc3);
cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
tex4.addressMode[0] = cudaAddressModeClamp;
tex4.addressMode[1] = cudaAddressModeClamp;
tex4.filterMode = cudaFilterModeLinear;
tex4.normalized = true;
cudaBindTextureToArray(tex4, ptr4, channelDesc4);
dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y, 1);
dim3 dimGrid(BLOCKS_X, BLOCKS_Y);
kernelSample4<<< dimGrid, dimBlock, 0 >>>(slices, slices + VOXELS_X * VOXELS_Y * VOXELS_Z);
cudaUnbindTexture(tex1);
cudaUnbindTexture(tex2);
cudaUnbindTexture(tex3);
cudaUnbindTexture(tex4);
}
[/codebox]
The use of normalized coordinates for the texture sampling means I can make the textures as small or as large as I like to study the effect of texture cache hits/misses. For the moment I have made each texture 64 x 8 floats, so the four textures total just 8 KB (4 x 64 x 8 x 4 bytes).
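For completeness, here is roughly how the buffers are set up (illustrative sketch; the makeTestArray helper name and the constant fill value are placeholders, and error checking is omitted):
[codebox]
// Hypothetical helper: builds one 64 x 8 float cudaArray (2 KB) of test data.
cudaArray *makeTestArray()
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *arr = 0;
cudaMallocArray(&arr, &desc, 64, 8);
float host[64 * 8];
for (int i = 0; i < 64 * 8; ++i)
host[i] = 1.0f; // arbitrary test value
cudaMemcpyToArray(arr, 0, 0, host, sizeof(host), cudaMemcpyHostToDevice);
return arr;
}
// 1024 x 1024 x 128 floats = 512 MB of slices, zeroed before the run.
void runTest()
{
size_t bytes = (size_t)VOXELS_X * VOXELS_Y * VOXELS_Z * sizeof(float);
float *slices = 0;
cudaMalloc((void **)&slices, bytes);
cudaMemset(slices, 0, bytes);
cudaSample4(slices, makeTestArray(), makeTestArray(), makeTestArray(), makeTestArray());
}
[/codebox]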
On a GTX 260 (216-core version) the code achieves a texture fill rate of 41.5 GT/s, which is basically the quoted limit of the hardware, and it uses 83.1 GB/s of memory bandwidth.
On a GTX 460 the code achieves a texture fill rate of 18.8 GT/s, which is basically HALF the quoted limit of the hardware, and it uses 37.6 GB/s of memory bandwidth. This leads me to conclude that half the texture units are completely idle. Curiously, if I use the commented-out code instead, the texture fill rate immediately doubles to 37.6 GT/s.
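For reference, the numbers line up: each loop iteration performs 4 tex2D fetches plus 8 bytes of global traffic (a 4-byte read and a 4-byte write), i.e. 2 bytes of DRAM traffic per texel, so the bandwidth figures are exactly double the fill rates (41.5 x 2 = 83.1, 18.8 x 2 = 37.6). The fill rate itself comes from timing a launch, roughly like this (illustrative sketch using cudaEvent timing, dropped into the same .cu file):
[codebox]
#include <stdio.h>
// Times one launch and converts it to fill rate / bandwidth.
void measure(float *slices, cudaArray *a1, cudaArray *a2, cudaArray *a3, cudaArray *a4)
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaSample4(slices, a1, a2, a3, a4);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
// Grid = 32 x 64 blocks of 32 x 16 threads = 1,048,576 threads;
// each runs 128 loop iterations of 4 fetches:
// 1,048,576 * 128 * 4 = 536,870,912 texels per launch.
double texels = 1048576.0 * 128.0 * 4.0;
double gts = texels / (ms * 1.0e-3) / 1.0e9; // fill rate, GT/s
printf("%.1f GT/s, %.1f GB/s\n", gts, gts * 2.0);
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
[/codebox]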
I’m using CUDA 3.1 and Windows XP Pro 64-bit (although I’m currently compiling as 32-bit).