Texture alignment issue?

Hello,

I am writing a raytracer in CUDA. I currently pass my triangles in as an array of structs like this:

struct tri_t
{
    float3 v[ 3 ];   // Triangle vertices
    float3 n;        // Face normal
    float  pad[ 4 ]; // Pad to 16 dwords (64 bytes)
};
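
(To rule out layout surprises, I also added a compile-time size check; this is just the old negative-array-size trick, since my compiler predates static_assert:)

// Fails to compile if tri_t is not exactly 16 dwords (64 bytes).
typedef char tri_t_size_check[ ( sizeof( tri_t ) == 64 ) ? 1 : -1 ];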

If I fetch the data into the kernel using plain global reads, it works. I have converted the code to use textures for better performance. It compiles and runs correctly in emulation mode, but when I run it on the GPU I get a corrupted image. I suspect I am hitting some texture alignment issue in the hardware. Here’s how I set it up:

I declare a global texture variable:

texture<float, 1, cudaReadModeElementType> tex;

Then in my main…

// Is this correct?
cudaChannelFormatDesc chanDesc = cudaCreateChannelDesc( 32, 0, 0, 0,
                                                        cudaChannelFormatKindFloat );
cudaArray* cu_array;

// Allocate 16 dwords (one float texel each) per triangle
CUDA_SAFE_CALL( cudaMallocArray( &cu_array, &chanDesc, 16 * num_tris, 1 ) );

// Copy 16 dwords (64 bytes) per triangle to the array
CUDA_SAFE_CALL( cudaMemcpyToArray( cu_array, 0, 0, tris, 64 * num_tris,
                                   cudaMemcpyHostToDevice ) );

// set texture parameters
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
tex.filterMode = cudaFilterModePoint;
tex.normalized = false;

// Bind the array to the texture
CUDA_SAFE_CALL( cudaBindTextureToArray( tex, cu_array, chanDesc));
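
(For comparison, I believe the linear-memory route, which skips the cudaArray entirely and uses tex1Dfetch on the kernel side, would look roughly like this. Untested sketch, based on my reading of the programming guide:)

texture<float, 1, cudaReadModeElementType> linTex;

float* d_tris = 0;
size_t bytes  = 64 * num_tris; // 16 dwords per triangle
CUDA_SAFE_CALL( cudaMalloc( (void**)&d_tris, bytes ) );
CUDA_SAFE_CALL( cudaMemcpy( d_tris, tris, bytes, cudaMemcpyHostToDevice ) );
// Bind plain device memory; fetched with tex1Dfetch( linTex, int_index )
CUDA_SAFE_CALL( cudaBindTexture( 0, linTex, d_tris, bytes ) );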

And finally, in my kernel, I fetch the individual dwords of each triangle as follows:

for ( int i = 0; i < num_tris; ++i )
{
    tri_t t;
    int addr = i * 16; // 16 dwords per triangle
    t.v[ 0 ].x = tex1D( tex, addr + 0 );
    t.v[ 0 ].y = tex1D( tex, addr + 1 );
    t.v[ 0 ].z = tex1D( tex, addr + 2 );
    t.v[ 1 ].x = tex1D( tex, addr + 3 );
    t.v[ 1 ].y = tex1D( tex, addr + 4 );
    t.v[ 1 ].z = tex1D( tex, addr + 5 );
    t.v[ 2 ].x = tex1D( tex, addr + 6 );
    t.v[ 2 ].y = tex1D( tex, addr + 7 );
    t.v[ 2 ].z = tex1D( tex, addr + 8 );
    t.n.x = tex1D( tex, addr + 9 );
    t.n.y = tex1D( tex, addr + 10 );
    t.n.z = tex1D( tex, addr + 11 );
    // ... intersection test against t goes here ...
}
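
(I also wondered about packing the fetches into float4s to cut the fetch count from 12 to 3. With my current layout, and assuming a second texture declared as texture<float4, 1, cudaReadModeElementType> tex4 bound to the same linear buffer as in the sketch above, I think the unpack would look something like this, if I have the swizzling right:)

// Each triangle is four float4 texels; the fourth is pure padding.
int base = i * 4;
float4 f0 = tex1Dfetch( tex4, base + 0 ); // v0.xyz, v1.x
float4 f1 = tex1Dfetch( tex4, base + 1 ); // v1.yz,  v2.xy
float4 f2 = tex1Dfetch( tex4, base + 2 ); // v2.z,   n.xyz
t.v[ 0 ] = make_float3( f0.x, f0.y, f0.z );
t.v[ 1 ] = make_float3( f0.w, f1.x, f1.y );
t.v[ 2 ] = make_float3( f1.z, f1.w, f2.x );
t.n      = make_float3( f2.y, f2.z, f2.w );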

I am running this on a G80.

Any help would be much appreciated.