Correct use of cudaMallocArray for a 2D 64-bit texture is unclear

I want to access a 2D array of 64-bit doubles as a texture. What I thought would be the correct way of allocating the CUDA array for this does not seem to allocate enough memory.

The problem is that doubles cannot be fetched from textures directly, so one has to use the __hiloint2double trick, and it is not clear how this affects the way the CUDA array should be allocated.
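To make the layout explicit, here is a minimal host-side sketch of the reinterpretation I am relying on (my addition, assuming 32-bit ints and that a double and an int2 occupy the same 8 bytes, with the low word in .x and the high word in .y):

#include <stdio.h>
#include <string.h>

int
main (void)
{
  /* A double shares its 8 bytes with two 32-bit words; on the device,
     __hiloint2double (hi, lo) performs the reassembly step. */
  double d = 42.0, r;
  int words[2];                 /* words[0] = low word, words[1] = high word */
  memcpy (words, &d, sizeof (double));
  memcpy (&r, words, sizeof (double));
  printf ("%.1f\n", r);         /* prints 42.0 */
  return 0;
}

This is why the channel descriptor below uses two 32-bit signed channels: one int2 texel is supposed to match one double.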

Here is a complete code sample:

#include <stdio.h>

#define XDIM 3
#define YDIM 3
#define SIZE (XDIM * YDIM)

/* Doubles cannot be fetched from a texture directly, so the texture is
   declared as int2 and each fetch is reassembled into a double. */
texture < int2, 1 > tex1;

static __inline__ __device__ double
fetch_double (texture < int2, 1 > t, int i)
{
  int2 v = tex1Dfetch (t, i);
  return __hiloint2double (v.y, v.x);   /* high word in .y, low word in .x */
}

__global__ void
testker (double *out)
{
  out[threadIdx.x + threadIdx.y * XDIM] =
    fetch_double (tex1, threadIdx.x + threadIdx.y * XDIM);
}

int
main (void)
{
  cudaArray *cuA;
  double *out_d, out[SIZE], h_data[SIZE];
  int i, j;
  dim3 BlockDim (XDIM, YDIM, 1);

  /* One texel = two 32-bit signed channels = 8 bytes, the size of a double. */
  cudaChannelFormatDesc channelDesc =
    cudaCreateChannelDesc (32, 32, 0, 0, cudaChannelFormatKindSigned);

  cudaMallocArray (&cuA, &channelDesc, XDIM, YDIM);
  cudaMalloc ((void **) &out_d, sizeof (double) * SIZE);

  for (j = 0; j < YDIM; j++)
    {
      for (i = 0; i < XDIM; i++)
        {
          h_data[i + j * XDIM] = (double) (i + j * XDIM);
        }
    }

  cudaMemcpyToArray (cuA, 0, 0, h_data, sizeof (double) * SIZE,
                     cudaMemcpyHostToDevice);
  cudaBindTextureToArray (tex1, cuA, channelDesc);

  testker <<< 1, BlockDim >>> (out_d);

  cudaMemcpy (out, out_d, sizeof (double) * SIZE, cudaMemcpyDeviceToHost);

  for (j = 0; j < YDIM; j++)
    {
      for (i = 0; i < XDIM; i++)
        {
          printf ("%.0f ", out[i + j * XDIM]);
        }
      printf ("\n");
    }

  return 0;
}

This should return:

0 1 2
3 4 5
6 7 8

but instead returns:

0 1 2
0 0 0
0 0 0

If the CUDA array is allocated as (3 * XDIM, YDIM), however, the code does produce the right answer.
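For reference, here is the size arithmetic as I understand it (a sketch reusing cuA and channelDesc from the sample above, assuming each (32, 32)-bit texel occupies 8 bytes, i.e. one double):

/* 3 x 3 texels * 8 bytes = 72 bytes: produces the wrong result */
cudaMallocArray (&cuA, &channelDesc, XDIM, YDIM);

/* 9 x 3 texels * 8 bytes = 216 bytes: produces the right result */
cudaMallocArray (&cuA, &channelDesc, 3 * XDIM, YDIM);

Since cudaMemcpyToArray copies exactly sizeof (double) * SIZE = 72 bytes, the smaller allocation already looks big enough for the data, so I do not see why tripling the width should be necessary.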