I want to access a 2D 64-bit array as a texture. What I thought would be the correct way of allocating the Cuda Array for this does not seem to allocate enough memory.
The problem is that one has to use the __hiloint2double trick to use doubles in textures, but it is not clear how this affects the way the Cuda Array should be allocated.
Here is a code sample for you:
#include <stdio.h>
#define XDIM 3
#define YDIM 3
#define SIZE (XDIM * YDIM)
texture < int2, 1 > tex1;
static __inline__ __device__ double
fetch_double (texture < int2, 1 > t, int i)
{
int2 v = tex1Dfetch (t, i);
return __hiloint2double (v.y, v.x);
}
void __global__
testker (double *out)
{
*(out + threadIdx.x + threadIdx.y * XDIM) =
fetch_double (tex1, threadIdx.x + threadIdx.y * XDIM);
}
int
main (void)
{
cudaArray *cuA;
double *out_d, out, h_data;
int i, j;
dim3 BlockDim (XDIM, YDIM, 1);
cudaChannelFormatDesc channelDesc =
cudaCreateChannelDesc (32, 32, 0, 0, cudaChannelFormatKindSigned);
cudaMallocArray (&cuA, &channelDesc, XDIM, YDIM);
cudaMalloc ((void **) &out_d, sizeof (double) * SIZE);
for (j = 0; j < YDIM; j++)
{
for (i = 0; i < XDIM; i++)
{
h_data[i + j * XDIM] = (float) (i + j * XDIM);
}
}
cudaMemcpyToArray (cuA, 0, 0, h_data, sizeof (double) * SIZE,
cudaMemcpyHostToDevice);
cudaBindTextureToArray (tex1, cuA, channelDesc);
testker <<< 1, BlockDim >>> (out_d);
cudaMemcpy (out, out_d, sizeof (double) * SIZE, cudaMemcpyDeviceToHost);
for (j = 0; j < YDIM; j++)
{
for (i = 0; i < XDIM; i++)
{
printf ("%.0f ", out[i + j * XDIM]);
}
printf ("\n");
}
return 0;
}
This should return:
0 1 2
3 4 5
6 7 8
but instead returns:
0 1 2
0 0 0
0 0 0
If the Cuda Array is allocated as (3 * XDIM, YDIM), however, it does produce the right answer.