Problems with texture fetching from CUDA arrays

Hi All,

Could anyone help me with this problem? I'm trying to run a test program in which the kernel simply fetches data from a 2D texture. It works fine for a 64x64 texture but fails for a 5120x5120 one. According to the manual, 5120 is still below the maximum allowed texture dimension of 2^15, so I have no clue why the code below does not work. Any ideas?

So everything is fine if WIDTH and HEIGHT are 64, for example, but with WIDTH and HEIGHT = 5120 the kernel crashes:

// Texture / output dimensions in texels. With float4 elements (16 bytes each),
// 5120 x 5120 is 26,214,400 elements = 400 MiB per buffer.
#define WIDTH 5120

#define HEIGHT 5120

#include <stdio.h>
#include <stdlib.h>

// File-scope 2D texture reference with float4 texels, read back unconverted
// (cudaReadModeElementType). main() binds it to a CUDA array; kernel() fetches from it.
texture<float4, 2, cudaReadModeElementType> tex;

// Copies the 2D texture bound to `tex` into a row-major linear buffer.
//
// Launch layout: 2D grid of 2D blocks; each thread handles the single texel at
// (blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y).
// Uses no shared memory.
//
// d_output: device buffer holding at least WIDTH * HEIGHT float4 elements;
//           texel (x, y) is written to d_output[WIDTH * y + x].
__global__ void kernel(float4* d_output)
{
   // Derive the global texel coordinates from the actual block shape rather
   // than a hard-coded 16, so the kernel stays correct for any launch config.
   int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
   int ty = __mul24(blockIdx.y, blockDim.y) + threadIdx.y;

   // Threads that fall outside the image (grid larger than the data) must not
   // write past the end of d_output.
   if (tx >= WIDTH || ty >= HEIGHT)
       return;

   float4 data = texfetch(tex, (float)tx, (float)ty);

   d_output[__mul24(WIDTH, ty) + tx] = data;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what)
{
   if (err == cudaSuccess)
       return true;
   fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
   return false;
}

// Test driver: fills a WIDTH x HEIGHT float4 image with 1.0f, uploads it into a
// CUDA array bound to `tex`, runs `kernel` to copy the texture into a linear
// device buffer, copies that buffer back and prints its first 10 elements.
//
// The device holds two buffers of `size` bytes at once (the CUDA array plus
// d_output), so large WIDTH/HEIGHT values can exhaust device memory. Every
// allocation and CUDA call is checked so that such a failure is reported where
// it happens instead of surfacing later as a kernel crash.
//
// Returns 0 on success, 1 if any allocation or CUDA call failed.
int main()
{
   // size_t: the byte count overflows int for larger (still legal) texture sizes.
   const size_t count = (size_t)WIDTH * HEIGHT;
   const size_t size = count * sizeof(float4);

   float4* input = (float4*)malloc(size);
   float4* output = (float4*)malloc(size);
   float4* d_output = NULL;
   cudaArray* cu_data = NULL;
   int status = 1;

   // Single-pass block: `break` jumps to the shared cleanup code below.
   do {
       if (input == NULL || output == NULL) {
           fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)size);
           break;
       }

       // Every component of every texel is set to 1.0f.
       for (size_t i = 0; i < count; i++)
           input[i].x = input[i].y = input[i].z = input[i].w = 1.0f;

       if (!cudaOk(cudaMalloc((void**)&d_output, size), "cudaMalloc(d_output)"))
           break;
       if (!cudaOk(cudaMallocArray(&cu_data, &tex.channelDesc, WIDTH, HEIGHT), "cudaMallocArray"))
           break;
       if (!cudaOk(cudaMemcpyToArray(cu_data, 0, 0, input, size, cudaMemcpyHostToDevice),
                   "cudaMemcpyToArray"))
           break;
       if (!cudaOk(cudaBindTexture(tex, cu_data), "cudaBindTexture"))
           break;

       // 16x16 threads per block; assumes WIDTH and HEIGHT are multiples of 16
       // so that the truncating division still covers every texel.
       dim3 block(16, 16);
       dim3 grid(WIDTH / 16, HEIGHT / 16);

       kernel <<< grid, block >>> (d_output);

       // A launch does not return an error itself: bad configurations show up
       // here, execution faults in the result of the blocking copy below.
       if (!cudaOk(cudaGetLastError(), "kernel launch"))
           break;
       if (!cudaOk(cudaMemcpy(output, d_output, size, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(device to host)"))
           break;

       for (int i = 0; i < 10; i++)
           printf("%f %f %f %f\n", output[i].x, output[i].y, output[i].z, output[i].w);

       status = 0;
   } while (0);

   // Release whatever was successfully allocated, on both success and failure paths.
   if (cu_data != NULL)
       cudaFreeArray(cu_data);
   if (d_output != NULL)
       cudaFree(d_output);
   free(output);
   free(input);

   return status;
}

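Please never mind: I miscalculated the required memory. Each 5120x5120 float4 buffer takes 400 MB, and the code allocates two of them on the device (the CUDA array and d_output).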