I’m using a uchar4 texture reference, but for some reason it doesn’t work.

I defined the CUDA array and the texture as shown in the following code snippet. Strangely, the kernels then stop doing any work at all (for example, a printf in emulation mode, or a dummy output assignment such as d_odata[idx] = 255, has no effect). Even if I don’t call tex2D anywhere in the kernels, they still behave the same way: they do nothing, and no error is reported.

If I simply comment out the cudaBindTextureToArray call, the kernels behave normally again (the printf or the dummy output assignment works).

Afterwards, when I changed the texture to a float4 one, everything also worked normally.

  • Dev Environment: Windows XP SP2 with MS Visual Studio 2005

  • CUDA version: 1.0

  • Hardware: AMD dual-core Opteron 2142, GeForce 8800GTS

// Define the texture reference: 2D, uchar4 texels, returned unconverted
// (cudaReadModeElementType).
texture<uchar4, 2, cudaReadModeElementType> tex;

// Describe the texel layout: four 8-bit channels. For a uchar4 texture the
// format kind has to be cudaChannelFormatKindUnsigned so that the array
// layout agrees with the texture reference's element type.
cudaChannelFormatDesc channelDesc =
    cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);

// Allocate a width x height CUDA array with that layout.
// NOTE(review): every runtime call below returns a cudaError_t that is
// currently ignored; check it so a failing call does not go unnoticed.
cudaArray* cu_array;
cudaMallocArray(&cu_array, &channelDesc, width, height);

// Upload the host image (uchar* h_data has been initialized properly and is
// expected to hold width * height uchar4 texels).
cudaMemcpyToArray(cu_array, 0, 0, h_data, width * height * sizeof(uchar4),
                  cudaMemcpyHostToDevice);

// Bind the array to the texture reference.
cudaBindTextureToArray(tex, cu_array, channelDesc);


// Texture reference: 2D, uchar4 texels, returned unconverted
// (cudaReadModeElementType).
texture<uchar4, 2, cudaReadModeElementType> tex;

// CUDA array that backs the texture.
cudaArray* array;

// Launch configuration: 32 x 12 threads per block, with enough blocks in each
// dimension to cover width x height (rounded up when the size is not a
// multiple of the block size).
threads = dim3(32, 12, 1);
blocks = dim3((width + threads.x - 1) / threads.x,
              (height + threads.y - 1) / threads.y, 1);

// Channel description for uchar4: four 8-bit unsigned channels.
cudaChannelFormatDesc channelDesc =
    cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);

// Allocate the array, upload the host pixels (4 bytes per texel), and bind
// the array to the texture reference. Each call is wrapped in CUDA_SAFE_CALL.
CUDA_SAFE_CALL(cudaMallocArray(&array, &channelDesc, width, height));
CUDA_SAFE_CALL(cudaMemcpyToArray(array, 0, 0, pixels, 4 * width * height,
                                 cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaBindTextureToArray(tex, array, channelDesc));


// Copies texels from the texture reference `tex` into a linear output buffer.
//
// Each thread derives one (x, y) coordinate from its 2D block and thread
// indices, fetches tex2D(tex, x, y), and stores it at texOut[y * width + x].
//
//   texOut - output buffer, written at index y * width + x
//            (assumes room for width * height uchar4 elements - confirm
//            against the caller's allocation)
//   width  - number of columns to copy
//   height - number of rows to copy
//
// Expects a 2D launch (uses blockIdx/blockDim/threadIdx in x and y).
__global__ void textureCopy(uchar4* texOut, int width, int height) {
	// __mul24 is the 24-bit integer multiply intrinsic; the operands here are
	// block/thread indices and image dimensions.
	int x = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
	int y = __mul24(blockIdx.y, blockDim.y) + threadIdx.y;

	// Threads whose coordinate falls outside width x height do nothing.
	if (x < width && y < height) {
		texOut[__mul24(y, width) + x] = tex2D(tex, x, y);
	}
}



Kernel launch:

// Run textureCopy with `blocks` as the grid dimensions and `threads` as the
// block dimensions, writing into d_texOut.
textureCopy<<<blocks, threads>>>(d_texOut, width, height);

// Check the launch through the CUT_CHECK_ERROR macro, passing the message to
// use on failure.
// NOTE(review): presumably the cutil helper macro - confirm in which build
// configurations it is active.
CUT_CHECK_ERROR("Kernel execution failed");