tex1dfetch apparently returning incorrect value

In my CUDA application I noticed that tex1Dfetch is not returning the correct value, past a certain index in the buffer. An initial observation in the application was that a value at index 0 could be read correctly, but at 12705625, the value read was 0. I made a small test program to investigate this, given below. The results are a little bit baffling to me. I’m trying to probe at what index the values no longer are read correctly. But as the value arraySize is changed, so does the “firstBadIndex”. Even with arraySize =2, the second value is read incorrectly! As arraySize is made bigger, the firstBadIndex gets bigger. This happens when binding float, float2, or float4. If the data are read from the device buffer instead (switch around the commented lines in FetchTextureData), then everything is fine. This is using CUDA 6.5, on a Tesla c2075.
Thanks for any insights or advice you might have.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

#define FLOATTYPE float4
texture<FLOATTYPE,cudaTextureType1D,cudaReadModeElementType> texture1D;

const unsigned int arraySize = 1000;
FLOATTYPE* device;

__global__ void FetchTextureData(FLOATTYPE* data,FLOATTYPE* arr,int idx)
	data[0] = tex1Dfetch(texture1D, idx);
	//data[0] = arr[idx];

bool GetTextureValues(int idx){


	// copy to the host
	cudaError_t err = cudaMemcpy(hTemp,dTemp,sizeof(FLOATTYPE),cudaMemcpyDeviceToHost);
	if (err != cudaSuccess) {
		throw "cudaMemcpy failed!";

	if (cudaDeviceSynchronize() != cudaSuccess) {
		throw "cudaDeviceSynchronize failed!";

	return hTemp[0].x == 1.0f;

int main()

		host = new FLOATTYPE[arraySize];
		cudaError_t err = cudaMalloc((void**)&device,sizeof(FLOATTYPE) * arraySize);
		cudaError_t err1 = cudaMalloc((void**)&dTemp,sizeof(FLOATTYPE));
		if (err != cudaSuccess || err1 != cudaSuccess) {
			throw "cudaMalloc failed!";

		// make some host data
		for(unsigned int i=0; i<arraySize; i++){
			FLOATTYPE data = {1.0f, 0.0f, 0.0f, 0.0f};
			host[i] = data;

		// and copy it to the device
		err = cudaMemcpy(device,host,sizeof(FLOATTYPE) * arraySize,cudaMemcpyHostToDevice);
		if (err != cudaSuccess){
			throw "cudaMemcpy failed!";

		// set up the textures
		cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<FLOATTYPE>();
		texture1D.addressMode[0] = cudaAddressModeClamp;
		texture1D.filterMode = cudaFilterModePoint;
		texture1D.normalized = false;
		cudaBindTexture(NULL, texture1D, device, channelDesc, arraySize);

		// do a texture fetch and find where the fetches stop working
		int lastGoodValue = -1, firstBadValue = -1;
		float4 badValue = {-1.0f,0.0f,0.0f,0.0f};

		for(unsigned int i=0; i<arraySize; i++){

			if(i % 100000 == 0) printf("%d\n",i);
			bool isGood = GetTextureValues(i);

			if(firstBadValue == -1 && !isGood)
				firstBadValue = i;

				lastGoodValue = i;
				badValue = hTemp[0];

		printf("lastGoodValue %d, firstBadValue %d\n",lastGoodValue,firstBadValue);
		printf("Bad value is (%.2f)\n",badValue.x);

	}catch(const char* err){
		printf("\nCaught an error : %s\n",err);
    return 0;

What happens if you add error checking to the call to cudaBindTexture? If you run the app under control of cuda-memcheck, are any issues reported?

For what it is worth, robust code should never pass NULL as the first argument of cudaBindTexture(). Instead, examine the offset returned in the variable whose address is specified by the first argument to cudaBindTexture() to make sure it is zero.

Note that the size argument passed to cudaBindTexture() is the size of the bound storage in bytes, so your call should be something like:

size_t offset;
cudaBindTexture(&offset, texture1D, device, channelDesc, arraySize*sizeof (FLOATTYPE));

Thanks, that was the problem.