Why is there linear interpolation from this code?

I don’t understand why the initial array is interpolated to give an output in which the initial values have been shifted by 0.5.

I am trying to bind the initial array cuArray as a texture and, for the moment, simply read it into an output array, but as I say, the cuArray values are interpolated. Why is this, and how do I stop it?

[codebox]#define NTHREADS 128

#include <stdio.h>
#include <stdlib.h>

// 1D texture reference over float texels, returned as stored (no
// normalisation of the element type). Bound to a CUDA array in main().
texture<float, 1, cudaReadModeElementType> tex;

// Forward declaration: each thread fetches one sample from `tex` and writes
// it to d_out[threadIdx.x]. (`__global__` restored; the forum markup had
// stripped the underscores.)
__global__ void kernel(float *d_out);

//============================

// Exits with a diagnostic if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

// Uploads the ramp 0, 1, ..., NTHREADS-1 into a CUDA array, binds it to the
// texture reference `tex`, lets one thread per element fetch from the texture,
// and prints what the kernel read.
//
// Returns 0 on success; exits with status 1 if an allocation or CUDA call fails.
int main()
{
    const size_t bytes = sizeof(float) * NTHREADS;

    // Device buffer that receives one texture sample per thread.
    float *d_out = NULL;
    checkCuda(cudaMalloc((void**)&d_out, bytes), "cudaMalloc");

    // CUDA array that backs the texture.
    cudaArray* cuArray = NULL;
    checkCuda(cudaMallocArray(&cuArray, &tex.channelDesc, NTHREADS, 1),
              "cudaMallocArray");

    // Configure sampling before binding.
    // With linear filtering a fetch at coordinate x blends the two texels
    // around x - 0.5, so integer coordinates land halfway between neighbours
    // and a ramp comes back shifted by 0.5. Use cudaFilterModePoint here to
    // read the stored values without interpolation.
    tex.filterMode = cudaFilterModeLinear;
    checkCuda(cudaBindTextureToArray(tex, cuArray), "cudaBindTextureToArray");

    // Host data: element i holds the value i (i.e. the thread id).
    float *data = (float*)malloc(bytes);
    if (data == NULL) {
        fprintf(stderr, "malloc failed for input data\n");
        return 1;
    }
    for (int i = 0; i < NTHREADS; i++) data[i] = float(i);

    checkCuda(cudaMemcpyToArray(cuArray, 0, 0, data, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpyToArray");

    // One block, one thread per texel.
    kernel<<<1, NTHREADS>>>(d_out);
    checkCuda(cudaGetLastError(), "kernel launch");

    // Copy the samples back; this blocking copy also reports any error raised
    // while the kernel was executing.
    float *h_out = (float*)malloc(bytes);
    if (h_out == NULL) {
        fprintf(stderr, "malloc failed for output data\n");
        return 1;
    }
    checkCuda(cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy");

    for (int i = 0; i < NTHREADS; i++) printf("%f\n", h_out[i]);

    free(h_out);
    free(data);

    checkCuda(cudaUnbindTexture(tex), "cudaUnbindTexture");
    checkCuda(cudaFreeArray(cuArray), "cudaFreeArray");
    checkCuda(cudaFree(d_out), "cudaFree");

    return 0;
}

//===============================

// Reads one sample per thread from the texture reference `tex` and stores it
// in d_out.
//
// Launch layout: indexing uses threadIdx.x only, so this is meant for a single
// 1D block; d_out must hold at least blockDim.x floats.
//
// The fetch coordinate is the integer thread id. Whether that returns a stored
// texel or a blend of neighbours depends on the filter mode set on `tex` by
// the host.
__global__ void kernel(float *d_out)
{
    int threadid = threadIdx.x;

    float x = tex1D(tex, float(threadid));

    d_out[threadid] = x;
}

[/codebox]

See appendix D in the programming guide.

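Because it should be tex.filterMode = cudaFilterModePoint; not cudaFilterModeLinear. With linear filtering, a fetch at coordinate x blends the two texels around x − 0.5, so integer coordinates land halfway between neighbouring values — hence the shift of 0.5.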

But if I were to execute the kernel in a loop, do I need to bind and unbind inside the loop, or just once outside the loop?

If I do need to bind/unbind inside the loop then what are the consequences of not doing so?