1D texture clamping doubt


I’m trying to use some 1D textures in a particle fluid simulation, but when I fetch from a coordinate that is out of range, the fetch does not clamp to the last valid element; instead it returns strange values. The code below reproduces the problem: if you run it, every printed value should be i + 1, but the last 3 values are not what you would expect if clamping were working correctly.

I know I’m missing some detail but I’m clueless.

Any help will be highly appreciated.

#include <cuda_runtime.h>

#include <cstdio>

// Number of elements in the host buffer, the device buffer and the texture data.
#define NUMVAL 128

// Threads per block used for the kernel launch.
#define BLOCKSIZE 64

// File-scope 1D texture reference returning raw unsigned int elements
// (cudaReadModeElementType: no conversion to normalized float on read).
texture<unsigned int, 1, cudaReadModeElementType> _dtex;

/**
 * Adds to each element of `val` the texel fetched from `_dtex` at an
 * offset of three elements past the thread's own position.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element of `val`.
 * There is no bounds check, so gridDim.x * blockDim.x must equal the
 * number of elements in `val`.
 *
 * Because of the +3 offset, the last three threads fetch coordinates past
 * the end of the data; the result of those fetches depends on the
 * addressing mode configured on `_dtex` by the host.
 *
 * @param val device array, updated in place
 */
__global__ void test(unsigned int *val){
    // Use the actual block dimension rather than a compile-time macro so the
    // index stays correct for any launch configuration.
    const unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

    // tex1D takes a float coordinate; make the conversion explicit.
    const float coord = static_cast<float>(pos + 3);

    val[pos] += tex1D(_dtex, coord);
}


/**
 * Host driver that reproduces the texture-clamping problem described in
 * this post.
 *
 * Steps:
 *   1. Fill a host buffer with 1s and upload it into a CUDA array.
 *   2. Bind the array to `_dtex` with clamp addressing, point filtering and
 *      unnormalized coordinates.
 *   3. Upload a buffer holding 0..NUMVAL-1 to device memory and run `test`,
 *      which adds the texel at (index + 3) to every element.
 *   4. Copy the result back, print it, and release every resource.
 *
 * With correct clamping every printed value would be i + 1.
 *
 * @return 0 on success, 1 if the kernel launch or the copy-back failed
 */
int main(int argc, char** argv){
    int exitCode = 0;

    // Texture contents: every texel is 1.
    unsigned int *_h_val = new unsigned int[NUMVAL];
    for(int i=0; i<NUMVAL; i++)
        _h_val[i] = 1;

    cudaArray *_ar_val;
    cudaChannelFormatDesc _chdesc = cudaCreateChannelDesc(32, 0,0,0, cudaChannelFormatKindUnsigned);

    // NOTE: cudaMallocArray takes the width in ELEMENTS, not in bytes.
    // Passing sizeof(unsigned int)*NUMVAL allocates an array several times
    // wider than the data, so coordinates NUMVAL..NUMVAL+2 are still inside
    // the array and return texels that were never written instead of being
    // clamped. This is the defect the post is about and it is kept here on
    // purpose; the corrected call is
    //     cudaMallocArray(&_ar_val, &_chdesc, NUMVAL, 1);
    cudaMallocArray(&_ar_val, &_chdesc, sizeof(unsigned int)*NUMVAL, 1);

    // The copy size, unlike the array width, IS expressed in bytes.
    cudaMemcpyToArray(_ar_val, 0, 0, _h_val, sizeof(unsigned int)*NUMVAL, cudaMemcpyHostToDevice);

    // Clamp out-of-range coordinates, no interpolation, integer texel coordinates.
    _dtex.addressMode[0] = cudaAddressModeClamp;
    _dtex.filterMode = cudaFilterModePoint;
    _dtex.normalized = false;
    cudaBindTextureToArray(_dtex, _ar_val, _chdesc);

    // Reuse the host buffer for the kernel input: val[i] = i.
    for(int i=0; i<NUMVAL; i++)
        _h_val[i] = i;

    unsigned int *_d_val;
    cudaMalloc(&_d_val, sizeof(unsigned int)*NUMVAL);
    cudaMemcpy(_d_val, _h_val, sizeof(unsigned int)*NUMVAL, cudaMemcpyHostToDevice);

    // One thread per element; NUMVAL is an exact multiple of BLOCKSIZE.
    dim3 dimBlock(BLOCKSIZE);
    dim3 dimGrid(NUMVAL/BLOCKSIZE);
    test<<<dimGrid, dimBlock>>>(_d_val);

    // Kernel launches do not return a status; query it explicitly.
    cudaError_t status = cudaGetLastError();
    if(status != cudaSuccess){
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(status));
        exitCode = 1;
    }

    // Blocking copy: also surfaces any error raised while the kernel ran.
    status = cudaMemcpy(_h_val, _d_val, sizeof(unsigned int)*NUMVAL, cudaMemcpyDeviceToHost);
    if(status != cudaSuccess){
        fprintf(stderr, "copy back failed: %s\n", cudaGetErrorString(status));
        exitCode = 1;
    }

    for(int i=0; i<NUMVAL; i++)
        printf("%i: %u\n", i, _h_val[i]);

    // Release device-side resources, which the original listing leaked.
    cudaUnbindTexture(_dtex);
    cudaFreeArray(_ar_val);
    cudaFree(_d_val);

    delete [] _h_val;

    return exitCode;
}


Silly me…

When allocating the CUDA array with

cudaMallocArray(&_ar_val, &_chdesc, sizeof(unsigned int)*NUMVAL, 1);

one has to pass the number of elements (width and height), not the size in bytes. So that line must be

cudaMallocArray(&_ar_val, &_chdesc, NUMVAL, 1);