Allocation of second texture ruins code?

Hello everybody,

I’m quite confused about a bug in my current code. I have reduced it to a fairly short version (so don’t be surprised by unused arguments in kernel calls, etc.).

The code always works well if I don’t declare the second texture. If I declare it, in 50% of cases the last pixels aren’t copied properly by my kernel.

I also tried calling cudaThreadSynchronize() before and after each statement, but that didn’t help either.

Thanks for helping,


// 2D texture reference over unsigned 16-bit pixels. SubtractBG_GPU binds it to
// the cudaArray holding the input image, and erodeBG reads it with tex2D().
texture <unsigned short, 2, cudaReadModeElementType> tex;   // input image

// 1D float texture reference for the mask. SubtractBG_GPU binds it to a
// cudaArray of 2 * parableWidth + 1 floats; the kernel shown here never reads it.
texture <float, 1, cudaReadModeElementType> texMask;    // mask, size 10-20 elements

/**
 * Copies the image bound to the 2D texture `tex` into `out`, one row per block.
 *
 * Launch layout: 1D grid with one block per image row (blockIdx.x is the row
 * index). The threads of a block walk along the row in steps of blockDim.x,
 * so any block size covers the full row. No shared memory is used.
 *
 * @param out           device buffer receiving the pixels, row-major with
 *                      `width` elements per row; must hold gridDim.x * width
 *                      elements
 * @param width         number of pixels per row
 * @param parabelWidth  not used by this reduced kernel; kept so the launch
 *                      signature stays unchanged
 */
__global__ void erodeBG (unsigned short *out, int width, int parabelWidth) {

    const int y = blockIdx.x;

    // Compute the row offset in size_t so y * width cannot overflow int on large images.
    const size_t rowStart = (size_t) y * width;

    for (int x = threadIdx.x; x < width; x += blockDim.x) {

        // Sample at the texel centre. Assuming the texture reference keeps its
        // default unnormalized coordinates and point sampling, this selects the
        // same texel as the bare integer coordinate did.
        out [rowStart + x] = tex2D(tex, x + 0.5f, y + 0.5f);

    }

}



/**
 * Uploads a 16-bit image to the GPU, runs erodeBG over it via the texture
 * `tex`, and copies the result back to the host.
 *
 * @param dataIn        host pointer to width * height input pixels (row-major)
 * @param dataOut       host pointer receiving width * height output pixels
 * @param width         image width in pixels
 * @param height        image height in pixels; one thread block is launched per row
 * @param parableWidth  half-width of the mask; the mask array holds
 *                      2 * parableWidth + 1 floats
 * @return true on success, false if a buffer is null or the image is empty.
 *         CUDA failures are reported through CUDA_SAFE_CALL / CUT_CHECK_ERROR.
 *
 * NOTE(review): in the CUDA SDK's cutil.h these macros are, as far as I know,
 * only active in debug builds - confirm, since a Release build would then run
 * without any error checking.
 */
extern "C" bool SubtractBG_GPU (unsigned short *dataIn, unsigned short* dataOut, unsigned int width, unsigned int height, unsigned int parableWidth) {

    // Nothing to process, and a zero-sized grid would be an invalid launch configuration.
    if (!dataIn || !dataOut || width == 0 || height == 0) {
        return false;
    }

    // Compute the byte count in size_t so it cannot be truncated by storing it in an int.
    const size_t imaSizeBytes = (size_t) width * height * sizeof(unsigned short);

    // allocate GPU array for input image and copy input image to GPU
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsigned);

    cudaArray* inputArray;

    CUDA_SAFE_CALL(cudaMallocArray( &inputArray, &channelDesc, width, height));

    CUDA_SAFE_CALL(cudaMemcpyToArray( inputArray, 0, 0, dataIn, imaSizeBytes, cudaMemcpyHostToDevice));

    CUDA_SAFE_CALL(cudaBindTextureToArray( tex, inputArray, channelDesc));

    // allocate GPU memory for output image
    unsigned short *gpu_dataOut;

    CUDA_SAFE_CALL(cudaMalloc( (void**) &gpu_dataOut, imaSizeBytes));

/*** allocate 2nd texture ***/

    // allocate GPU array for masks
    // NOTE(review): nothing is copied into maskArray in this function, so
    // texMask is bound to uninitialised memory; harmless only while no kernel
    // reads it.
    cudaChannelFormatDesc channelDescMasks = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

    cudaArray* maskArray;

    CUDA_SAFE_CALL(cudaMallocArray(&maskArray, &channelDescMasks, 2 * parableWidth + 1, 1));

    // bind the mask array to its texture
    CUDA_SAFE_CALL(cudaBindTextureToArray( texMask, maskArray, channelDescMasks));

/*** end of allocation ***/

    // execute kernel 1: one block per image row, 64 threads striding across the row
    erodeBG<<< height, 64, 0 >>>(gpu_dataOut, width, parableWidth);

    CUT_CHECK_ERROR("Error while subtracting background");

    // Blocking copy: waits for the kernel to finish before reading its output.
    CUDA_SAFE_CALL(cudaMemcpy( dataOut, gpu_dataOut, imaSizeBytes, cudaMemcpyDeviceToHost) );

    // Release everything acquired above; otherwise every call leaks two
    // cudaArrays and the output buffer and leaves both textures bound to
    // arrays that would be dangling once freed.
    CUDA_SAFE_CALL(cudaUnbindTexture(texMask));

    CUDA_SAFE_CALL(cudaUnbindTexture(tex));

    CUDA_SAFE_CALL(cudaFreeArray(maskArray));

    CUDA_SAFE_CALL(cudaFreeArray(inputArray));

    CUDA_SAFE_CALL(cudaFree(gpu_dataOut));

    return true;

}


Any ideas?

I’m using a custom build rule, but the generated command line looks correct (to an nvcc beginner like me):

nvcc.exe -ccbin "C:\Programme\Microsoft Visual Studio 8\VC\bin" -c -Xcompiler "/EHsc /W3 /nologo /Wp64 /O2 /Zi /MT " -IC:\Programme\NVIDIA_Corporation\NVIDIA_CUDA_SDK\common\inc -o Release\SubtractBackgroundGPU.obj