Hello everybody,
I’m quite confused about a bug in my current code. I have reduced it to a fairly short version (so don’t be surprised by unused arguments in kernel calls, etc.).
The code always works well if I don’t declare the second texture. If I declare it, in about 50% of the runs the last pixels aren’t copied properly by my kernel.
I also tried calling cudaThreadSynchronize() before and after each statement, but that didn’t help either.
Thanks for helping,
xlro.
// File-scope texture references used by the code below.
//
// tex:     2D texture of unsigned short texels, read without type conversion
//          (cudaReadModeElementType). SubtractBG_GPU binds it to the cudaArray
//          holding the input image; erodeBG samples it with tex2D().
// texMask: 1D texture of float texels. SubtractBG_GPU binds it to the mask
//          array; the erodeBG kernel shown here never samples it.
texture <unsigned short, 2, cudaReadModeElementType> tex; //texture for input image
texture <float, 1, cudaReadModeElementType> texMask; //texture for mask, size 10-20 elements
// Copies the image bound to the 2D texture `tex` into the linear buffer `out`.
//
// Launch layout: one block per image row (blockIdx.x selects the row). Inside
// a block each thread starts at column threadIdx.x and advances by blockDim.x
// until it passes the end of the row.
//
// out          - destination buffer, indexed as row * width + column
// width        - number of columns to copy per row
// parabelWidth - not used by this reduced version of the kernel
__global__ void erodeBG (unsigned short *out, int width, int parabelWidth) {
    const int row = blockIdx.x;
    int col = threadIdx.x;
    while (col < width) {
        // Fetch texel (col, row) and store it at the matching linear offset.
        out[row * width + col] = tex2D(tex, col, row);
        col += blockDim.x;
    }
}
/**
 * Uploads a 16-bit image to the GPU, runs the erodeBG kernel on it and copies
 * the kernel output back to the host.
 *
 * dataIn       - host pointer to width * height unsigned short input pixels
 * dataOut      - host pointer receiving width * height unsigned short pixels
 * width        - image width in pixels
 * height       - image height in pixels (one kernel block is launched per row)
 * parableWidth - the mask array is allocated with 2 * parableWidth + 1 elements
 *
 * Returns true. CUDA errors are reported through the CUDA_SAFE_CALL /
 * CUT_CHECK_ERROR macros, not through the return value.
 *
 * All device resources created here (both arrays, the output buffer and the
 * texture bindings) are released before returning, so repeated calls do not
 * accumulate device memory.
 */
extern "C" bool SubtractBG_GPU (unsigned short *dataIn, unsigned short* dataOut, unsigned int width, unsigned int height, unsigned int parableWidth) {
    // An empty image has nothing to copy; returning early also avoids
    // launching a kernel with zero blocks.
    if (width == 0 || height == 0) {
        return true;
    }

    // Compute the size in size_t so large images do not overflow 32-bit math.
    const size_t imaSizeBytes = (size_t) width * height * sizeof(unsigned short);

    //allocate GPU Array for input image and copy input image to GPU
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsigned);
    cudaArray* inputArray;
    CUDA_SAFE_CALL(cudaMallocArray( &inputArray, &channelDesc, width, height));
    CUDA_SAFE_CALL(cudaMemcpyToArray( inputArray, 0, 0, dataIn, imaSizeBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaBindTextureToArray( tex, inputArray, channelDesc));

    //allocate GPU Memory for output image
    unsigned short *gpu_dataOut;
    CUDA_SAFE_CALL(cudaMalloc( (void**) &gpu_dataOut, imaSizeBytes));

    /*** allocate 2nd texture ***/
    //allocate GPU Array for masks
    cudaChannelFormatDesc channelDescMasks = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* maskArray;
    // NOTE(review): texMask is declared as a 1D texture; depending on the CUDA
    // version a height of 1 may create a 2D array (newer runtimes use height 0
    // for 1D arrays) - confirm against the toolkit in use.
    CUDA_SAFE_CALL(cudaMallocArray(&maskArray, &channelDescMasks, 2 * parableWidth + 1, 1));
    // Bind the arrays to textures
    CUDA_SAFE_CALL(cudaBindTextureToArray( texMask, maskArray, channelDescMasks));
    /*** end of allocation ***/

    //execute Kernel 1: one block per image row, 64 threads per block
    erodeBG<<< height, 64, 0 >>>(gpu_dataOut, width, parableWidth);
    CUT_CHECK_ERROR("Error while subtracting background");

    //copy the result back to the host
    CUDA_SAFE_CALL(cudaMemcpy( dataOut, gpu_dataOut, imaSizeBytes, cudaMemcpyDeviceToHost) );

    //release everything allocated or bound above
    CUDA_SAFE_CALL(cudaUnbindTexture(tex));
    CUDA_SAFE_CALL(cudaUnbindTexture(texMask));
    CUDA_SAFE_CALL(cudaFreeArray(inputArray));
    CUDA_SAFE_CALL(cudaFreeArray(maskArray));
    CUDA_SAFE_CALL(cudaFree(gpu_dataOut));

    return true;
}