memory allocation bug?

When using several textures, I find that if I allocate device memory (using cudaMalloc) BEFORE allocating/binding the textures, writes to the device memory corrupt the textures.

If I allocate the device memory AFTER allocating/binding the textures, there is no corruption.

The problem consistently occurs every other run of the application, but consecutive calls to the kernel are fine, leading me to believe something isn’t getting allocated/deallocated properly.

(I suppose I could be doing something completely wrong in creating the textures… However, the program works perfectly in the emulator every time.)

Attached is a cuTexMemBug.cu file that produces the problem.

#define SHOWBUG 1 to cause the bug to appear (Line 12)
#define SHOWBUG 0 to cause it to disappear

It appears at least 3 textures are required before the problem manifests, so the code uses 4 textures.

Platform:
WinXP Pro SP2, Dell Precision 390
GeForce 8800 GTS as non-primary display
(GeForce FX 5200 as primary display, dualview mode)
nVidia driver 97.73
CUDA v0.8
VisualStudio 2005

For some reason I can’t attach a file to the BB post… code follows:

/**

* Texture Memory Bug?

* 27-Mar-2007  jhanweck

*/

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <cutil.h>

// SET THIS TO 0 TO CAUSE THE BUG TO DISAPPEAR

#define SHOWBUG 1

extern __shared__ float shmem[];

/**

* TEXTURES

*/

texture<float, 2, cudaReadModeElementType> tex1;

texture<float, 2, cudaReadModeElementType> tex2;

texture<float, 2, cudaReadModeElementType> tex3;

texture<float, 2, cudaReadModeElementType> tex4;

/**

* Helper function for testing near equality of two floats within epsilon.

*/

__device__ bool feqf(float x1, float x2, float eps)

{

	return fabsf(x2 - x1) < eps;

}

/**

* KERNEL

*

* This kernel fetches values from texture memory

* and compares the values to their correct values.

* Returns bits indicating which texture fetch did not compare.

*/

__global__ void kernel_TestTexFetch2(

	unsigned int *g_failed

	)

{

	const unsigned int nThreads = blockDim.x;

	const unsigned int tid = threadIdx.x;

	const unsigned int bidX = blockIdx.x;

	const unsigned int bidY = blockIdx.y;

	const unsigned int bid = bidY * gridDim.x + bidX;

	const float EPS = 1.0e-5;

	__shared__ unsigned int bitsTex1, bitsTex2, bitsTex3, bitsTex4;

	if (tid == 0)

	{

  bitsTex1 = 0; 

  bitsTex2 = 0;

  bitsTex3 = 0;

  bitsTex4 = 0;

	}

	__syncthreads();

	float fTex1 = texfetch(tex1, tid, bidY);

	if (!feqf(fTex1, 0.1f + (float) tid / nThreads, EPS))

  bitsTex1 = 0x0001;

	float fTex2 = texfetch(tex2, tid, bidY);

	if (!feqf(fTex2, 0.2f + (float) tid / nThreads, EPS))

  bitsTex2 = 0x0002;

	float fTex3 = texfetch(tex3, tid, bidY);

	if (!feqf(fTex3, 0.3f + (float) tid / nThreads, EPS))

  bitsTex3 = 0x0004;

	float fTex4 = texfetch(tex4, tid, bidY);

	if (!feqf(fTex4, 0.4f + (float) tid / nThreads, EPS))

  bitsTex4 = 0x0008;

	__syncthreads();

	if (tid == 0)

	{

  g_failed[bid] = bitsTex1 | bitsTex2 | bitsTex3 | bitsTex4;

	}

	__syncthreads();

}

/**

* Texture memory bug:

* Many textures seem to create instabilities?

*/

int main( int argc, char** argv) 

{

    CUT_CHECK_DEVICE();

	const int nThreads = 64;

	const int nGridY = 10;

	const int nGridX = 200;

	const int nBlocks = nGridX * nGridY;

	//  Allocate some host memory for holding texture values.

	//  The first 3 textures are 2D textures, nThreads wide x nGridY high.

	//  The 4th texture is 2D texture of size 1 x nGridY

	float *h_tex1 = (float *) malloc(nGridY * nThreads * sizeof(float));

	float *h_tex2 = (float *) malloc(nGridY * nThreads * sizeof(float));

	float *h_tex3 = (float *) malloc(nGridY * nThreads * sizeof(float));

	float *h_tex4 = (float *) malloc(nGridY * nThreads * sizeof(float));

	// initialize the texture values with some identifying values

	for (int bidY = 0; bidY < nGridY; bidY++)

	{

  for (int tid = 0; tid < nThreads; tid++)

  {

  	h_tex1[bidY * nThreads + tid] = 0.1f + (float) tid / nThreads;

  	h_tex2[bidY * nThreads + tid] = 0.2f + (float) tid / nThreads;

  	h_tex3[bidY * nThreads + tid] = 0.3f + (float) tid / nThreads;

  	h_tex4[bidY * nThreads + tid] = 0.4f + (float) tid / nThreads;

  }

	}

#if (SHOWBUG != 0)

	// allocate device memory for result

	unsigned int cbFailed = nBlocks * sizeof(unsigned int);

    unsigned int *d_failed;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_failed, cbFailed));

    CUT_CHECK_ERROR( "cudaMalloc() failed" );

#endif //SHOWBUG

	//

	// Initialize the 4 textures

	//

	//-------   Texture 1   ------------------------

	cudaChannelFormatDesc desc1 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex1;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex1, &desc1, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex1, 0, 0, h_tex1, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex1.addressMode[0] = cudaAddressModeClamp;

	tex1.addressMode[1] = cudaAddressModeClamp;

	tex1.filterMode = cudaFilterModePoint;

	tex1.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex1, d_aTex1, desc1) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//-------   Texture 2   ------------------------

	cudaChannelFormatDesc desc2 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex2;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex2, &desc2, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex2, 0, 0, h_tex2, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex2.addressMode[0] = cudaAddressModeClamp;

	tex2.addressMode[1] = cudaAddressModeClamp;

	tex2.filterMode = cudaFilterModePoint;

	tex2.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex2, d_aTex2, desc2) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//-------   Texture 3   ------------------------

	cudaChannelFormatDesc desc3 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex3;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex3, &desc3, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex3, 0, 0, h_tex3, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex3.addressMode[0] = cudaAddressModeClamp;

	tex3.addressMode[1] = cudaAddressModeClamp;

	tex3.filterMode = cudaFilterModePoint;

	tex3.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex3, d_aTex3, desc3) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//-------   Texture 4   ------------------------

	cudaChannelFormatDesc desc4 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex4;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex4, &desc4, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex4, 0, 0, h_tex4, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex4.addressMode[0] = cudaAddressModeClamp;

	tex4.addressMode[1] = cudaAddressModeClamp;

	tex4.filterMode = cudaFilterModePoint;

	tex4.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex4, d_aTex4, desc4) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//---------------------------------------------

#if (SHOWBUG == 0)

	// allocate device memory for result

	unsigned int cbFailed = nBlocks * sizeof(unsigned int);

    unsigned int *d_failed;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_failed, cbFailed));

    CUT_CHECK_ERROR( "cudaMalloc() failed" );

#endif //!BUG

	// setup execution parameters

    dim3  grid( nGridX, nGridY, 1);

    dim3  threads( nThreads, 1, 1);

	//--------  EXECUTE KERNEL -------

    unsigned int timer = 0;

    CUT_SAFE_CALL( cutCreateTimer( &timer));

    CUT_SAFE_CALL( cutStartTimer( timer));

   kernel_TestTexFetch2<<< grid, threads, 0 >>>( d_failed );

    CUT_CHECK_ERROR("Kernel execution failed");

   CUT_SAFE_CALL( cutStopTimer( timer));

    printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));

    CUT_SAFE_CALL( cutDeleteTimer( timer));

	// test the results

	unsigned int *h_failed = (unsigned int *) malloc(cbFailed);

    CUDA_SAFE_CALL( cudaMemcpy(h_failed, d_failed, cbFailed, cudaMemcpyDeviceToHost) );

	for (int bid = 0; bid < nBlocks; bid++)

  if (h_failed[bid] != 0)

  	printf("%u\tFAILED\t%u\n", bid, h_failed[bid]);

#ifdef RUN_KERNEL_TWICE

    CUT_SAFE_CALL( cutCreateTimer( &timer));

    CUT_SAFE_CALL( cutStartTimer( timer));

   kernel_TestTexFetch2<<< grid, threads, 0 >>>( d_failed );

    CUT_CHECK_ERROR("Kernel execution failed");

   CUT_SAFE_CALL( cutStopTimer( timer));

    printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));

    CUT_SAFE_CALL( cutDeleteTimer( timer));

	// test the results

    CUDA_SAFE_CALL( cudaMemcpy(h_failed, d_failed, cbFailed, cudaMemcpyDeviceToHost) );

	for (int bid = 0; bid < nBlocks; bid++)

  if (h_failed[bid] != 0)

  	printf("%u\tFAILED\t%u\n", bid, h_failed[bid]);

#endif // RUN_KERNEL_TWICE

   // cleanup memory

    

	CUDA_SAFE_CALL( cudaUnbindTexture(tex4) );

	CUDA_SAFE_CALL( cudaUnbindTexture(tex3) );

	CUDA_SAFE_CALL( cudaUnbindTexture(tex2) );

	CUDA_SAFE_CALL( cudaUnbindTexture(tex1) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex4) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex3) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex2) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex1) );

   CUDA_SAFE_CALL( cudaFree(d_failed));

   free(h_failed);

	free(h_tex4);

    free(h_tex3);

    free(h_tex2);

    free(h_tex1);

   CUT_EXIT(argc, argv);

}