Memory allocation bug? using several textures

When using several textures, I find that if I allocate device memory (using cudaMalloc) BEFORE allocating/binding the textures, writes to the device memory corrupt the textures.

If I allocate the device memory AFTER allocating/binding the textures, there is no corruption.

It appears at least 3 textures are required before the problem manifests, so the code uses 4 textures. Sorry, I haven’t been able to narrow it down further than that.

The problem consistently occurs every other run of the application. Consecutive calls to the kernel within the app are fine (when the app runs), leading me to believe something isn’t getting allocated/deallocated properly.

Could I be doing something completely wrong in creating the textures??? The program works perfectly in the emulator every time.

Can anyone else replicate this problem… either Windows or Linux???


Below is cuTexMemBug.cu that produces the problem.

#define SHOWBUG 1 to cause the bug to appear (Line 12)

#define SHOWBUG 0 to cause it to disappear

Platform:

WinXP Pro SP2, Dell Precision 390

GeForce 8800 GTS as non-primary display

(GeForce FX 5200 as primary display, dualview mode)

nVidia driver 97.73

CUDA v0.8

VisualStudio 2005

/**

* Texture Memory Bug?

* 27-Mar-2007  jhanweck

*/

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <cutil.h>

// SET THIS TO 0 TO CAUSE THE BUG TO DISAPPEAR

#define SHOWBUG 1

extern __shared__ float shmem[];

/**

* TEXTURES

*/

texture<float, 2, cudaReadModeElementType> tex1;

texture<float, 2, cudaReadModeElementType> tex2;

texture<float, 2, cudaReadModeElementType> tex3;

texture<float, 2, cudaReadModeElementType> tex4;

/**

* Helper function for testing near equality of two floats within epsilon.

*/

__device__ bool feqf(float x1, float x2, float eps)

{

	return fabsf(x2 - x1) < eps;

}

/**

* KERNEL

*

* This kernel fetches values from texture memory

* and compares the values to their correct values.

* Returns bits indicating which texture fetch did not compare.

*/

__global__ void kernel_TestTexFetch2(

	unsigned int *g_failed

	)

{

	const unsigned int nThreads = blockDim.x;

	const unsigned int tid = threadIdx.x;

	const unsigned int bidX = blockIdx.x;

	const unsigned int bidY = blockIdx.y;

	const unsigned int bid = bidY * gridDim.x + bidX;

	const float EPS = 1.0e-5;

	__shared__ unsigned int bitsTex1, bitsTex2, bitsTex3, bitsTex4;

	if (tid == 0)

	{

  bitsTex1 = 0; 

  bitsTex2 = 0;

  bitsTex3 = 0;

  bitsTex4 = 0;

	}

	__syncthreads();

	float fTex1 = texfetch(tex1, tid, bidY);

	if (!feqf(fTex1, 0.1f + (float) tid / nThreads, EPS))

  bitsTex1 = 0x0001;

	float fTex2 = texfetch(tex2, tid, bidY);

	if (!feqf(fTex2, 0.2f + (float) tid / nThreads, EPS))

  bitsTex2 = 0x0002;

	float fTex3 = texfetch(tex3, tid, bidY);

	if (!feqf(fTex3, 0.3f + (float) tid / nThreads, EPS))

  bitsTex3 = 0x0004;

	float fTex4 = texfetch(tex4, tid, bidY);

	if (!feqf(fTex4, 0.4f + (float) tid / nThreads, EPS))

  bitsTex4 = 0x0008;

	__syncthreads();

	if (tid == 0)

	{

  g_failed[bid] = bitsTex1 | bitsTex2 | bitsTex3 | bitsTex4;

	}

	__syncthreads();

}

/**

* Texture memory bug:

* Many textures seem to create instabilities?

*/

int main( int argc, char** argv) 

{

    CUT_CHECK_DEVICE();

	const int nThreads = 64;

	const int nGridY = 10;

	const int nGridX = 200;

	const int nBlocks = nGridX * nGridY;

	//  Allocate some host memory for holding texture values.

	//  The first 3 textures are 2D textures, nThreads wide x nGridY high.

	//  The 4th texture is 2D texture of size 1 x nGridY

	float *h_tex1 = (float *) malloc(nGridY * nThreads * sizeof(float));

	float *h_tex2 = (float *) malloc(nGridY * nThreads * sizeof(float));

	float *h_tex3 = (float *) malloc(nGridY * nThreads * sizeof(float));

	float *h_tex4 = (float *) malloc(nGridY * nThreads * sizeof(float));

	// initialize the texture values with some identifying values

	for (int bidY = 0; bidY < nGridY; bidY++)

	{

  for (int tid = 0; tid < nThreads; tid++)

  {

  	h_tex1[bidY * nThreads + tid] = 0.1f + (float) tid / nThreads;

  	h_tex2[bidY * nThreads + tid] = 0.2f + (float) tid / nThreads;

  	h_tex3[bidY * nThreads + tid] = 0.3f + (float) tid / nThreads;

  	h_tex4[bidY * nThreads + tid] = 0.4f + (float) tid / nThreads;

  }

	}

#if (SHOWBUG != 0)

	// allocate device memory for result

	unsigned int cbFailed = nBlocks * sizeof(unsigned int);

    unsigned int *d_failed;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_failed, cbFailed));

    CUT_CHECK_ERROR( "cudaMalloc() failed" );

#endif //SHOWBUG

	//

	// Initialize the 4 textures

	//

	//-------   Texture 1   ------------------------

	cudaChannelFormatDesc desc1 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex1;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex1, &desc1, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex1, 0, 0, h_tex1, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex1.addressMode[0] = cudaAddressModeClamp;

	tex1.addressMode[1] = cudaAddressModeClamp;

	tex1.filterMode = cudaFilterModePoint;

	tex1.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex1, d_aTex1, desc1) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//-------   Texture 2   ------------------------

	cudaChannelFormatDesc desc2 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex2;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex2, &desc2, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex2, 0, 0, h_tex2, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex2.addressMode[0] = cudaAddressModeClamp;

	tex2.addressMode[1] = cudaAddressModeClamp;

	tex2.filterMode = cudaFilterModePoint;

	tex2.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex2, d_aTex2, desc2) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//-------   Texture 3   ------------------------

	cudaChannelFormatDesc desc3 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex3;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex3, &desc3, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex3, 0, 0, h_tex3, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex3.addressMode[0] = cudaAddressModeClamp;

	tex3.addressMode[1] = cudaAddressModeClamp;

	tex3.filterMode = cudaFilterModePoint;

	tex3.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex3, d_aTex3, desc3) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//-------   Texture 4   ------------------------

	cudaChannelFormatDesc desc4 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex4;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex4, &desc4, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex4, 0, 0, h_tex4, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex4.addressMode[0] = cudaAddressModeClamp;

	tex4.addressMode[1] = cudaAddressModeClamp;

	tex4.filterMode = cudaFilterModePoint;

	tex4.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex4, d_aTex4, desc4) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//---------------------------------------------

#if (SHOWBUG == 0)

	// allocate device memory for result

	unsigned int cbFailed = nBlocks * sizeof(unsigned int);

    unsigned int *d_failed;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_failed, cbFailed));

    CUT_CHECK_ERROR( "cudaMalloc() failed" );

#endif //!BUG

	// setup execution parameters

    dim3  grid( nGridX, nGridY, 1);

    dim3  threads( nThreads, 1, 1);

	//--------  EXECUTE KERNEL -------

    unsigned int timer = 0;

    CUT_SAFE_CALL( cutCreateTimer( &timer));

    CUT_SAFE_CALL( cutStartTimer( timer));

   kernel_TestTexFetch2<<< grid, threads, 0 >>>( d_failed );

    CUT_CHECK_ERROR("Kernel execution failed");

   CUT_SAFE_CALL( cutStopTimer( timer));

    printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));

    CUT_SAFE_CALL( cutDeleteTimer( timer));

	// test the results

	unsigned int *h_failed = (unsigned int *) malloc(cbFailed);

    CUDA_SAFE_CALL( cudaMemcpy(h_failed, d_failed, cbFailed, cudaMemcpyDeviceToHost) );

	for (int bid = 0; bid < nBlocks; bid++)

  if (h_failed[bid] != 0)

  	printf("%u\tFAILED\t%u\n", bid, h_failed[bid]);

#ifdef RUN_KERNEL_TWICE

    CUT_SAFE_CALL( cutCreateTimer( &timer));

    CUT_SAFE_CALL( cutStartTimer( timer));

   kernel_TestTexFetch2<<< grid, threads, 0 >>>( d_failed );

    CUT_CHECK_ERROR("Kernel execution failed");

   CUT_SAFE_CALL( cutStopTimer( timer));

    printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));

    CUT_SAFE_CALL( cutDeleteTimer( timer));

	// test the results

    CUDA_SAFE_CALL( cudaMemcpy(h_failed, d_failed, cbFailed, cudaMemcpyDeviceToHost) );

	for (int bid = 0; bid < nBlocks; bid++)

  if (h_failed[bid] != 0)

  	printf("%u\tFAILED\t%u\n", bid, h_failed[bid]);

#endif // RUN_KERNEL_TWICE

   // cleanup memory

    

	CUDA_SAFE_CALL( cudaUnbindTexture(tex4) );

	CUDA_SAFE_CALL( cudaUnbindTexture(tex3) );

	CUDA_SAFE_CALL( cudaUnbindTexture(tex2) );

	CUDA_SAFE_CALL( cudaUnbindTexture(tex1) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex4) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex3) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex2) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex1) );

   CUDA_SAFE_CALL( cudaFree(d_failed));

   free(h_failed);

	free(h_tex4);

    free(h_tex3);

    free(h_tex2);

    free(h_tex1);

   CUT_EXIT(argc, argv);

}

Just tested on Ubuntu Linux 6.06 (32 bit) with the NVIDIA Linux drivers (9751) and an 8800 GTX card. I added a printf at the beginning to show whether SHOWBUG was enabled for the run.

SHOWBUG enabled

Processing time: 0.164000 (ms)
SHOWBUG not enabled

Processing time: 0.166000 (ms)

No crashing or other strange behavior.

Thanks… Were you able to run it multiple times consecutively in both cases?

The first run typically works, but the 2nd fails. (Maybe different on different setups – could you run it 8 or so times in a row?)

I have noticed the same problem in my case, using the code exactly similar to the posted one. only the first one was sucessful, all other three are failed. is this a bug from the driver or the sdk (i am using the latest driver and the sdk).