Memory allocation bug? using several textures

jhanweck · March 28, 2007, 3:41pm

When using several textures, I find that if I allocate device memory (using cudaMalloc) BEFORE allocating/binding the textures, writes to the device memory corrupt the textures.

If I allocate the device memory AFTER allocating/binding the textures, there is no corruption.

It appears at least 3 textures are required before the problem manifests, so the code uses 4 textures. Sorry, I haven’t been able to narrow it down further than that.

The problem consistently occurs every other run of the application. Consecutive calls to the kernel within the app are fine (when the app runs), leading me to believe something isn’t getting allocated/deallocated properly.

Could I be doing something completely wrong in creating the textures??? The program works perfectly in the emulator every time.

Can anyone else replicate this problem… either Windows or Linux???

Below is cuTexMemBug.cu that produces the problem.

#define SHOWBUG 1 to cause the bug to appear (Line 12)

#define SHOWBUG 0 to cause it to disappear

Platform:

WinXP Pro SP2, Dell Precision 390

GeForce 8800 GTS as non-primary display

(GeForce FX 5200 as primary display, dualview mode)

nVidia driver 97.73

CUDA v0.8

VisualStudio 2005

/**

* Texture Memory Bug?

* 27-Mar-2007  jhanweck

*/

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <cutil.h>

// SET THIS TO 0 TO CAUSE THE BUG TO DISAPPEAR

#define SHOWBUG 1

extern __shared__ float shmem[];

/**

* TEXTURES

*/

texture<float, 2, cudaReadModeElementType> tex1;

texture<float, 2, cudaReadModeElementType> tex2;

texture<float, 2, cudaReadModeElementType> tex3;

texture<float, 2, cudaReadModeElementType> tex4;

/**

* Helper function for testing near equality of two floats within epsilon.

*/

__device__ bool feqf(float x1, float x2, float eps)

{

	return fabsf(x2 - x1) < eps;

}

/**

* KERNEL

*

* This kernel fetches values from texture memory

* and compares the values to their correct values.

* Returns bits indicating which texture fetch did not compare.

*/

__global__ void kernel_TestTexFetch2(

	unsigned int *g_failed

	)

{

	const unsigned int nThreads = blockDim.x;

	const unsigned int tid = threadIdx.x;

	const unsigned int bidX = blockIdx.x;

	const unsigned int bidY = blockIdx.y;

	const unsigned int bid = bidY * gridDim.x + bidX;

	const float EPS = 1.0e-5;

	__shared__ unsigned int bitsTex1, bitsTex2, bitsTex3, bitsTex4;

	if (tid == 0)

	{

  bitsTex1 = 0; 

  bitsTex2 = 0;

  bitsTex3 = 0;

  bitsTex4 = 0;

	}

	__syncthreads();

	float fTex1 = texfetch(tex1, tid, bidY);

	if (!feqf(fTex1, 0.1f + (float) tid / nThreads, EPS))

  bitsTex1 = 0x0001;

	float fTex2 = texfetch(tex2, tid, bidY);

	if (!feqf(fTex2, 0.2f + (float) tid / nThreads, EPS))

  bitsTex2 = 0x0002;

	float fTex3 = texfetch(tex3, tid, bidY);

	if (!feqf(fTex3, 0.3f + (float) tid / nThreads, EPS))

  bitsTex3 = 0x0004;

	float fTex4 = texfetch(tex4, tid, bidY);

	if (!feqf(fTex4, 0.4f + (float) tid / nThreads, EPS))

  bitsTex4 = 0x0008;

	__syncthreads();

	if (tid == 0)

	{

  g_failed[bid] = bitsTex1 | bitsTex2 | bitsTex3 | bitsTex4;

	}

	__syncthreads();

}

/**

* Texture memory bug:

* Many textures seem to create instabilities?

*/

int main( int argc, char** argv) 

{

    CUT_CHECK_DEVICE();

	const int nThreads = 64;

	const int nGridY = 10;

	const int nGridX = 200;

	const int nBlocks = nGridX * nGridY;

	//  Allocate some host memory for holding texture values.

	//  The first 3 textures are 2D textures, nThreads wide x nGridY high.

	//  The 4th texture is 2D texture of size 1 x nGridY

	float *h_tex1 = (float *) malloc(nGridY * nThreads * sizeof(float));

	float *h_tex2 = (float *) malloc(nGridY * nThreads * sizeof(float));

	float *h_tex3 = (float *) malloc(nGridY * nThreads * sizeof(float));

	float *h_tex4 = (float *) malloc(nGridY * nThreads * sizeof(float));

	// initialize the texture values with some identifying values

	for (int bidY = 0; bidY < nGridY; bidY++)

	{

  for (int tid = 0; tid < nThreads; tid++)

  {

  	h_tex1[bidY * nThreads + tid] = 0.1f + (float) tid / nThreads;

  	h_tex2[bidY * nThreads + tid] = 0.2f + (float) tid / nThreads;

  	h_tex3[bidY * nThreads + tid] = 0.3f + (float) tid / nThreads;

  	h_tex4[bidY * nThreads + tid] = 0.4f + (float) tid / nThreads;

  }

	}

#if (SHOWBUG != 0)

	// allocate device memory for result

	unsigned int cbFailed = nBlocks * sizeof(unsigned int);

    unsigned int *d_failed;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_failed, cbFailed));

    CUT_CHECK_ERROR( "cudaMalloc() failed" );

#endif //SHOWBUG

	//

	// Initialize the 4 textures

	//

	//-------   Texture 1   ------------------------

	cudaChannelFormatDesc desc1 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex1;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex1, &desc1, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex1, 0, 0, h_tex1, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex1.addressMode[0] = cudaAddressModeClamp;

	tex1.addressMode[1] = cudaAddressModeClamp;

	tex1.filterMode = cudaFilterModePoint;

	tex1.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex1, d_aTex1, desc1) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//-------   Texture 2   ------------------------

	cudaChannelFormatDesc desc2 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex2;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex2, &desc2, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex2, 0, 0, h_tex2, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex2.addressMode[0] = cudaAddressModeClamp;

	tex2.addressMode[1] = cudaAddressModeClamp;

	tex2.filterMode = cudaFilterModePoint;

	tex2.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex2, d_aTex2, desc2) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//-------   Texture 3   ------------------------

	cudaChannelFormatDesc desc3 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex3;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex3, &desc3, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex3, 0, 0, h_tex3, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex3.addressMode[0] = cudaAddressModeClamp;

	tex3.addressMode[1] = cudaAddressModeClamp;

	tex3.filterMode = cudaFilterModePoint;

	tex3.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex3, d_aTex3, desc3) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//-------   Texture 4   ------------------------

	cudaChannelFormatDesc desc4 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex4;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex4, &desc4, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex4, 0, 0, h_tex4, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex4.addressMode[0] = cudaAddressModeClamp;

	tex4.addressMode[1] = cudaAddressModeClamp;

	tex4.filterMode = cudaFilterModePoint;

	tex4.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex4, d_aTex4, desc4) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//---------------------------------------------

#if (SHOWBUG == 0)

	// allocate device memory for result

	unsigned int cbFailed = nBlocks * sizeof(unsigned int);

    unsigned int *d_failed;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_failed, cbFailed));

    CUT_CHECK_ERROR( "cudaMalloc() failed" );

#endif //!BUG

	// setup execution parameters

    dim3  grid( nGridX, nGridY, 1);

    dim3  threads( nThreads, 1, 1);

	//--------  EXECUTE KERNEL -------

    unsigned int timer = 0;

    CUT_SAFE_CALL( cutCreateTimer( &timer));

    CUT_SAFE_CALL( cutStartTimer( timer));

   kernel_TestTexFetch2<<< grid, threads, 0 >>>( d_failed );

    CUT_CHECK_ERROR("Kernel execution failed");

   CUT_SAFE_CALL( cutStopTimer( timer));

    printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));

    CUT_SAFE_CALL( cutDeleteTimer( timer));

	// test the results

	unsigned int *h_failed = (unsigned int *) malloc(cbFailed);

    CUDA_SAFE_CALL( cudaMemcpy(h_failed, d_failed, cbFailed, cudaMemcpyDeviceToHost) );

	for (int bid = 0; bid < nBlocks; bid++)

  if (h_failed[bid] != 0)

  	printf("%u\tFAILED\t%u\n", bid, h_failed[bid]);

#ifdef RUN_KERNEL_TWICE

    CUT_SAFE_CALL( cutCreateTimer( &timer));

    CUT_SAFE_CALL( cutStartTimer( timer));

   kernel_TestTexFetch2<<< grid, threads, 0 >>>( d_failed );

    CUT_CHECK_ERROR("Kernel execution failed");

   CUT_SAFE_CALL( cutStopTimer( timer));

    printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));

    CUT_SAFE_CALL( cutDeleteTimer( timer));

	// test the results

    CUDA_SAFE_CALL( cudaMemcpy(h_failed, d_failed, cbFailed, cudaMemcpyDeviceToHost) );

	for (int bid = 0; bid < nBlocks; bid++)

  if (h_failed[bid] != 0)

  	printf("%u\tFAILED\t%u\n", bid, h_failed[bid]);

#endif // RUN_KERNEL_TWICE

   // cleanup memory

    

	CUDA_SAFE_CALL( cudaUnbindTexture(tex4) );

	CUDA_SAFE_CALL( cudaUnbindTexture(tex3) );

	CUDA_SAFE_CALL( cudaUnbindTexture(tex2) );

	CUDA_SAFE_CALL( cudaUnbindTexture(tex1) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex4) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex3) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex2) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex1) );

   CUDA_SAFE_CALL( cudaFree(d_failed));

   free(h_failed);

	free(h_tex4);

    free(h_tex3);

    free(h_tex2);

    free(h_tex1);

   CUT_EXIT(argc, argv);

}

seibert · March 28, 2007, 4:50pm

Just tested on Ubuntu Linux 6.06 (32 bit) with the NVIDIA Linux drivers (9751) and an 8800 GTX card. I added a printf at the beginning to show whether SHOWBUG was enabled for the run.

SHOWBUG enabled

Processing time: 0.164000 (ms)

SHOWBUG not enabled

Processing time: 0.166000 (ms)

No crashing or other strange behavior.

jhanweck · March 28, 2007, 6:18pm

Thanks… Were you able to run it multiple times consecutively in both cases?

The first run typically works, but the 2nd fails. (Maybe different on different setups – could you run it 8 or so times in a row?)

yanyh · July 17, 2009, 4:34pm

I have noticed the same problem in my case, using the code exactly similar to the posted one. only the first one was sucessful, all other three are failed. is this a bug from the driver or the sdk (i am using the latest driver and the sdk).

Topic		Replies	Views
memory allocation bug? CUDA Programming and Performance	1	1845	March 27, 2007
cudaMalloc error in big loop CUDA Programming and Performance	12	15593	May 21, 2008
Using Textures CUDA Programming and Performance	10	21788	March 29, 2007
Reading R8G8B8A8 texture using tex2D() causes strange result. CUDA Programming and Performance	27	2841	April 28, 2018
Using texture memory over iterations causes incorrect read/write of some lines CUDA Programming and Performance cuda	2	512	September 2, 2020
Texture Cache Startup Issue Simple Texture Cache Starter example CUDA Programming and Performance	8	4015	March 17, 2010
texture memory performance on a multiGPU system takes too much time to setup a texture for some GPUs CUDA Programming and Performance	9	1391	February 16, 2012
Why is kernel getting failed? CUDA Programming and Performance	8	3155	November 28, 2018
cudaBindTexture failure CUDA Programming and Performance	3	5589	November 4, 2011
Can we do malloc inside a __global__ function CUDA Programming and Performance	26	9646	February 21, 2010

Memory allocation bug? using several textures

Related topics