memory allocation bug?

jhanweck · March 27, 2007, 9:38pm

When using several textures, I find that if I allocate device memory (using cudaMalloc) BEFORE allocating/binding the textures, writes to the device memory corrupt the textures.

If I allocate the device memory AFTER allocating/binding the textures, there is no corruption.

The problem consistently occurs every other run of the application, but consecutive calls to the kernel are fine, leading me to believe something isn’t getting allocated/deallocated properly.

(I suppose I could be doing something completely wrong in creating the textures… However, the program works perfectly in the emulator every time.)

Attached is a cuTexMemBug.cu file that produces the problem.

#define SHOWBUG 1 to cause the bug to appear (Line 12)
#define SHOWBUG 0 to cause it to disappear

It appears at least 3 textures are required before the problem manifests, so the code uses 4 textures.

Platform:
WinXP Pro SP2, Dell Precision 390
GeForce 8800 GTS as non-primary display
(GeForce FX 5200 as primary display, dualview mode)
nVidia driver 97.73
CUDA v0.8
VisualStudio 2005

jhanweck · March 27, 2007, 9:39pm

For some reason I can’t attach a file to the BB post… code follows:

/**

* Texture Memory Bug?

* 27-Mar-2007  jhanweck

*/

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <cutil.h>

// SET THIS TO 0 TO CAUSE THE BUG TO DISAPPEAR

#define SHOWBUG 1

extern __shared__ float shmem[];

/**

* TEXTURES

*/

texture<float, 2, cudaReadModeElementType> tex1;

texture<float, 2, cudaReadModeElementType> tex2;

texture<float, 2, cudaReadModeElementType> tex3;

texture<float, 2, cudaReadModeElementType> tex4;

/**

* Helper function for testing near equality of two floats within epsilon.

*/

__device__ bool feqf(float x1, float x2, float eps)

{

	return fabsf(x2 - x1) < eps;

}

/**

* KERNEL

*

* This kernel fetches values from texture memory

* and compares the values to their correct values.

* Returns bits indicating which texture fetch did not compare.

*/

__global__ void kernel_TestTexFetch2(

	unsigned int *g_failed

	)

{

	const unsigned int nThreads = blockDim.x;

	const unsigned int tid = threadIdx.x;

	const unsigned int bidX = blockIdx.x;

	const unsigned int bidY = blockIdx.y;

	const unsigned int bid = bidY * gridDim.x + bidX;

	const float EPS = 1.0e-5;

	__shared__ unsigned int bitsTex1, bitsTex2, bitsTex3, bitsTex4;

	if (tid == 0)

	{

  bitsTex1 = 0; 

  bitsTex2 = 0;

  bitsTex3 = 0;

  bitsTex4 = 0;

	}

	__syncthreads();

	float fTex1 = texfetch(tex1, tid, bidY);

	if (!feqf(fTex1, 0.1f + (float) tid / nThreads, EPS))

  bitsTex1 = 0x0001;

	float fTex2 = texfetch(tex2, tid, bidY);

	if (!feqf(fTex2, 0.2f + (float) tid / nThreads, EPS))

  bitsTex2 = 0x0002;

	float fTex3 = texfetch(tex3, tid, bidY);

	if (!feqf(fTex3, 0.3f + (float) tid / nThreads, EPS))

  bitsTex3 = 0x0004;

	float fTex4 = texfetch(tex4, tid, bidY);

	if (!feqf(fTex4, 0.4f + (float) tid / nThreads, EPS))

  bitsTex4 = 0x0008;

	__syncthreads();

	if (tid == 0)

	{

  g_failed[bid] = bitsTex1 | bitsTex2 | bitsTex3 | bitsTex4;

	}

	__syncthreads();

}

/**

* Texture memory bug:

* Many textures seem to create instabilities?

*/

int main( int argc, char** argv) 

{

    CUT_CHECK_DEVICE();

	const int nThreads = 64;

	const int nGridY = 10;

	const int nGridX = 200;

	const int nBlocks = nGridX * nGridY;

	//  Allocate some host memory for holding texture values.

	//  The first 3 textures are 2D textures, nThreads wide x nGridY high.

	//  The 4th texture is 2D texture of size 1 x nGridY

	float *h_tex1 = (float *) malloc(nGridY * nThreads * sizeof(float));

	float *h_tex2 = (float *) malloc(nGridY * nThreads * sizeof(float));

	float *h_tex3 = (float *) malloc(nGridY * nThreads * sizeof(float));

	float *h_tex4 = (float *) malloc(nGridY * nThreads * sizeof(float));

	// initialize the texture values with some identifying values

	for (int bidY = 0; bidY < nGridY; bidY++)

	{

  for (int tid = 0; tid < nThreads; tid++)

  {

  	h_tex1[bidY * nThreads + tid] = 0.1f + (float) tid / nThreads;

  	h_tex2[bidY * nThreads + tid] = 0.2f + (float) tid / nThreads;

  	h_tex3[bidY * nThreads + tid] = 0.3f + (float) tid / nThreads;

  	h_tex4[bidY * nThreads + tid] = 0.4f + (float) tid / nThreads;

  }

	}

#if (SHOWBUG != 0)

	// allocate device memory for result

	unsigned int cbFailed = nBlocks * sizeof(unsigned int);

    unsigned int *d_failed;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_failed, cbFailed));

    CUT_CHECK_ERROR( "cudaMalloc() failed" );

#endif //SHOWBUG

	//

	// Initialize the 4 textures

	//

	//-------   Texture 1   ------------------------

	cudaChannelFormatDesc desc1 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex1;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex1, &desc1, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex1, 0, 0, h_tex1, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex1.addressMode[0] = cudaAddressModeClamp;

	tex1.addressMode[1] = cudaAddressModeClamp;

	tex1.filterMode = cudaFilterModePoint;

	tex1.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex1, d_aTex1, desc1) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//-------   Texture 2   ------------------------

	cudaChannelFormatDesc desc2 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex2;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex2, &desc2, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex2, 0, 0, h_tex2, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex2.addressMode[0] = cudaAddressModeClamp;

	tex2.addressMode[1] = cudaAddressModeClamp;

	tex2.filterMode = cudaFilterModePoint;

	tex2.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex2, d_aTex2, desc2) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//-------   Texture 3   ------------------------

	cudaChannelFormatDesc desc3 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex3;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex3, &desc3, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex3, 0, 0, h_tex3, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex3.addressMode[0] = cudaAddressModeClamp;

	tex3.addressMode[1] = cudaAddressModeClamp;

	tex3.filterMode = cudaFilterModePoint;

	tex3.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex3, d_aTex3, desc3) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//-------   Texture 4   ------------------------

	cudaChannelFormatDesc desc4 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

	cudaArray *d_aTex4;

    CUDA_SAFE_CALL( cudaMallocArray(&d_aTex4, &desc4, nThreads, nGridY) );

    CUT_CHECK_ERROR( "cudaMallocArray() failed" );

   CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex4, 0, 0, h_tex4, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );

    CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );

	

	tex4.addressMode[0] = cudaAddressModeClamp;

	tex4.addressMode[1] = cudaAddressModeClamp;

	tex4.filterMode = cudaFilterModePoint;

	tex4.normalized = false;

   CUDA_SAFE_CALL( cudaBindTexture(tex4, d_aTex4, desc4) );

    CUT_CHECK_ERROR( "cudaBindTexture() failed" );

	//---------------------------------------------

#if (SHOWBUG == 0)

	// allocate device memory for result

	unsigned int cbFailed = nBlocks * sizeof(unsigned int);

    unsigned int *d_failed;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_failed, cbFailed));

    CUT_CHECK_ERROR( "cudaMalloc() failed" );

#endif //!BUG

	// setup execution parameters

    dim3  grid( nGridX, nGridY, 1);

    dim3  threads( nThreads, 1, 1);

	//--------  EXECUTE KERNEL -------

    unsigned int timer = 0;

    CUT_SAFE_CALL( cutCreateTimer( &timer));

    CUT_SAFE_CALL( cutStartTimer( timer));

   kernel_TestTexFetch2<<< grid, threads, 0 >>>( d_failed );

    CUT_CHECK_ERROR("Kernel execution failed");

   CUT_SAFE_CALL( cutStopTimer( timer));

    printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));

    CUT_SAFE_CALL( cutDeleteTimer( timer));

	// test the results

	unsigned int *h_failed = (unsigned int *) malloc(cbFailed);

    CUDA_SAFE_CALL( cudaMemcpy(h_failed, d_failed, cbFailed, cudaMemcpyDeviceToHost) );

	for (int bid = 0; bid < nBlocks; bid++)

  if (h_failed[bid] != 0)

  	printf("%u\tFAILED\t%u\n", bid, h_failed[bid]);

#ifdef RUN_KERNEL_TWICE

    CUT_SAFE_CALL( cutCreateTimer( &timer));

    CUT_SAFE_CALL( cutStartTimer( timer));

   kernel_TestTexFetch2<<< grid, threads, 0 >>>( d_failed );

    CUT_CHECK_ERROR("Kernel execution failed");

   CUT_SAFE_CALL( cutStopTimer( timer));

    printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));

    CUT_SAFE_CALL( cutDeleteTimer( timer));

	// test the results

    CUDA_SAFE_CALL( cudaMemcpy(h_failed, d_failed, cbFailed, cudaMemcpyDeviceToHost) );

	for (int bid = 0; bid < nBlocks; bid++)

  if (h_failed[bid] != 0)

  	printf("%u\tFAILED\t%u\n", bid, h_failed[bid]);

#endif // RUN_KERNEL_TWICE

   // cleanup memory

    

	CUDA_SAFE_CALL( cudaUnbindTexture(tex4) );

	CUDA_SAFE_CALL( cudaUnbindTexture(tex3) );

	CUDA_SAFE_CALL( cudaUnbindTexture(tex2) );

	CUDA_SAFE_CALL( cudaUnbindTexture(tex1) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex4) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex3) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex2) );

	CUDA_SAFE_CALL( cudaFreeArray(d_aTex1) );

   CUDA_SAFE_CALL( cudaFree(d_failed));

   free(h_failed);

	free(h_tex4);

    free(h_tex3);

    free(h_tex2);

    free(h_tex1);

   CUT_EXIT(argc, argv);

}

Topic		Replies	Views
Memory allocation bug? using several textures CUDA Programming and Performance	3	2763	July 17, 2009
Problem with texture memory CUDA Programming and Performance	10	1779	August 6, 2010
Allocating memory simutanously with GL is it correctly implemented? CUDA Programming and Performance	3	2576	August 14, 2007
Problem with texture memory CUDA Programming and Performance	8	2479	March 2, 2012
2D textures bound to Pitch Linear Memory texture doesn't access correct information CUDA Programming and Performance	2	20229	December 21, 2010
fetch texture memory error! CUDA Programming and Performance	0	601	April 3, 2013
Texture memory error. CUDA Programming and Performance	1	2620	July 27, 2011
problem using Texture all values 0 CUDA Programming and Performance	2	3829	August 25, 2010
Texture contains wrong values Using tex1Dfetch CUDA Programming and Performance	17	6680	February 17, 2009
Weird behaviour with large data and textures CUDA Programming and Performance	0	678	August 3, 2009

memory allocation bug?

Related topics