When using several textures, I find that if I allocate device memory (using cudaMalloc) BEFORE allocating/binding the textures, writes to the device memory corrupt the textures.
If I allocate the device memory AFTER allocating/binding the textures, there is no corruption.
It appears at least 3 textures are required before the problem manifests, so the code uses 4 textures. Sorry, I haven’t been able to narrow it down further than that.
The problem consistently occurs every other run of the application. Consecutive calls to the kernel within the app are fine (when the app runs), leading me to believe something isn’t getting allocated/deallocated properly.
Could I be doing something completely wrong in creating the textures??? The program works perfectly in the emulator every time.
Can anyone else replicate this problem… either Windows or Linux???
Below is cuTexMemBug.cu that produces the problem.
#define SHOWBUG 1 to cause the bug to appear (Line 12)
#define SHOWBUG 0 to cause it to disappear
Platform:
WinXP Pro SP2, Dell Precision 390
GeForce 8800 GTS as non-primary display
(GeForce FX 5200 as primary display, dualview mode)
nVidia driver 97.73
CUDA v0.8
VisualStudio 2005
/**
* Texture Memory Bug?
* 27-Mar-2007 jhanweck
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
// SET THIS TO 0 TO CAUSE THE BUG TO DISAPPEAR
#define SHOWBUG 1
extern __shared__ float shmem[];
/**
* TEXTURES
*/
texture<float, 2, cudaReadModeElementType> tex1;
texture<float, 2, cudaReadModeElementType> tex2;
texture<float, 2, cudaReadModeElementType> tex3;
texture<float, 2, cudaReadModeElementType> tex4;
/**
* Helper function for testing near equality of two floats within epsilon.
*/
__device__ bool feqf(float x1, float x2, float eps)
{
return fabsf(x2 - x1) < eps;
}
/**
* KERNEL
*
* This kernel fetches values from texture memory
* and compares the values to their correct values.
* Returns bits indicating which texture fetch did not compare.
*/
__global__ void kernel_TestTexFetch2(
unsigned int *g_failed
)
{
const unsigned int nThreads = blockDim.x;
const unsigned int tid = threadIdx.x;
const unsigned int bidX = blockIdx.x;
const unsigned int bidY = blockIdx.y;
const unsigned int bid = bidY * gridDim.x + bidX;
const float EPS = 1.0e-5;
__shared__ unsigned int bitsTex1, bitsTex2, bitsTex3, bitsTex4;
if (tid == 0)
{
bitsTex1 = 0;
bitsTex2 = 0;
bitsTex3 = 0;
bitsTex4 = 0;
}
__syncthreads();
float fTex1 = texfetch(tex1, tid, bidY);
if (!feqf(fTex1, 0.1f + (float) tid / nThreads, EPS))
bitsTex1 = 0x0001;
float fTex2 = texfetch(tex2, tid, bidY);
if (!feqf(fTex2, 0.2f + (float) tid / nThreads, EPS))
bitsTex2 = 0x0002;
float fTex3 = texfetch(tex3, tid, bidY);
if (!feqf(fTex3, 0.3f + (float) tid / nThreads, EPS))
bitsTex3 = 0x0004;
float fTex4 = texfetch(tex4, tid, bidY);
if (!feqf(fTex4, 0.4f + (float) tid / nThreads, EPS))
bitsTex4 = 0x0008;
__syncthreads();
if (tid == 0)
{
g_failed[bid] = bitsTex1 | bitsTex2 | bitsTex3 | bitsTex4;
}
__syncthreads();
}
/**
* Texture memory bug:
* Many textures seem to create instabilities?
*/
int main( int argc, char** argv)
{
CUT_CHECK_DEVICE();
const int nThreads = 64;
const int nGridY = 10;
const int nGridX = 200;
const int nBlocks = nGridX * nGridY;
// Allocate some host memory for holding texture values.
// The first 3 textures are 2D textures, nThreads wide x nGridY high.
// The 4th texture is 2D texture of size 1 x nGridY
float *h_tex1 = (float *) malloc(nGridY * nThreads * sizeof(float));
float *h_tex2 = (float *) malloc(nGridY * nThreads * sizeof(float));
float *h_tex3 = (float *) malloc(nGridY * nThreads * sizeof(float));
float *h_tex4 = (float *) malloc(nGridY * nThreads * sizeof(float));
// initialize the texture values with some identifying values
for (int bidY = 0; bidY < nGridY; bidY++)
{
for (int tid = 0; tid < nThreads; tid++)
{
h_tex1[bidY * nThreads + tid] = 0.1f + (float) tid / nThreads;
h_tex2[bidY * nThreads + tid] = 0.2f + (float) tid / nThreads;
h_tex3[bidY * nThreads + tid] = 0.3f + (float) tid / nThreads;
h_tex4[bidY * nThreads + tid] = 0.4f + (float) tid / nThreads;
}
}
#if (SHOWBUG != 0)
// allocate device memory for result
unsigned int cbFailed = nBlocks * sizeof(unsigned int);
unsigned int *d_failed;
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_failed, cbFailed));
CUT_CHECK_ERROR( "cudaMalloc() failed" );
#endif //SHOWBUG
//
// Initialize the 4 textures
//
//------- Texture 1 ------------------------
cudaChannelFormatDesc desc1 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *d_aTex1;
CUDA_SAFE_CALL( cudaMallocArray(&d_aTex1, &desc1, nThreads, nGridY) );
CUT_CHECK_ERROR( "cudaMallocArray() failed" );
CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex1, 0, 0, h_tex1, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );
CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );
tex1.addressMode[0] = cudaAddressModeClamp;
tex1.addressMode[1] = cudaAddressModeClamp;
tex1.filterMode = cudaFilterModePoint;
tex1.normalized = false;
CUDA_SAFE_CALL( cudaBindTexture(tex1, d_aTex1, desc1) );
CUT_CHECK_ERROR( "cudaBindTexture() failed" );
//------- Texture 2 ------------------------
cudaChannelFormatDesc desc2 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *d_aTex2;
CUDA_SAFE_CALL( cudaMallocArray(&d_aTex2, &desc2, nThreads, nGridY) );
CUT_CHECK_ERROR( "cudaMallocArray() failed" );
CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex2, 0, 0, h_tex2, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );
CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );
tex2.addressMode[0] = cudaAddressModeClamp;
tex2.addressMode[1] = cudaAddressModeClamp;
tex2.filterMode = cudaFilterModePoint;
tex2.normalized = false;
CUDA_SAFE_CALL( cudaBindTexture(tex2, d_aTex2, desc2) );
CUT_CHECK_ERROR( "cudaBindTexture() failed" );
//------- Texture 3 ------------------------
cudaChannelFormatDesc desc3 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *d_aTex3;
CUDA_SAFE_CALL( cudaMallocArray(&d_aTex3, &desc3, nThreads, nGridY) );
CUT_CHECK_ERROR( "cudaMallocArray() failed" );
CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex3, 0, 0, h_tex3, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );
CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );
tex3.addressMode[0] = cudaAddressModeClamp;
tex3.addressMode[1] = cudaAddressModeClamp;
tex3.filterMode = cudaFilterModePoint;
tex3.normalized = false;
CUDA_SAFE_CALL( cudaBindTexture(tex3, d_aTex3, desc3) );
CUT_CHECK_ERROR( "cudaBindTexture() failed" );
//------- Texture 4 ------------------------
cudaChannelFormatDesc desc4 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *d_aTex4;
CUDA_SAFE_CALL( cudaMallocArray(&d_aTex4, &desc4, nThreads, nGridY) );
CUT_CHECK_ERROR( "cudaMallocArray() failed" );
CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex4, 0, 0, h_tex4, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );
CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );
tex4.addressMode[0] = cudaAddressModeClamp;
tex4.addressMode[1] = cudaAddressModeClamp;
tex4.filterMode = cudaFilterModePoint;
tex4.normalized = false;
CUDA_SAFE_CALL( cudaBindTexture(tex4, d_aTex4, desc4) );
CUT_CHECK_ERROR( "cudaBindTexture() failed" );
//---------------------------------------------
#if (SHOWBUG == 0)
// allocate device memory for result
unsigned int cbFailed = nBlocks * sizeof(unsigned int);
unsigned int *d_failed;
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_failed, cbFailed));
CUT_CHECK_ERROR( "cudaMalloc() failed" );
#endif //!BUG
// setup execution parameters
dim3 grid( nGridX, nGridY, 1);
dim3 threads( nThreads, 1, 1);
//-------- EXECUTE KERNEL -------
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
kernel_TestTexFetch2<<< grid, threads, 0 >>>( d_failed );
CUT_CHECK_ERROR("Kernel execution failed");
CUT_SAFE_CALL( cutStopTimer( timer));
printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutDeleteTimer( timer));
// test the results
unsigned int *h_failed = (unsigned int *) malloc(cbFailed);
CUDA_SAFE_CALL( cudaMemcpy(h_failed, d_failed, cbFailed, cudaMemcpyDeviceToHost) );
for (int bid = 0; bid < nBlocks; bid++)
if (h_failed[bid] != 0)
printf("%u\tFAILED\t%u\n", bid, h_failed[bid]);
#ifdef RUN_KERNEL_TWICE
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
kernel_TestTexFetch2<<< grid, threads, 0 >>>( d_failed );
CUT_CHECK_ERROR("Kernel execution failed");
CUT_SAFE_CALL( cutStopTimer( timer));
printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutDeleteTimer( timer));
// test the results
CUDA_SAFE_CALL( cudaMemcpy(h_failed, d_failed, cbFailed, cudaMemcpyDeviceToHost) );
for (int bid = 0; bid < nBlocks; bid++)
if (h_failed[bid] != 0)
printf("%u\tFAILED\t%u\n", bid, h_failed[bid]);
#endif // RUN_KERNEL_TWICE
// cleanup memory
CUDA_SAFE_CALL( cudaUnbindTexture(tex4) );
CUDA_SAFE_CALL( cudaUnbindTexture(tex3) );
CUDA_SAFE_CALL( cudaUnbindTexture(tex2) );
CUDA_SAFE_CALL( cudaUnbindTexture(tex1) );
CUDA_SAFE_CALL( cudaFreeArray(d_aTex4) );
CUDA_SAFE_CALL( cudaFreeArray(d_aTex3) );
CUDA_SAFE_CALL( cudaFreeArray(d_aTex2) );
CUDA_SAFE_CALL( cudaFreeArray(d_aTex1) );
CUDA_SAFE_CALL( cudaFree(d_failed));
free(h_failed);
free(h_tex4);
free(h_tex3);
free(h_tex2);
free(h_tex1);
CUT_EXIT(argc, argv);
}