For some reason I can’t attach a file to the BB post… code follows:
/**
* Texture Memory Bug?
* 27-Mar-2007 jhanweck
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
// SET THIS TO 0 TO CAUSE THE BUG TO DISAPPEAR
#define SHOWBUG 1
extern __shared__ float shmem[];
/**
* TEXTURES
*/
texture<float, 2, cudaReadModeElementType> tex1;
texture<float, 2, cudaReadModeElementType> tex2;
texture<float, 2, cudaReadModeElementType> tex3;
texture<float, 2, cudaReadModeElementType> tex4;
/**
* Helper function for testing near equality of two floats within epsilon.
*/
__device__ bool feqf(float x1, float x2, float eps)
{
return fabsf(x2 - x1) < eps;
}
/**
* KERNEL
*
* This kernel fetches values from texture memory
* and compares the values to their correct values.
* Returns bits indicating which texture fetch did not compare.
*/
__global__ void kernel_TestTexFetch2(
unsigned int *g_failed
)
{
const unsigned int nThreads = blockDim.x;
const unsigned int tid = threadIdx.x;
const unsigned int bidX = blockIdx.x;
const unsigned int bidY = blockIdx.y;
const unsigned int bid = bidY * gridDim.x + bidX;
const float EPS = 1.0e-5;
__shared__ unsigned int bitsTex1, bitsTex2, bitsTex3, bitsTex4;
if (tid == 0)
{
bitsTex1 = 0;
bitsTex2 = 0;
bitsTex3 = 0;
bitsTex4 = 0;
}
__syncthreads();
float fTex1 = texfetch(tex1, tid, bidY);
if (!feqf(fTex1, 0.1f + (float) tid / nThreads, EPS))
bitsTex1 = 0x0001;
float fTex2 = texfetch(tex2, tid, bidY);
if (!feqf(fTex2, 0.2f + (float) tid / nThreads, EPS))
bitsTex2 = 0x0002;
float fTex3 = texfetch(tex3, tid, bidY);
if (!feqf(fTex3, 0.3f + (float) tid / nThreads, EPS))
bitsTex3 = 0x0004;
float fTex4 = texfetch(tex4, tid, bidY);
if (!feqf(fTex4, 0.4f + (float) tid / nThreads, EPS))
bitsTex4 = 0x0008;
__syncthreads();
if (tid == 0)
{
g_failed[bid] = bitsTex1 | bitsTex2 | bitsTex3 | bitsTex4;
}
__syncthreads();
}
/**
* Texture memory bug:
* Many textures seem to create instabilities?
*/
int main( int argc, char** argv)
{
CUT_CHECK_DEVICE();
const int nThreads = 64;
const int nGridY = 10;
const int nGridX = 200;
const int nBlocks = nGridX * nGridY;
// Allocate some host memory for holding texture values.
// The first 3 textures are 2D textures, nThreads wide x nGridY high.
// The 4th texture is 2D texture of size 1 x nGridY
float *h_tex1 = (float *) malloc(nGridY * nThreads * sizeof(float));
float *h_tex2 = (float *) malloc(nGridY * nThreads * sizeof(float));
float *h_tex3 = (float *) malloc(nGridY * nThreads * sizeof(float));
float *h_tex4 = (float *) malloc(nGridY * nThreads * sizeof(float));
// initialize the texture values with some identifying values
for (int bidY = 0; bidY < nGridY; bidY++)
{
for (int tid = 0; tid < nThreads; tid++)
{
h_tex1[bidY * nThreads + tid] = 0.1f + (float) tid / nThreads;
h_tex2[bidY * nThreads + tid] = 0.2f + (float) tid / nThreads;
h_tex3[bidY * nThreads + tid] = 0.3f + (float) tid / nThreads;
h_tex4[bidY * nThreads + tid] = 0.4f + (float) tid / nThreads;
}
}
#if (SHOWBUG != 0)
// allocate device memory for result
unsigned int cbFailed = nBlocks * sizeof(unsigned int);
unsigned int *d_failed;
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_failed, cbFailed));
CUT_CHECK_ERROR( "cudaMalloc() failed" );
#endif //SHOWBUG
//
// Initialize the 4 textures
//
//------- Texture 1 ------------------------
cudaChannelFormatDesc desc1 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *d_aTex1;
CUDA_SAFE_CALL( cudaMallocArray(&d_aTex1, &desc1, nThreads, nGridY) );
CUT_CHECK_ERROR( "cudaMallocArray() failed" );
CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex1, 0, 0, h_tex1, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );
CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );
tex1.addressMode[0] = cudaAddressModeClamp;
tex1.addressMode[1] = cudaAddressModeClamp;
tex1.filterMode = cudaFilterModePoint;
tex1.normalized = false;
CUDA_SAFE_CALL( cudaBindTexture(tex1, d_aTex1, desc1) );
CUT_CHECK_ERROR( "cudaBindTexture() failed" );
//------- Texture 2 ------------------------
cudaChannelFormatDesc desc2 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *d_aTex2;
CUDA_SAFE_CALL( cudaMallocArray(&d_aTex2, &desc2, nThreads, nGridY) );
CUT_CHECK_ERROR( "cudaMallocArray() failed" );
CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex2, 0, 0, h_tex2, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );
CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );
tex2.addressMode[0] = cudaAddressModeClamp;
tex2.addressMode[1] = cudaAddressModeClamp;
tex2.filterMode = cudaFilterModePoint;
tex2.normalized = false;
CUDA_SAFE_CALL( cudaBindTexture(tex2, d_aTex2, desc2) );
CUT_CHECK_ERROR( "cudaBindTexture() failed" );
//------- Texture 3 ------------------------
cudaChannelFormatDesc desc3 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *d_aTex3;
CUDA_SAFE_CALL( cudaMallocArray(&d_aTex3, &desc3, nThreads, nGridY) );
CUT_CHECK_ERROR( "cudaMallocArray() failed" );
CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex3, 0, 0, h_tex3, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );
CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );
tex3.addressMode[0] = cudaAddressModeClamp;
tex3.addressMode[1] = cudaAddressModeClamp;
tex3.filterMode = cudaFilterModePoint;
tex3.normalized = false;
CUDA_SAFE_CALL( cudaBindTexture(tex3, d_aTex3, desc3) );
CUT_CHECK_ERROR( "cudaBindTexture() failed" );
//------- Texture 4 ------------------------
cudaChannelFormatDesc desc4 = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *d_aTex4;
CUDA_SAFE_CALL( cudaMallocArray(&d_aTex4, &desc4, nThreads, nGridY) );
CUT_CHECK_ERROR( "cudaMallocArray() failed" );
CUDA_SAFE_CALL( cudaMemcpyToArray(d_aTex4, 0, 0, h_tex4, nThreads * nGridY * sizeof(float), cudaMemcpyHostToDevice) );
CUT_CHECK_ERROR( "cudaMemcpyToArray() failed" );
tex4.addressMode[0] = cudaAddressModeClamp;
tex4.addressMode[1] = cudaAddressModeClamp;
tex4.filterMode = cudaFilterModePoint;
tex4.normalized = false;
CUDA_SAFE_CALL( cudaBindTexture(tex4, d_aTex4, desc4) );
CUT_CHECK_ERROR( "cudaBindTexture() failed" );
//---------------------------------------------
#if (SHOWBUG == 0)
// allocate device memory for result
unsigned int cbFailed = nBlocks * sizeof(unsigned int);
unsigned int *d_failed;
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_failed, cbFailed));
CUT_CHECK_ERROR( "cudaMalloc() failed" );
#endif //!BUG
// setup execution parameters
dim3 grid( nGridX, nGridY, 1);
dim3 threads( nThreads, 1, 1);
//-------- EXECUTE KERNEL -------
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
kernel_TestTexFetch2<<< grid, threads, 0 >>>( d_failed );
CUT_CHECK_ERROR("Kernel execution failed");
CUT_SAFE_CALL( cutStopTimer( timer));
printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutDeleteTimer( timer));
// test the results
unsigned int *h_failed = (unsigned int *) malloc(cbFailed);
CUDA_SAFE_CALL( cudaMemcpy(h_failed, d_failed, cbFailed, cudaMemcpyDeviceToHost) );
for (int bid = 0; bid < nBlocks; bid++)
if (h_failed[bid] != 0)
printf("%u\tFAILED\t%u\n", bid, h_failed[bid]);
#ifdef RUN_KERNEL_TWICE
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
kernel_TestTexFetch2<<< grid, threads, 0 >>>( d_failed );
CUT_CHECK_ERROR("Kernel execution failed");
CUT_SAFE_CALL( cutStopTimer( timer));
printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutDeleteTimer( timer));
// test the results
CUDA_SAFE_CALL( cudaMemcpy(h_failed, d_failed, cbFailed, cudaMemcpyDeviceToHost) );
for (int bid = 0; bid < nBlocks; bid++)
if (h_failed[bid] != 0)
printf("%u\tFAILED\t%u\n", bid, h_failed[bid]);
#endif // RUN_KERNEL_TWICE
// cleanup memory
CUDA_SAFE_CALL( cudaUnbindTexture(tex4) );
CUDA_SAFE_CALL( cudaUnbindTexture(tex3) );
CUDA_SAFE_CALL( cudaUnbindTexture(tex2) );
CUDA_SAFE_CALL( cudaUnbindTexture(tex1) );
CUDA_SAFE_CALL( cudaFreeArray(d_aTex4) );
CUDA_SAFE_CALL( cudaFreeArray(d_aTex3) );
CUDA_SAFE_CALL( cudaFreeArray(d_aTex2) );
CUDA_SAFE_CALL( cudaFreeArray(d_aTex1) );
CUDA_SAFE_CALL( cudaFree(d_failed));
free(h_failed);
free(h_tex4);
free(h_tex3);
free(h_tex2);
free(h_tex1);
CUT_EXIT(argc, argv);
}