[quote name=‘SPWorley’ post=‘561307’ date=‘Jul 3 2009, 02:09 PM’]
Post your failing test code.
I’ve used atomics extensively. I’ve often had problems too, but it’s always been because I screwed up. Once you get fancy (especially in try-until-success test loops) the chances of design error is huge.
If you have correct code that fails, post it so NVidia can figure it out.
As for fixed point, you likely should choose your range and just make two mapping functions something like:
[codebox]typedef struct {
int mX; // columns
int mY; // rows
int mZ; // slices
float* elements;
} Matrix3D;
// device code
global void
k_TestAtomicExch( Matrix3D m, int aDim )
{
// 3D thread blocks: 8x8x8,
volatile int bDimX = blockDim.x;
volatile int bDimY = blockDim.y;
volatile int bDimZ = blockDim.z;
volatile int bDimXY = blockDim.x*blockDim.y;
volatile float bThreads = (float)(blockDim.x*blockDim.y*blockDim.z);
int c;
int r;
int s;;
int gNdx;
float* tmp;
__syncthreads();
if (threadIdx.x == bDimX-1 && threadIdx.y == bDimY-1 && threadIdx.z == bDimZ-1) {
for (c=0; c<aDim; c++) {
for (r=0; r<aDim; r++) {
for (s=0; s<aDim; s++) {
gNdx = c + r*aDim + s*aDim*aDim;
tmp = &m.elements[gNdx];
atomicExch( &m.elements[gNdx], ( *tmp + bThreads ) );
}
}
}
}
}
// host side
extern “C” void
ciTestAtomicExch( Matrix3D &m3d, int aDim )
{
m3d.mX = aDim;
m3d.mY = aDim;
m3d.mZ = aDim;
const int bThreads = 512;
// using 8x8x8 3D blocks
int gxDim = m3d.mX/8;
int gyDim = m3d.mY/8;
int gzDim = m3d.mZ/8;
int blocks = gxDim * gyDim * gzDim;
int tThreads = bThreads * blocks;
int eCnt = 0;
int sCnt = 0;
float sum = 0;
float avgVal;
float minVal = FLT_MAX;
float value;
bool error = false;
dim3 dimBlock(8,8,8);
dim3 dimGrid(gxDim,gyDim*gzDim);
cutilSafeCall( cudaMalloc( (void**) &m3d.elements, tThreads * sizeof( float )) );
float* h_fArray = (float*) calloc( tThreads, sizeof( float ));
cutilSafeCall( cudaMemcpy( m3d.elements, h_fArray, tThreads * sizeof( float ), cudaMemcpyHostToDevice) );
k_TestAtomicExch <<< dimGrid, dimBlock >>> ( m3d, aDim );
cudaThreadSynchronize();
cutilSafeCall( cudaMemcpy( h_fArray, m3d.elements, tThreads * sizeof( float ), cudaMemcpyDeviceToHost) );
for (int c=0; c<aDim; ++c) {
for (int r=0; r<aDim; ++r) {
for (int s=0; s<aDim; ++s) {
value = h_fArray[c + r*aDim + s*aDim*aDim];
if (value < minVal) minVal = value;
sum = sum + value;
if ( (int)h_fArray[c + r*aDim + s*aDim*aDim] != tThreads ) {
error = true;
eCnt++;
// printf(" Found error at index %d, %d, %d illegal value of %g\n",c,r,s,h_fArray[c + r*aDim + s*aDim*aDim]);
}
else sCnt++;
}
}
}
avgVal = sum/(float)(eCnt+sCnt);
if (!error) printf(" Success!! No errors found in %d blocks!",blocks);
else {
printf(" Errors found: %d correct values, %d errors\n",sCnt,eCnt);
printf(" Success Rate: %g %\n",(float)sCnt/(float)(sCnt+eCnt)*100);
printf(" Min value: %g, Avg: %g\n",minVal,avgVal);
}
free(h_fArray);
cutilSafeCall( cudaFree(m3d.elements) );
}[/codebox]
Output from a run typically will have numbers like this:
Errors found: 0 correct values, 32768 errors
Success Rate: 0
Min value: 2048, Avg: 22229.1