Hi,
I try to do a inclusive scan on 128 int on a GTX 470, but it fails !
It works fine on a Quadro FX380 !!
It only works if I put ‘barriers’ everywhere !!! But the goal is to avoid synchro ! It should be naturally handled by the warp !
inline
uint4 inclusive_scan_128(uint4 initialValue, __local uint* bitsOnCount)
{
__local uint localBuffer[256];
const uint tid = (uint)get_local_id(0);
uint lane = tid & SIMT_1;
uint block = tid >> 5;
//---- scan : 4 bits
uint4 localBits = initialValue;
localBits.y += localBits.x;
localBits.z += localBits.y;
localBits.w += localBits.z;
//---- scan the last 4x32 bits (The sum in the previous scan)
uint tid2 = block * 2 * SIMT + lane;
localBuffer[tid2] = 0;
tid2 += SIMT;
localBuffer[tid2] = localBits.w;
localBuffer[tid2] += localBuffer[tid2 - 1];
localBuffer[tid2] += localBuffer[tid2 - 2];
localBuffer[tid2] += localBuffer[tid2 - 4];
localBuffer[tid2] += localBuffer[tid2 - 8];
localBuffer[tid2] += localBuffer[tid2 - 16];
//---- Add the sum to create a scan of 128 bits
return localBits + localBuffer[tid2 - 1];
}