I have a race condition in the code below. I wanted to resolve it with an atomic division, but CUDA doesn't provide such an atomic function. Does anybody have ideas? Thank you so much.
// do reduction in shared mem
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
{
    if (tid < (blockSize - s))
    {
        // race condition problem starts
        myProduct *= sdata[tid + s];
        while (myProduct > 0x0000FFFF) {
            myProduct = myProduct / 256;
        }
        sdata[tid] = myProduct;
        // race condition problem ends
    }
    __syncthreads();
}
This looks like you attempted a reduction but got the if-condition wrong, introducing a write-after-read hazard. There is no need for atomic operations, just fix the synchronization. The following code should work (assuming I'm not yet too tired…):
// do reduction in shared mem
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) {
        myProduct *= sdata[tid + s];
        while (myProduct > 0x0000FFFF) {
            myProduct = myProduct / 256;
        }
        sdata[tid] = myProduct;
    }
    __syncthreads();
}
If I've been too tired and the code doesn't work, try inserting a __syncthreads() between the myProduct *= sdata[tid + s] read and the sdata[tid] = myProduct write.
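If that extra barrier really turns out to be necessary, keep in mind that __syncthreads() must be reached by every thread of the block, so it cannot sit inside the if (tid < s) branch. Here is a minimal sketch of the split read/sync/write version, assuming unsigned int values, a power-of-two block size, and an input length that is a multiple of the block size; the kernel name blockProductReduce and the g_data/g_out parameters are made up for illustration.

// Sketch: one block reduces its chunk by multiplication, rescaling to stay small.
__global__ void blockProductReduce(const unsigned int *g_data, unsigned int *g_out)
{
    extern __shared__ unsigned int sdata[];
    unsigned int tid = threadIdx.x;

    // assumes the grid covers exactly gridDim.x * blockDim.x input elements
    unsigned int myProduct = g_data[blockIdx.x * blockDim.x + tid];
    sdata[tid] = myProduct;
    __syncthreads();

    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        unsigned int neighbour = 1;            // identity for multiplication
        if (tid < s)
            neighbour = sdata[tid + s];        // read phase
        __syncthreads();                       // all reads finish before any write
        if (tid < s)
        {
            myProduct *= neighbour;
            while (myProduct > 0x0000FFFF)     // rescale to avoid overflow
                myProduct /= 256;
            sdata[tid] = myProduct;            // write phase
        }
        __syncthreads();                       // writes visible before next round
    }

    if (tid == 0)
        g_out[blockIdx.x] = sdata[0];          // one partial result per block
}

It would be launched with the shared-memory size as the third launch parameter, e.g. blockProductReduce<<<numBlocks, threads, threads * sizeof(unsigned int)>>>(d_data, d_out);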
For example, say we have p1, p2, p3, …, p8. This function only calculates p1+p2+p3+…+p8: the final result is p12345678, and along the way the warp-synchronous steps only produce partial sums like p7531, p62, p51, p4, p3, p2, p1. But if we need all of the prefix results p1+p2+…+p8, p1+p2+…+p7, p1+p2+…+p6, …, p1+p2, p1, do you have any suggestion? (See the prefix-scan sketch below the kernel.)
template <class T, unsigned int blockSize>
__global__ void
reduce5_bak(T *g_idata, T *g_odata, unsigned int n)
{
    T *sdata = SharedMemory<T>();

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;

    T mySum = (i < n) ? g_idata[i] : 0;
    if (i + blockSize < n)
        mySum += g_idata[i+blockSize];

    sdata[tid] = mySum;
    __syncthreads();

    // do reduction in shared mem
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] = mySum = mySum + sdata[tid +  64]; } __syncthreads(); }

#ifndef __DEVICE_EMULATION__
    if (tid < 32)
#endif
    {
        // now that we are using warp-synchronous programming (below)
        // we need to declare our shared memory volatile so that the compiler
        // doesn't reorder stores to it and induce incorrect behavior.
        volatile T* smem = sdata;
        if (blockSize >= 64) { smem[tid] = mySum = mySum + smem[tid + 32]; EMUSYNC; }
        if (blockSize >= 32) { smem[tid] = mySum = mySum + smem[tid + 16]; EMUSYNC; }
        if (blockSize >= 16) { smem[tid] = mySum = mySum + smem[tid +  8]; EMUSYNC; }
        if (blockSize >=  8) { smem[tid] = mySum = mySum + smem[tid +  4]; EMUSYNC; }
        if (blockSize >=  4) { smem[tid] = mySum = mySum + smem[tid +  2]; EMUSYNC; }
        if (blockSize >=  2) { smem[tid] = mySum = mySum + smem[tid +  1]; EMUSYNC; }
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
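What you are describing is an inclusive prefix sum (a scan) rather than a reduction. The SDK ships a scan sample, and thrust::inclusive_scan does the same job; just to illustrate the idea, here is a minimal single-block sketch in the Hillis-Steele style, assuming n <= blockDim.x. The kernel name inclusiveScanSmall and its g_idata/g_odata parameters are made up for illustration.

// Sketch: each output element k receives p1 + ... + p(k+1).
__global__ void inclusiveScanSmall(const float *g_idata, float *g_odata,
                                   unsigned int n)
{
    extern __shared__ float sdata[];
    unsigned int tid = threadIdx.x;

    sdata[tid] = (tid < n) ? g_idata[tid] : 0.0f;
    __syncthreads();

    // Hillis-Steele: after the step with offset s, sdata[tid] holds the sum of
    // the last min(2*s, tid+1) inputs ending at index tid.
    for (unsigned int s = 1; s < blockDim.x; s <<= 1)
    {
        float addend = 0.0f;
        if (tid >= s)
            addend = sdata[tid - s];   // read phase
        __syncthreads();               // all reads finish before any write
        sdata[tid] += addend;          // write phase
        __syncthreads();
    }

    if (tid < n)
        g_odata[tid] = sdata[tid];     // the k-th prefix sum
}

For eight values it could be launched as inclusiveScanSmall<<<1, 8, 8 * sizeof(float)>>>(d_in, d_out, 8); g_odata then holds p1, p1+p2, …, p1+…+p8. For inputs larger than one block you would scan each block, scan the per-block totals, and add those back, which is essentially what the SDK scan sample (and Thrust) does for you.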