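This version does NOT work (tmp is in shared memory): the loop condition depends on threadIdx.x, so the __syncthreads() inside the loop is not reached by every thread of the block: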
// Attempted pairwise in-place sum over tmp: each pass adds tmp[i + stride]
// into tmp[i] and then halves stride.
// This fragment is the broken variant; the comments below explain why.
__syncthreads();
uint32_t stride = blockDim.x >> 1;
// PROBLEM: the loop condition depends on threadIdx.x, so it is not uniform
// across the block. Threads with threadIdx.x >= blockDim.x >> 1 never enter
// the loop, and the remaining threads leave it on different iterations as
// stride shrinks.
while(threadIdx.x < stride){
tmp[threadIdx.x] += tmp[threadIdx.x + stride];
stride >>= 1;
// This barrier is only executed by the threads still inside the loop.
// __syncthreads() must be reached by all threads of the block; placing it
// in divergent control flow can hang or give wrong results.
__syncthreads();
}
// This extra barrier does not repair the mismatched barriers inside the loop.
__syncthreads(); // Just in case :-)
This version works (tmp is in shared memory), because the __syncthreads() in the loop is outside the threadIdx.x-dependent branch:
// Pairwise in-place sum over tmp (described as shared memory by the
// surrounding text). Each pass adds tmp[i + stride] into tmp[i] for the
// threads with threadIdx.x < stride, then halves stride; the loop ends once
// stride reaches 0.
// NOTE(review): stride is not declared in this fragment -- presumably it
// starts at blockDim.x >> 1 with a power-of-two blockDim.x; confirm.
__syncthreads();  // barrier before the first pass reads tmp
while (stride != 0) {
    const unsigned int lane = threadIdx.x;
    if (lane < stride) {
        tmp[lane] += tmp[lane + stride];
    }
    stride >>= 1;
    // Deliberately outside the if: threads that did not add on this pass
    // still execute the barrier.
    __syncthreads();
}
__syncthreads(); // Just in case :-)