Hybrid Atomic Reduction

To get a better handle on working with CUDA threads and blocks, I’m trying to perform a sort of array reduction in which, rather than summing each block up like a binary tree, I atomically add two blocks together and then carry on with the reduction as if they were a single block.

template <unsigned int blockSize>
__global__ void reduce5(int *g_idata, int *g_odata, unsigned int n) {
  extern __shared__ int sdata[];
  // first level of reduction (global -> shared)
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
  unsigned int gridSize = blockSize * 2 * gridDim.x;
  sdata[tid] = 0;
  // reduce multiple elements per thread
  while (i < n) {
    sdata[tid] += g_idata[i] + g_idata[i+blockSize];
    i += gridSize;
  }
  __syncthreads();
  // reduce in shared memory
  if (blockSize >= 512) {
    if (tid < 256)
      sdata[tid] += sdata[tid + 256];
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128)
      sdata[tid] += sdata[tid + 128];
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64)
      sdata[tid] += sdata[tid + 64];
    __syncthreads();
  }
  if (tid < 32) warpReduce<blockSize>(sdata, tid);
  // write result for block to global memory
  if (tid == 0) {
    g_odata[blockIdx.x] = sdata[0];
    // fold the partner block's partial sum into this block's slot
    if (blockIdx.x + gridDim.x / 2 < gridDim.x)
      atomicAdd(&g_odata[blockIdx.x], g_odata[blockIdx.x + gridDim.x / 2]);
  }
}
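
For completeness, the warpReduce helper the kernel calls is the usual unrolled warp-level step from the SDK reduction sample, something like this:

template <unsigned int blockSize>
__device__ void warpReduce(volatile int *sdata, unsigned int tid) {
  // unrolled final steps; threads in a warp run in lockstep,
  // so no __syncthreads() is needed here
  if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
  if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
  if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
  if (blockSize >= 8)  sdata[tid] += sdata[tid + 4];
  if (blockSize >= 4)  sdata[tid] += sdata[tid + 2];
  if (blockSize >= 2)  sdata[tid] += sdata[tid + 1];
}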

I took the sample reduction code provided by the SDK and added this single line at the bottom:

atomicAdd(&g_odata[blockIdx.x], g_odata[blockIdx.x + (gridDim.x / 2)]);

I’m trying to do a sort of pairwise atomic addition here, so that with 8 blocks, block 4's partial sum is atomically added into block 0's slot, block 5's into block 1's, and so on. Am I going in the right direction?
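
For context, here is a minimal sketch of how I'm launching it; the array size, block count, and thread count below are placeholders rather than my exact test values:

// host-side launch sketch (placeholder sizes)
const unsigned int n = 1 << 20;    // input elements
const unsigned int threads = 512;  // threads per block
const unsigned int blocks = 8;     // number of blocks

int *d_idata, *d_odata;
cudaMalloc(&d_idata, n * sizeof(int));
cudaMalloc(&d_odata, blocks * sizeof(int));
// ... copy the input array into d_idata ...

// one int of shared memory per thread
reduce5<512><<<blocks, threads, threads * sizeof(int)>>>(d_idata, d_odata, n);

// the idea is that afterwards the first blocks/2 entries of d_odata
// hold the pairwise-combined partial sums, ready for the next pass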