To get a better handle on working with CUDA threads and blocks, I’m trying to perform a variant of array reduction in which, rather than summing each block up like a binary tree, I atomically add two blocks together and then perform the reduction as if they were a single block.
// Block-level sum reduction: a grid-stride first pass folds pairs of input
// elements per thread into shared memory, then a tree reduction collapses
// the block. blockSize is the per-block thread count (template parameter)
// and must equal blockDim.x at launch; launch with blockSize * sizeof(int)
// bytes of dynamic shared memory. One partial sum is written per block to
// g_odata[blockIdx.x].
template <unsigned int blockSize>
__global__ void reduce5(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;

    // First level of reduction (global -> shared), two elements per thread
    // per iteration. Shared memory is uninitialized, so zero it first.
    sdata[tid] = 0;
    while (i < n) {
        sdata[tid] += g_idata[i];
        // Bug fix: the original read g_idata[i + blockSize] unconditionally,
        // which is out of bounds whenever n is not a multiple of blockSize*2.
        if (i + blockSize < n)
            sdata[tid] += g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();

    // Tree reduction in shared memory. blockSize is a compile-time constant,
    // so these outer branches are uniform across the whole block and the
    // __syncthreads() calls inside them are reached by every thread.
    if (blockSize >= 512) {
        if (tid < 256) sdata[tid] += sdata[tid + 256];
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) sdata[tid] += sdata[tid + 128];
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) sdata[tid] += sdata[tid + 64];
        __syncthreads();
    }
    // Final 32 elements are reduced warp-synchronously (defined elsewhere).
    if (tid < 32) warpReduce<blockSize>(sdata, tid);

    // Write this block's partial sum to global memory.
    if (tid == 0) {
        g_odata[blockIdx.x] = sdata[0];
        // Bug fix: the original used the undefined identifiers `blocks` and
        // `numBlocks`; the number of blocks in the grid is gridDim.x.
        //
        // NOTE(review): this cross-block combine is still a data race even
        // with the names fixed. There is no grid-wide barrier here, so the
        // partner block may not have written g_odata[blockIdx.x + gridDim.x/2]
        // yet when this block reads it. atomicAdd makes the read-modify-write
        // on the destination atomic, but it cannot order this block after the
        // producing block. To combine block results safely: launch a second
        // reduction kernel over g_odata, use a cooperative launch with
        // cg::this_grid().sync() before this step, or have every block do
        // atomicAdd(&g_odata[0], sdata[0]) into a single pre-zeroed cell.
        if (blockIdx.x + gridDim.x / 2 < gridDim.x)
            atomicAdd(&g_odata[blockIdx.x],
                      g_odata[blockIdx.x + gridDim.x / 2]);
    }
}
I took the sample reduction code provided by the SDK and added this single line at the bottom:
atomicAdd(&g_odata[blockIdx.x], g_odata[blockIdx.x+(blockDim.x/2)]);
I’m trying to do a sort of sequential atomic addition here so that for 8 blocks block 1 will be atomically added to block 4, block 2 to 5, etc. Am I going in the right direction?