hi Folks:
A reduction kernel from Mark Harris:
__global__ voidreduce6(int *g_idata, int *g_odata, unsigned int n)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + tid;
unsigned int gridSize = blockSize*2*gridDim.x;
sdata[tid] = 0;
while (i < n){sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; }
__syncthreads();
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
if (tid < 32)warpReduce(sdata, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
But the following line I am not sure:
while (i < n){sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; }
n is the size of g_idata (input). I am wondering if there is a typo in this line. Should it be:
while (i + blockSize < n){sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; }
or something like that? Thanks.