Hi Tera,

I want to optimize the code according to kernel 5 in Nvidia parallel reduction example http://developer.download.nvidia.com/compute/cuda/1_1/Website/projects/reduction/doc/reduction.pdf, but I have a problem:

for example: we have p1, p2, p3, …p8, This function is only for calculating P1+P2+P3…+P8, the final result is P12345678, P7531, p62, p51, p4, p3, p2, p1 using warp, but if we need get all results of p1+p2+p3…+p8, p1+p2+p3+…P7, p1+p2+p3…+p6, …p2+p1, p1, do you have any suggestion?

```
template <class T, unsigned int blockSize>
__global__ void
reduce5_bak(T *g_idata, T *g_odata, unsigned int n)
{
T *sdata = SharedMemory<T>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;
T mySum = (i < n) ? g_idata[i] : 0;
if (i + blockSize < n)
mySum += g_idata[i+blockSize];
sdata[tid] = mySum;
__syncthreads();
// do reduction in shared mem
if (blockSize >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] = mySum = mySum + sdata[tid + 64]; } __syncthreads(); }
#ifndef __DEVICE_EMULATION__
if (tid < 32)
#endif
{
// now that we are using warp-synchronous programming (below)
// we need to declare our shared memory volatile so that the compiler
// doesn't reorder stores to it and induce incorrect behavior.
volatile T* smem = sdata;
if (blockSize >= 64) { smem[tid] = mySum = mySum + smem[tid + 32]; EMUSYNC; }
if (blockSize >= 32) { smem[tid] = mySum = mySum + smem[tid + 16]; EMUSYNC; }
if (blockSize >= 16) { smem[tid] = mySum = mySum + smem[tid + 8]; EMUSYNC; }
if (blockSize >= 8) { smem[tid] = mySum = mySum + smem[tid + 4]; EMUSYNC; }
if (blockSize >= 4) { smem[tid] = mySum = mySum + smem[tid + 2]; EMUSYNC; }
if (blockSize >= 2) { smem[tid] = mySum = mySum + smem[tid + 1]; EMUSYNC; }
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
```