To reduce 192 elements, I’ve decided to use 64 threads. First, elements 128–191 are added into elements 0–63; after that, it comes back to the usual parallel reduction case.
Why is it, then, that this code:
// Parallel sum reduction over the float data reachable through smem; each
// thread works on the element selected by its own tid.
// NOTE(review): the declarations of smem and tid are outside this snippet;
// the accompanying text says 192 elements and 64 threads -- confirm.
//
// Step 1: threads with tid < 64 add smem[tid + 128] into smem[tid].
// NOTE(review): no barrier is visible before this step; whatever filled smem
// must already have been synchronized -- TODO confirm in the enclosing kernel.
if (tid < 64) {
smem[tid] += smem[tid + 128];
// NOTE(review): this barrier sits inside a conditional. It is only well
// defined if the condition evaluates identically for every thread of the
// block, i.e. the block has at most 64 threads -- verify the launch config.
__syncthreads();
}
// Step 2: threads with tid < 64 add smem[tid + 64] into smem[tid].
if (tid < 64) {
smem[tid] += smem[tid + 64];
// NOTE(review): same conditional-barrier caveat as in step 1.
__syncthreads();
}
// Final steps, unrolled by hand for threads 0..31: offsets 32, 16, 8, 4, 2, 1
// are applied back to back with no barrier in between.
if (tid < 32) {
// Access through a volatile pointer so that each read and write below is
// really performed on memory instead of being kept in a register.
volatile float* vsmem = smem;
// NOTE(review): every thread with tid < 32 executes all six lines, so from
// offset 16 downwards some threads overwrite elements that other threads
// read in the same step (e.g. threads 16..31 write what threads 0..15 read).
// That is only safe if those threads run in lockstep; presumably on GPUs
// with independent thread scheduling a __syncwarp() between the read and
// the write of each step is required -- confirm for the target architecture.
vsmem[tid] += vsmem[tid + 32];
vsmem[tid] += vsmem[tid + 16];
vsmem[tid] += vsmem[tid + 8];
vsmem[tid] += vsmem[tid + 4];
vsmem[tid] += vsmem[tid + 2];
vsmem[tid] += vsmem[tid + 1];
}
doesn’t produce the same results as this code:
// Serial reference: the thread with tid == 0 performs every reduction step
// by itself, in order, leaving the total in smem[0].
if (tid == 0) {
    // Fold elements 128..191 into elements 0..63.
    for (unsigned int k = 0; k < 64; ++k) {
        smem[k] += smem[k + 128];
    }
    // Fold the upper half of the active range into the lower half, halving
    // the width on every pass: 64, 32, 16, 8, 4, 2, 1.
    for (unsigned int width = 64; width > 0; width >>= 1) {
        for (unsigned int k = 0; k < width; ++k) {
            smem[k] += smem[k + width];
        }
    }
}