this is the original code I used for reduction -
for (unsigned int s = (BLOCK_SIZE * BLOCK_SIZE) / 2; s>0; s>>=1)
{
if ((BLOCK_SIZE * ty + tx) < s)
temps[BLOCK_SIZE * ty + tx] += temps[BLOCK_SIZE * ty + tx+s];
__syncthreads();
}
float VarienceS = temps[0];
this is the unrolled code by cuda, and it gives me differnt outputs the the original , and I dont know why -
if (BLOCK_SIZE * ty + tx < 128) temps[BLOCK_SIZE * ty + tx]+= temps[BLOCK_SIZE * ty + tx + 128];
__syncthreads();
if (BLOCK_SIZE * ty + tx < 64) temps[BLOCK_SIZE * ty + tx]+= temps[BLOCK_SIZE * ty + tx + 64];
__syncthreads();
if (BLOCK_SIZE * ty + tx < 32)
{
temps[BLOCK_SIZE * ty + tx] +=temps[BLOCK_SIZE * ty + tx + 32];
temps[BLOCK_SIZE * ty + tx] += temps[BLOCK_SIZE * ty + tx + 16];
temps[BLOCK_SIZE * ty + tx] += temps[BLOCK_SIZE * ty + tx + 8];
temps[BLOCK_SIZE * ty + tx] += temps[BLOCK_SIZE * ty + tx + 4];
temps[BLOCK_SIZE * ty + tx] += temps[BLOCK_SIZE * ty + tx + 2];
temps[BLOCK_SIZE * ty + tx] += temps[BLOCK_SIZE * ty + tx + 1];
}
Num Of Threads is known and is 256 ,
BLOCKSIZE IS 16