I have used reduction kernel to compute the sum.
Please,Can anyone tell me whats wrong with this : I am getting wrong output
__global__ void compute(int *ga){
int id = threadIdx.x + (blockDim.x * blockIdx.x);
extern __shared__ int s[];
s[id] = MAX;
if (id<MAX)
{
s[id] = ga[id];
}
__syncthreads();
//if (s[id] % 3 == 0 || s[id] % 5 == 0)
// s[id] = id;
//applying the reeduction
if(blockDim.x >= 1024){
if (id < 512){
s[id] = s[id] + s[id + 512];
}
__syncthreads();
}
__syncthreads();
if (blockDim.x >= 512){
if (id < 256)
{
s[id] = s[id] + s[id + 256];
}
__syncthreads();
}
if (blockDim.x >= 256){
if (id < 128)
{
s[id] = s[id] + s[id + 128];
}
__syncthreads();
}
if (blockDim.x >= 128){
if (id < 64)
{
s[id] = s[id] + s[id + 64];
}
__syncthreads();
}
//if this is the last warp
if (id < 32)
{
if (blockDim.x >= 64)
s[id] = s[id] + s[id + 32];
if (blockDim.x >= 32)
s[id] = s[id] + s[id + 16];
if (blockDim.x >= 16)
s[id] = s[id] + s[id + 8];
if (blockDim.x >= 8)
s[id] = s[id] + s[id + 4];
if (blockDim.x >= 4)
s[id] = s[id] + s[id + 2];
if (blockDim.x >= 2)
s[id] = s[id] + s[id + 1];
}
//thread zero will store min of this block i.e. s[0];
if (id == 0)
{
ga[blockIdx.x] = s[0];
}
}