Hi, I have a problem with a buffer sum reduction. This is the kernel code:
__kernel void evaluate(__global const uint *pFirstBuffer,
__global const double *pSecondBuffer,
const uint N,
__global uint *pPartialSumBuffer,
__global uint *pOutTestBuffer)
{
uint gid = get_global_id(0);
uint lid = get_local_id(0);
uint grp_id = get_group_id(0);
pPartialSumBuffer[grp_id] = 0;
__local uint lcl_acc[LOCAL_WORK_SIZE]; //I call this kernel with a local work size of 256
lcl_acc[lid] = 0; //and a global size of 220928 (multiple of 256)
barrier(CLK_LOCAL_MEM_FENCE);
if (gid <= N)
{
double output = … //Here i do some simple computation using pSecondBuffer
if (output <= 0)
pOutTestBuffer[gid] = 1;
else
pOutTestBuffer[gid] = 0;
if (output <= 0)
{
lcl_acc[lid] = 1;
}
}
uint priv_acc = lcl_acc[lid];
barrier(CLK_LOCAL_MEM_FENCE);
uint dist = LOCAL_WORK_SIZE;
while ( dist > 1 )
{
dist >>= 1;
if (gid <= N && lid < dist)
{
priv_acc += lcl_acc[lid + dist];
lcl_acc[lid] = priv_acc;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (gid <= N && lid == 0)
{
pPartialSumBuffer[grp_id] = priv_acc;
}
}
Now, after kernel execution, if I check the number of ‘1’ values reducted in pPartialSumBuffer I obtain 2249, while if I check the number of ‘1’ in pOutTestBuffer I obtain 2248.
Why this difference? Something wrong in the way I reduce the local buffer in pPartialSumBuffer?
Tks to all.