Hi,
The following kernel implements the prescan algorithm as given in the SDK. I am wondering how the kernel can work when TEMP is not defined anywhere. Or am I missing an important concept here?
/**
 * Exclusive prefix sum (prescan) of n floats, computed in place in shared
 * memory with an up-sweep (reduce) phase followed by a down-sweep phase.
 *
 * @param g_odata  output array in global memory; element i receives the sum
 *                 of g_idata[0..i-1] (element 0 receives 0)
 * @param g_idata  input array in global memory
 * @param n        number of elements to scan
 *
 * Launch requirements visible from the code:
 *  - Only threadIdx.x is used for indexing, so a single block processes the
 *    whole array.
 *  - Each thread loads and stores elements thid and thid + n/2 without any
 *    bounds check, so the block is expected to have exactly n/2 threads.
 *  - The halving/doubling tree loops presumably require n to be a power of
 *    two -- TODO confirm against the caller.
 *  - The dynamic shared-memory size (third launch parameter) must cover the
 *    largest padded index, (n - 1) + CONFLICT_FREE_OFFSET(n - 1), plus one
 *    float.
 *
 * NOTE(review): TEMP and CONFLICT_FREE_OFFSET are not defined in this block.
 * Both are used like function-like macros, and TEMP(...) is assigned to, so
 * it presumably expands to an element access on temp[] (e.g. temp[index]);
 * they must be #defined earlier in the enclosing header -- confirm there.
 */
__global__ void scan_best(float *g_odata, float *g_idata, int n)
{
    // Dynamically allocated shared memory for scan kernels
    extern __shared__ float temp[];

    int thid = threadIdx.x;

    // Each thread owns one element from each half of the array.
    int ai = thid;
    int bi = thid + (n/2);

    // compute spacing to avoid bank conflicts
    int bankOffsetA = CONFLICT_FREE_OFFSET(ai);
    int bankOffsetB = CONFLICT_FREE_OFFSET(bi);

    // Cache the computational window in shared memory
    TEMP(ai + bankOffsetA) = g_idata[ai];
    TEMP(bi + bankOffsetB) = g_idata[bi];

    int offset = 1;

    // build the sum in place up the tree
    for (int d = n/2; d > 0; d >>= 1)
    {
        // Barrier is outside the divergent branch so every thread reaches it;
        // it makes the previous level's writes visible before they are read.
        __syncthreads();

        if (thid < d)
        {
            // Tree-node indices for this level. Named distinctly from the
            // outer ai/bi, which are reused for the final write-back.
            int left  = offset*(2*thid+1)-1;
            int right = offset*(2*thid+2)-1;
            left  += CONFLICT_FREE_OFFSET(left);
            right += CONFLICT_FREE_OFFSET(right);

            TEMP(right) += TEMP(left);
        }

        offset *= 2;
    }

    // scan back down the tree

    // clear the last element
    // (thread 0 also wrote this slot in the final up-sweep step, so no
    // barrier is needed between that write and this one)
    if (thid == 0)
    {
        int index = n - 1;
        index += CONFLICT_FREE_OFFSET(index);
        TEMP(index) = 0;
    }

    // traverse down the tree building the scan in place
    for (int d = 1; d < n; d *= 2)
    {
        offset /= 2;

        __syncthreads();

        if (thid < d)
        {
            int left  = offset*(2*thid+1)-1;
            int right = offset*(2*thid+2)-1;
            left  += CONFLICT_FREE_OFFSET(left);
            right += CONFLICT_FREE_OFFSET(right);

            // Left child takes the parent's value; right child takes
            // parent + old left value.
            float t = TEMP(left);
            TEMP(left) = TEMP(right);
            TEMP(right) += t;
        }
    }

    // Ensure the last down-sweep level is complete before reading results.
    __syncthreads();

    // write results to global memory
    g_odata[ai] = TEMP(ai + bankOffsetA);
    g_odata[bi] = TEMP(bi + bankOffsetB);
}
#endif // #ifndef SCAN_BEST_KERNEL_H