Hi,
I have a project I have to write a kernel to scan an array of any size:
1- size can be non power of 2
2- array can be dispatched in many threads blocks
3- size could be less than the threads in a single block
I wrote a programs which gives me results I wanted. But I’m not sure
this is the best "parallel efficient " way.
global void reduceKernel(float* _iarray, float* _oarray)
{
//shared memory array
shared float sdata[BLOCK_SIZE];
int tid = threadIdx.x;
int i = blockIdx.x *(blockDim.x *2 ) + threadIdx.x;
//each thread load data to shared memory
sdata[tid] = 0;
//total number of blocks which can contain the array
int nblocks = (int)ARRAYSIZE / BLOCK_SIZE + ( (ARRAYSIZE % BLOCK_SIZE) == 0 ? 0:1);
//case where the number of elements in the array is higher than the number of threads in a single block
if(ARRAYSIZE < BLOCK_SIZE){
if(tid < ARRAYSIZE)
sdata[i] = _iarray[i];
}
//case where the array elements are dispatched in multiple threads blocks
else{
for (int k = 0; k < nblocks; k++){
if((i + k*blockDim.x) < ARRAYSIZE)
sdata[tid] += _iarray[i + k * blockDim.x];
}
}
//synchronization
__syncthreads();
//loop over sdata
for(int j = blockDim.x/2; j > 0; j>>=1)
{
if(tid < j){
sdata[tid] += sdata[tid + j];
}
__syncthreads();
}
//write back the data to the output array
if(tid == 0 )
_oarray[0] = sdata[0];
}
Any comments will be welcome
Thank you