I’m trying to reduce a table of any size. This is how I programmed
device float FullReduction(float* _iarray,int row, int col)
{
//shared memory array shared float sdata[BLOCK_SIZE];
int tid = threadIdx.x;
//int i = blockIdx.x *(blockDim.x) + threadIdx.x;
int size = col * row;
//each thread load data to shared memory
sdata[tid] = 0;
__syncthreads();
//total number of blocks which can contain the array
int nblocks = (int)size / BLOCK_SIZE + ( (size % BLOCK_SIZE) == 0 ? 0:1);
//case where the number of elements in the array is higher than the number of threads in a single block
if(nblocks < 2){
if(tid < size)
sdata[tid] = _iarray[tid];
__syncthreads();
}
//case where the array elements are dispatched in multiple threads block
else{
for (int k = 0; k < nblocks; k++){
if((tid + k*BLOCK_SIZE) < size)
sdata[tid] += _iarray[tid + k * BLOCK_SIZE];
__syncthreads();
}
}
//loop over sdata
for(int j = BLOCK_SIZE / 2; j > 0; j>>=1)
{
if(tid < j){
sdata[tid] += sdata[tid + j];
}
__syncthreads();
}
//write back the data to the output array
//if(tid == 0 )
return sdata[0];
}
The result is ok when my array is dispatched into multiple blocks.
But when it is only one block the program doesn’t return what I expected.
I’m trying to reduce a table of any size. This is how I programmed
device float FullReduction(float* _iarray,int row, int col)
{
//shared memory array shared float sdata[BLOCK_SIZE];
int tid = threadIdx.x;
//int i = blockIdx.x *(blockDim.x) + threadIdx.x;
int size = col * row;
//each thread load data to shared memory
sdata[tid] = 0;
__syncthreads();
//total number of blocks which can contain the array
int nblocks = (int)size / BLOCK_SIZE + ( (size % BLOCK_SIZE) == 0 ? 0:1);
//case where the number of elements in the array is higher than the number of threads in a single block
if(nblocks < 2){
if(tid < size)
sdata[tid] = _iarray[tid];
__syncthreads();
}
//case where the array elements are dispatched in multiple threads block
else{
for (int k = 0; k < nblocks; k++){
if((tid + k*BLOCK_SIZE) < size)
sdata[tid] += _iarray[tid + k * BLOCK_SIZE];
__syncthreads();
}
}
//loop over sdata
for(int j = BLOCK_SIZE / 2; j > 0; j>>=1)
{
if(tid < j){
sdata[tid] += sdata[tid + j];
}
__syncthreads();
}
//write back the data to the output array
//if(tid == 0 )
return sdata[0];
}
The result is ok when my array is dispatched into multiple blocks.
But when it is only one block the program doesn’t return what I expected.