As mentioned above, the original code had a bug: it was missing the blockSize*2 factor. Unfortunately, it also suffers from the Fermi incompatibility described in the tuning guide (the warp-synchronous final stage needs a volatile pointer to shared memory), and it crashes when the array length is not a power of 2. The NVIDIA reduction sample shows the correct way to do it. Here is the code tweaked to work correctly on Fermi and with any array length:
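For context, the blockSize*2 factor matters because each thread loads two elements in the first pass, so both the per-block base index and the grid stride have to cover blockSize*2 elements per block. The buggy line below is a reconstruction for illustration, not the exact original:

// buggy (reconstruction):   unsigned int i = blockIdx.x*blockSize + threadIdx.x;
// fixed (as in the kernel): unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;
//                           unsigned int gridSize = blockSize*2*gridDim.x;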
////////////////////////////
/*
This version reduces multiple elements per thread sequentially (using fminf rather than
addition). This lowers the overall cost of the algorithm while keeping the work complexity
O(n) and the step complexity O(log n). (Brent's Theorem optimization)
*/
template <class T, unsigned int blockSize>
__global__ void reduce_min_kernel(T *g_odata, T *g_idata, unsigned int n)
{
SharedMemory<T> smem;
T *sdata = smem.getPointer();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;
unsigned int gridSize = blockSize*2*gridDim.x;
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridSize). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
    T thmin = g_idata[i];
    if (i + blockSize < n)                        // the second element may be past the end of the array
        thmin = fminf(thmin, g_idata[i + blockSize]);
    i += gridSize;
    while (i < n)
    {
        // always fold in g_idata[i]; guarding the whole pair at once would silently
        // drop the tail elements whenever i + blockSize ran off the end
        thmin = fminf(thmin, g_idata[i]);
        if (i + blockSize < n)
            thmin = fminf(thmin, g_idata[i + blockSize]);
        i += gridSize;
    }
sdata[tid] = thmin;
__syncthreads();
// do reduction in shared mem
if (blockSize >= 512) { if (tid < 256) { sdata[tid] = fminf(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] = fminf(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] = fminf(sdata[tid], sdata[tid + 64]); } __syncthreads(); }
if (tid < 32)
{
volatile T* smem2 = sdata;
if (blockSize >= 64) { smem2[tid] = fminf(smem2[tid], smem2[tid + 32]); }
if (blockSize >= 32) { smem2[tid] = fminf(smem2[tid], smem2[tid + 16]); }
if (blockSize >= 16) { smem2[tid] = fminf(smem2[tid], smem2[tid + 8]); }
if (blockSize >= 8) { smem2[tid] = fminf(smem2[tid], smem2[tid + 4]); }
if (blockSize >= 4) { smem2[tid] = fminf(smem2[tid], smem2[tid + 2]); }
if (blockSize >= 2) { smem2[tid] = fminf(smem2[tid], smem2[tid + 1]); }
}
//__syncthreads();//new
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
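The kernel uses the SharedMemory<T> helper from the SDK's sharedmem.cuh. If you don't have that header in your project, a minimal float-only stand-in looks roughly like this (a sketch; the real header provides a specialization for every primitive type):

template <class T>
struct SharedMemory;                 // only the specializations are usable

template <>
struct SharedMemory<float>
{
    __device__ float* getPointer()
    {
        // the dynamically sized shared array declared via the third kernel launch parameter
        extern __shared__ float s_float[];
        return s_float;
    }
};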
///////////////////
// Wrapper function for kernel launch
/////////////////////////////////////////////////////////////
///////////////////
template <class T>
T reduce_min(T *d_idata, int size)//T *d_odata, T *d_idata, int size)
{
const int maxThreads = 128;
const int maxBlocks = 128; // TODO: Test/Increase for future devices w/ more processors
    // pick a power-of-2 thread count (the templated switch below only handles powers of 2),
    // the largest one with threads*2 <= size so every thread's first two loads stay in range
    int threads = maxThreads;
    if (size < maxThreads*2)
    {
        threads = 1;
        while (threads*4 <= size)
            threads *= 2;
    }
    int blocks = size / (threads * 2);
    if (blocks < 1)
        blocks = 1;                  // size == 1 would otherwise launch zero blocks
    blocks = min(maxBlocks, blocks);
T* d_odata;
HANDLE_ERROR( cudaMalloc((void**) &d_odata, blocks*sizeof(T)) ) ;
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(blocks, 1, 1);
    // blocks of 32 threads or fewer read past sdata[blockSize - 1] in the warp-synchronous
    // stage, so give them two warps' worth of shared memory (as the SDK sample does)
    int smemSize = (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T);
switch (threads)
{
case 512:
reduce_min_kernel<T, 512><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
case 256:
reduce_min_kernel<T, 256><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
case 128:
reduce_min_kernel<T, 128><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;//goes here
case 64:
reduce_min_kernel<T, 64><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
case 32:
reduce_min_kernel<T, 32><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
case 16:
reduce_min_kernel<T, 16><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
case 8:
reduce_min_kernel<T, 8><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
case 4:
reduce_min_kernel<T, 4><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
case 2:
reduce_min_kernel<T, 2><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
case 1:
reduce_min_kernel<T, 1><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
default:
exit(1);
}
//
T* h_odata = new T[blocks];
CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_odata, blocks*sizeof(T), cudaMemcpyDeviceToHost) );
//CPrecisionTimer* timer = new CPrecisionTimer(); timer->Start();
T result = h_odata[0];
for( int i = 1; i < blocks; i++ ) {
result = min(result, h_odata[i]);
}
//double d = timer->Stop();
delete[] h_odata;
cudaFree(d_odata);
return result;
}
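A quick way to exercise the wrapper and sanity-check it against a CPU loop (the buffer names, the fill loop, and the assert below are placeholders of mine, not part of the original code):

#include <algorithm>
#include <cassert>
#include <cstdlib>

void test_reduce_min(int N)
{
    // fill a host buffer with pseudo-random floats
    float* h_data = new float[N];
    for (int i = 0; i < N; ++i)
        h_data[i] = static_cast<float>(rand()) / RAND_MAX;

    // copy to the device and run the GPU reduction
    float* d_data;
    HANDLE_ERROR( cudaMalloc((void**)&d_data, N * sizeof(float)) );
    HANDLE_ERROR( cudaMemcpy(d_data, h_data, N * sizeof(float), cudaMemcpyHostToDevice) );
    float gpuMin = reduce_min(d_data, N);

    // compare against a straightforward CPU pass; exact equality is fine because
    // both sides return one of the original elements unchanged
    float cpuMin = h_data[0];
    for (int i = 1; i < N; ++i)
        cpuMin = std::min(cpuMin, h_data[i]);
    assert(gpuMin == cpuMin);

    cudaFree(d_data);
    delete[] h_data;
}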