I am writing a function which will find the minimum value and the index at which value was found a 1D array using CUDA.
I started by modifying the reduction code for finding sum of values in 1d array. The code work fine for sum function but I am not able to get it work for finding minimum.
Actual function is below and in the test example array size is 1024. So, I am using shuffle reduction part and think this is the culprit. Problem is the out put values in g_oIdxs (gives the index) per block, and g_odata (gives the minimum value) per block is wrong compared to plain sequential CPU code.
Also values in g_odata is all zero (0) when I print it in host.
Thanks in advance!
template<class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduceMin6(T *g_idata, int *g_idxs, T *g_odata, int *g_oIdxs, unsigned int n) {
T *sdata = SharedMemory<T>();
int *sdataIdx = SharedMemory<int>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
T myMin = 99999;
int myMinIdx = -1;
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n) {
myMinIdx = MIN_IDX(g_idata[i], myMin, g_idxs[i], myMinIdx);
myMin = MIN(g_idata[i], myMin);
// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
if (nIsPow2 || i + blockSize < n){
//myMin += g_idata[i + blockSize];
myMinIdx = MIN_IDX(g_idata[i + blockSize], myMin, g_idxs[i + blockSize], myMinIdx);
myMin = MIN(g_idata[i + blockSize], myMin);
}
i += gridSize;
}
// each thread puts its local sum into shared memory
sdata[tid] = myMin;
sdataIdx[tid] = myMinIdx;
__syncthreads();
// do reduction in shared mem
if ((blockSize >= 512) && (tid < 256)) {
//sdata[tid] = mySum = mySum + sdata[tid + 256];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 256], myMin, sdataIdx[tid + 256], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 256], myMin);
}
__syncthreads();
if ((blockSize >= 256) && (tid < 128)) {
//sdata[tid] = myMin = myMin + sdata[tid + 128];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 128], myMin, sdataIdx[tid + 128], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 128], myMin);
}
__syncthreads();
if ((blockSize >= 128) && (tid < 64)) {
//sdata[tid] = myMin = myMin + sdata[tid + 64];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 64], myMin, sdataIdx[tid + 64], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 64], myMin);
}
__syncthreads();
#if (__CUDA_ARCH__ >= 300 )
if (tid < 32) {
// Fetch final intermediate sum from 2nd warp
if (blockSize >= 64){
//myMin += sdata[tid + 32];
myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx);
myMin = MIN(sdata[tid + 32], myMin);
}
// Reduce final warp using shuffle
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
//myMin += __shfl_down(myMin, offset);
float tempMyMin = __shfl_down(myMin, offset);
int tempMyMinIdx = __shfl_down(myMinIdx, offset);
myMinIdx = MIN_IDX(tempMyMin, myMin, tempMyMinIdx , myMinIdx);
myMin = MIN(tempMyMin, myMin);
}
}
#else
// fully unroll reduction within a single warp
if ((blockSize >= 64) && (tid < 32))
{
//sdata[tid] = myMin = myMin + sdata[tid + 32];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 32], myMin);
}
__syncthreads();
if ((blockSize >= 32) && (tid < 16))
{
//sdata[tid] = myMin = myMin + sdata[tid + 16];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 16], myMin, sdataIdx[tid + 16], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 16], myMin);
}
__syncthreads();
if ((blockSize >= 16) && (tid < 8))
{
//sdata[tid] = myMin = myMin + sdata[tid + 8];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 8], myMin, sdataIdx[tid + 8], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 8], myMin);
}
__syncthreads();
if ((blockSize >= 8) && (tid < 4))
{
//sdata[tid] = myMin = myMin + sdata[tid + 4];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 4], myMin, sdataIdx[tid + 4], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 4], myMin);
}
__syncthreads();
if ((blockSize >= 4) && (tid < 2))
{
//sdata[tid] = myMin = myMin + sdata[tid + 2];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 2], myMin, sdataIdx[tid + 2], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 2], myMin);
}
__syncthreads();
if ((blockSize >= 2) && ( tid < 1))
{
//sdata[tid] = myMin = myMin + sdata[tid + 1];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 1], myMin, sdataIdx[tid + 1], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 1], myMin);
}
__syncthreads();
#endif
__syncthreads();
// write result for this block to global mem
if (tid == 0){
g_odata[blockIdx.x] = myMin;
g_oIdxs[blockIdx.x] = myMinIdx;
}
}