Reduction to find minimum value (__shfl_down) using warp shuffle

I am writing a function which will find the minimum value in a 1D array, and the index at which that value was found, using CUDA.

I started by modifying the reduction code for finding the sum of values in a 1D array. The code works fine for the sum function, but I am not able to get it to work for finding the minimum.

The actual function is below, and in the test example the array size is 1024. I am using the shuffle-reduction part and think this is the culprit. The problem is that the output values in g_oIdxs (the index per block) and g_odata (the minimum value per block) are wrong compared to plain sequential CPU code.

Also, the values in g_odata are all zero (0) when I print them on the host.

Thanks in advance!

template<class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduceMin6(T *g_idata, int *g_idxs, T *g_odata, int *g_oIdxs, unsigned int n) {
	// Per-block argmin reduction: each block finds the minimum of its
	// grid-strided slice of g_idata and the matching index from g_idxs,
	// then writes one (value, index) pair per block:
	//   g_odata[blockIdx.x] = block minimum value
	//   g_oIdxs[blockIdx.x] = index of that minimum
	//
	// Launch requirements:
	//   - blockDim.x == blockSize (template parameter); blockSize >= 32 so
	//     the shuffle path below has a full resident warp.
	//   - dynamic shared memory: blockSize * (sizeof(T) + sizeof(int)) bytes.
	//
	// BUG FIX (root cause of the wrong/zero results): the original code took
	// the value array from SharedMemory<T>() AND the index array from
	// SharedMemory<int>().  Both calls return the base pointer of the SAME
	// dynamic shared-memory allocation, so sdata and sdataIdx aliased each
	// other and every write to one clobbered the other.  Instead, carve two
	// disjoint regions out of a single allocation (T values first, so the
	// int region's alignment is satisfied for any T of size >= 4).
	T *sdata = SharedMemory<T>();
	int *sdataIdx = (int *) (sdata + blockSize); // disjoint region after the T values

	// perform first level of reduction,
	// reading from global memory, writing to shared memory
	unsigned int tid = threadIdx.x;
	unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
	unsigned int gridSize = blockSize * 2 * gridDim.x;

	// NOTE(review): 99999 is a fragile identity element -- any input value
	// larger than this is silently ignored.  Prefer a true
	// numeric_limits<T>::max() identity if the toolchain allows; kept as-is
	// here to avoid introducing a new dependency.
	T myMin = 99999;
	int myMinIdx = -1;

	// Grid-stride accumulation: each thread folds multiple elements.  More
	// active blocks -> larger gridSize -> fewer elements per thread.
	while (i < n) {
		myMinIdx  = MIN_IDX(g_idata[i], myMin, g_idxs[i], myMinIdx);
		myMin = MIN(g_idata[i], myMin);

		// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
		if (nIsPow2 || i + blockSize < n){
			myMinIdx  = MIN_IDX(g_idata[i + blockSize], myMin, g_idxs[i + blockSize], myMinIdx);
			myMin = MIN(g_idata[i + blockSize], myMin);
		}

		i += gridSize;
	}

	// Publish each thread's partial (value, index) pair to shared memory.
	sdata[tid] = myMin;
	sdataIdx[tid] = myMinIdx;
	__syncthreads();

	// Tree reduction in shared memory down to 64 partials.  The barriers sit
	// outside the conditionals so every thread in the block reaches them.
	if ((blockSize >= 512) && (tid < 256)) {
		sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 256], myMin, sdataIdx[tid + 256], myMinIdx);
		sdata[tid] = myMin = MIN(sdata[tid + 256], myMin);
	}

	__syncthreads();

	if ((blockSize >= 256) && (tid < 128)) {
		sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 128], myMin, sdataIdx[tid + 128], myMinIdx);
		sdata[tid] = myMin = MIN(sdata[tid + 128], myMin);
	}

	__syncthreads();

	if ((blockSize >= 128) && (tid < 64)) {
		sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 64], myMin, sdataIdx[tid + 64], myMinIdx);
		sdata[tid] = myMin = MIN(sdata[tid + 64], myMin);
	}

	__syncthreads();

#if (__CUDA_ARCH__ >= 300 )
	if (tid < 32) {
		// Fold the second warp's partials into the first warp.
		if (blockSize >= 64){
			myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx);
			myMin = MIN(sdata[tid + 32], myMin);
		}
		// Final warp reduction via shuffle.
		// BUG FIX: the shuffled value was previously stored into a *float*
		// temporary, truncating T for non-float instantiations; use T.
		// Also use the *_sync intrinsics (required on Volta+ / CUDA 9+);
		// the full 0xffffffff mask is valid because the entire first warp
		// executes this branch (tid < 32, blockSize >= 32).
		for (int offset = warpSize / 2; offset > 0; offset /= 2) {
			T tempMyMin = __shfl_down_sync(0xffffffff, myMin, offset);
			int tempMyMinIdx = __shfl_down_sync(0xffffffff, myMinIdx, offset);
			myMinIdx = MIN_IDX(tempMyMin, myMin, tempMyMinIdx, myMinIdx);
			myMin = MIN(tempMyMin, myMin);
		}
	}
#else
	// Pre-Kepler fallback: fully unrolled reduction within a single warp,
	// with block-wide barriers between steps.
	if ((blockSize >= 64) && (tid < 32))
	{
		sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx);
		sdata[tid] = myMin = MIN(sdata[tid + 32], myMin);
	}

	__syncthreads();

	if ((blockSize >= 32) && (tid < 16))
	{
		sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 16], myMin, sdataIdx[tid + 16], myMinIdx);
		sdata[tid] = myMin = MIN(sdata[tid + 16], myMin);
	}

	__syncthreads();

	if ((blockSize >= 16) && (tid < 8))
	{
		sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 8], myMin, sdataIdx[tid + 8], myMinIdx);
		sdata[tid] = myMin = MIN(sdata[tid + 8], myMin);
	}

	__syncthreads();

	if ((blockSize >= 8) && (tid < 4))
	{
		sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 4], myMin, sdataIdx[tid + 4], myMinIdx);
		sdata[tid] = myMin = MIN(sdata[tid + 4], myMin);
	}

	__syncthreads();

	if ((blockSize >= 4) && (tid < 2))
	{
		sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 2], myMin, sdataIdx[tid + 2], myMinIdx);
		sdata[tid] = myMin = MIN(sdata[tid + 2], myMin);
	}

	__syncthreads();

	if ((blockSize >= 2) && ( tid < 1))
	{
		sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 1], myMin, sdataIdx[tid + 1], myMinIdx);
		sdata[tid] = myMin = MIN(sdata[tid + 1], myMin);
	}

	__syncthreads();
#endif

	// Thread 0 now holds the block-wide result in registers; write one
	// (value, index) pair per block to global memory.
	if (tid == 0){
		g_odata[blockIdx.x] = myMin;
		g_oIdxs[blockIdx.x] = myMinIdx;
	}
}

cross posted:

http://stackoverflow.com/questions/38176136/finding-minimum-value-in-array-and-its-index-using-cuda-shfl-down-function

If there’s no need for this to be a device function, thrust beat you to the punch.

Just give it an iterable range and it’ll return the iterator that matches your criteria.

Thank you for helping me! This was my first time asking in the community, so I had cross-posted the same question on Stack Overflow too…