Min/Max functions missing?

Hi,

Table A-1 (pages 63-64) of the CUDA Programming Guide 0.8.2 mentions the built-in functions Min and Max. When I try to use these functions, I get a compiler error saying they cannot be found. All the other math functions seem to work fine… are these functions currently missing?

Try min and max instead of Min and Max.

Yeah I tried that… didn’t work though.

Funny, I found that there were a whole lot of overloaded functions for min & max, but the compiler was confused about which one to use, so I had to provide the usual macro defines to get my code to compile.
Eric
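
(The "usual macro defines" being, presumably, something along these lines -- a fallback sketch, not anything from NVIDIA's headers:

#ifndef min
#define min(a,b) ((a) < (b) ? (a) : (b))
#endif
#ifndef max
#define max(a,b) ((a) > (b) ? (a) : (b))
#endif

The macros shadow the whole overload set, which sidesteps the ambiguity.)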

Which OS?

Have you tried minf and maxf?

Well, I checked again and lowercase seems to be working now :). I probably forgot to recompile my kernel file the first time I tried it O:). Anyway, the capitalization seems to be a small error in the documentation.
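
For anyone who lands here later, a minimal sketch that should compile in device code (the kernel name and buffers are just placeholders):

__global__ void minmax_test(float *out, const float *a, const float *b)
{
    int i = threadIdx.x;
    out[i] = min(a[i], b[i]);      // lowercase overloaded device built-in
    // out[i] = fminf(a[i], b[i]); // the float-specific variant also works
}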

Also, always make sure you suppress the windows.h min and max macros by defining NOMINMAX; they always tend to get in the way.
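
For example, the define has to appear before the first windows.h include:

#define NOMINMAX      // stops windows.h from defining min/max macros
#include <windows.h>
#include <algorithm>  // std::min / std::max now resolve normally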

As mentioned above, the original code had a bug and was missing blockSize*2. Unfortunately, it also suffers from the Fermi incompatibility mentioned in the tuning guide, and it crashes if the array length is not a power of 2. The NVIDIA reduction sample shows the correct way to do it. Here is the code, tweaked to work correctly on Fermi and with any length:

////////////////////////////

/*
    This version adds multiple elements per thread sequentially.  This reduces the overall
    cost of the algorithm while keeping the work complexity O(n) and the step complexity
    O(log n).  (Brent's Theorem optimization)
*/
template <class T, unsigned int blockSize>
__global__ void reduce_min_kernel(T *g_odata, T *g_idata, unsigned int n)
{
    // SharedMemory<T> comes from the SDK's sharedmem.cuh (see note below)
    SharedMemory<T> smem;
    T *sdata = smem.getPointer();

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;
    unsigned int gridSize = blockSize*2*gridDim.x;

    // We reduce multiple elements per thread.  The number is determined by the
    // number of active thread blocks (via gridSize).  More blocks will result
    // in a larger gridSize and therefore fewer elements per thread.
    // The wrapper launches floor(n / (blockSize*2)) blocks, so this first
    // pair of reads is always in bounds.  Note that fminf assumes T is
    // float; substitute a suitable min for other types.
    T thmin = fminf(g_idata[i], g_idata[i + blockSize]);
    i += gridSize;
    while (i < n)
    {
        thmin = fminf(thmin, g_idata[i]);       // g_idata[i] is always valid here
        if (i + blockSize < n)                  // the partner element may be past the end
            thmin = fminf(thmin, g_idata[i + blockSize]);
        i += gridSize;
    }
    sdata[tid] = thmin;
    __syncthreads();

    // do reduction in shared mem
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] = fminf(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] = fminf(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] = fminf(sdata[tid], sdata[tid +  64]); } __syncthreads(); }

    if (tid < 32)
    {
        // a volatile pointer keeps the compiler from caching shared-memory
        // values in registers -- this is the Fermi fix from the tuning guide
        volatile T* smem2 = sdata;
        if (blockSize >=  64) { smem2[tid] = fminf(smem2[tid], smem2[tid + 32]); }
        if (blockSize >=  32) { smem2[tid] = fminf(smem2[tid], smem2[tid + 16]); }
        if (blockSize >=  16) { smem2[tid] = fminf(smem2[tid], smem2[tid +  8]); }
        if (blockSize >=   8) { smem2[tid] = fminf(smem2[tid], smem2[tid +  4]); }
        if (blockSize >=   4) { smem2[tid] = fminf(smem2[tid], smem2[tid +  2]); }
        if (blockSize >=   2) { smem2[tid] = fminf(smem2[tid], smem2[tid +  1]); }
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

///////////////////
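
A note on the SharedMemory<T> helper used in the kernel: it comes from the sharedmem.cuh header that ships with the SDK reduction sample. Roughly, it looks like this (a sketch of just the float specialization; the real header covers the other types the same way):

template <class T> struct SharedMemory;   // only specializations are defined

template <> struct SharedMemory<float>
{
    __device__ float* getPointer()
    {
        extern __shared__ float s_float[]; // sized by the launch's smemSize argument
        return s_float;
    }
};

The per-type specializations are needed because nvcc rejects a templated extern __shared__ array (two instantiations would declare the same symbol with different types).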

// Wrapper function for kernel launch

///////////////////

template <class T>
T reduce_min(T *d_idata, int size)
{
  const int maxThreads = 128;
  const int maxBlocks  = 128; // TODO: Test/Increase for future devices w/ more processors

  if (size == 1) {
    // nothing to reduce: just copy the single element back
    T result;
    HANDLE_ERROR( cudaMemcpy(&result, d_idata, sizeof(T), cudaMemcpyDeviceToHost) );
    return result;
  }

  // Round the thread count down to a power of two so the switch below always
  // finds a matching kernel instantiation, even for odd array lengths.
  int threads = maxThreads;
  if (size < maxThreads*2) {
    threads = 1;
    while (threads*2 <= size/2) threads *= 2;  // largest power of two <= size/2
  }

  // Floor division: the grid-stride loop in the kernel picks up the tail.
  // std::min (from <algorithm>) avoids the windows.h macro clash mentioned above.
  int blocks = size / (threads * 2);
  blocks = std::min(maxBlocks, blocks);

  T* d_odata;
  HANDLE_ERROR( cudaMalloc((void**) &d_odata, blocks*sizeof(T)) );

  dim3 dimBlock(threads, 1, 1);
  dim3 dimGrid(blocks, 1, 1);
  int smemSize = threads * sizeof(T);

  switch (threads)
  {
	  case 512:
		  reduce_min_kernel<T, 512><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
	  case 256:
		  reduce_min_kernel<T, 256><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
	  case 128:
		  reduce_min_kernel<T, 128><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
	  case 64:
		  reduce_min_kernel<T,  64><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
	  case 32:
		  reduce_min_kernel<T,  32><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
	  case 16:
		  reduce_min_kernel<T,  16><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
	  case  8:
		  reduce_min_kernel<T,   8><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
	  case  4:
		  reduce_min_kernel<T,   4><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
	  case  2:
		  reduce_min_kernel<T,   2><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
	  case  1:
		  reduce_min_kernel<T,   1><<< dimGrid, dimBlock, smemSize >>>(d_odata, d_idata, size); break;
	  default:
		  exit(1); // unreachable: threads is always one of the powers of two above
  }

  // copy the per-block partial minima back and finish the reduction on the CPU
  T* h_odata = new T[blocks];
  HANDLE_ERROR( cudaMemcpy( h_odata, d_odata, blocks*sizeof(T), cudaMemcpyDeviceToHost) );

  T result = h_odata[0];
  for( int i = 1; i < blocks; i++ ) {
    result = std::min(result, h_odata[i]);
  }

  delete[] h_odata;
  cudaFree(d_odata);
  return result;
}
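
For completeness, a hypothetical call site (the size and data are made up; HANDLE_ERROR is the usual error-checking macro used above):

int n = 1000;
float *h_data = new float[n];
for (int i = 0; i < n; i++) h_data[i] = (float)(n - i);  // minimum is 1.0f

float *d_data;
HANDLE_ERROR( cudaMalloc((void**) &d_data, n*sizeof(float)) );
HANDLE_ERROR( cudaMemcpy(d_data, h_data, n*sizeof(float), cudaMemcpyHostToDevice) );

float lo = reduce_min(d_data, n);   // instantiates reduce_min<float>
printf("min = %f\n", lo);           // expect 1.000000

cudaFree(d_data);
delete[] h_data;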