Cuda atomicMax for float

Since CUDA doesn’t provide atomicMax for float, I made my own:

// First (broken) attempt at a float atomicMax built on atomicCAS.
// Known defects, identified later in this thread:
//  * If another thread installs a larger value between the initial read
//    and the CAS, this loop retries with that value as `assumed` and can
//    overwrite the larger value with a smaller one (see the discussion
//    following this post).
//  * atomicCAS returns unsigned int, which is assigned to the float `old`
//    by numeric conversion — not bit reinterpretation — so the
//    `old != assumed` comparison does not compare the intended values;
//    a __int_as_float around the atomicCAS result is missing.
static inline __device__ float atomicMax(float *addr, float value) {
  float old = *addr, assumed;
  // Non-atomic fast path: current value already at least as large.
  if (old >= value) return old;
  do {
    assumed = old;
    // Try to install `value` if *addr still holds `assumed`.
    old = atomicCAS((unsigned int *)addr, __float_as_int(assumed),
                    __float_as_int(value));

  } while (old != assumed);

  return old;
}

It works when I use this function for the block reduction. However, after some thought, I found this function should instead be like this:

// Second attempt: bail out of the retry loop once the observed value
// already exceeds `value`, so a larger value written concurrently by
// another thread is not overwritten.
// Still flawed (as the author notes later in the thread): atomicCAS
// returns unsigned int, which is converted numerically — not
// bit-reinterpreted — into the float `old`, so both `assumed > value`
// and `old != assumed` compare the wrong quantities; __int_as_float is
// missing around the atomicCAS result.
static inline __device__ float atomicMax(float *addr, float value) {
  float old = *addr, assumed;
  if (old >= value) return old;
  do {
    assumed = old;
    // Give up if another thread already stored something bigger.
    if (assumed > value) break;
    old = atomicCAS((unsigned int *)addr, __float_as_int(assumed),
                    __float_as_int(value));

  } while (old != assumed);

  return old;
}

In the first version, after another block with a larger value has modified *addr, this block will still try to write its own value (as long as it is larger than the original *addr).
Why does everything still work correctly with my first-version code?

Here are the global function used to call the atomicMax:

template <typename T, ReduceType reduce_type>
__global__ void CudaReduceKernel(void *input_data, int lenth,
                                 void *output_data) {
  // Block-level tree reduction: each block reduces its slice of
  // input_data[0..lenth) into sdata[0], then thread 0 folds that partial
  // result into *output_data via the reduce_type atomic operation.
  //
  // Fixes vs. the original:
  //  * No early `return` for out-of-range threads: a thread that exits
  //    before __syncthreads() leaves the rest of the block at a barrier
  //    not all threads reach (undefined behavior whenever lenth is not a
  //    multiple of blockDim.x). Out-of-range threads now stay for the
  //    barriers but skip the loads/combines.
  //  * The loop bound no longer depends on threadIdx.x or lenth (the
  //    original condition `threadIdx.x + s < lenth` made threads leave
  //    the loop at different iterations — skipping barriers — and
  //    compared a block-local index against the global length, which is
  //    wrong for every block but block 0). The partner bounds check now
  //    uses the global index, idx + s.
  __shared__ T sdata[kBlockDim];
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const bool in_range = idx < lenth;
  if (in_range) {
    sdata[threadIdx.x] = static_cast<T *>(input_data)[idx];
  }
  __syncthreads();

  // Interleaved tree reduction. Every thread executes every iteration,
  // so all of them reach each __syncthreads().
  for (int s = 1; s < blockDim.x; s *= 2) {
    if (in_range && threadIdx.x % (2 * s) == 0 &&
        threadIdx.x + s < blockDim.x && idx + s < lenth) {
      sdata[threadIdx.x] = CudaReduceOperate<T, reduce_type>(
          sdata[threadIdx.x], sdata[threadIdx.x + s]);
    }
    __syncthreads();
  }

  // One atomic per block. Guarded by in_range so a block that covers no
  // valid element never publishes an uninitialized sdata[0].
  if (threadIdx.x == 0 && in_range) {
    CudaReduceAtomicOperate<T, reduce_type>(static_cast<T *>(output_data),
                                            sdata[0]);
  }
}

Thanks in advance!

Hi @code1011,
Please note, this forum branch is dedicated to CUDA-GDB tool support. Your question might be better suited for CUDA Programming and Performance - NVIDIA Developer Forums branch.

I have moved your topic there.

Thanks! I was confused with these categories before.

Your question is unclear. It seems you have already pointed out that your first version is broken — that there are some scenarios that don’t work correctly with the first version of the code.

I don’t know why “everything still goes normal in my first version code”.

Also, this may be of interest.

Thanks for the reply.
What confused me is that, although I know the first version is broken, it outputs the same correct results as my second version (the good one). I am looking for help constructing a scenario in which the first version produces an incorrect result.
I have tried to construct some scenarios, hoping to make the first version go error.

  // Experiment: body of the broken atomicMax with an artificial delay
  // inserted before the CAS, trying to force the lost-update
  // interleaving (threads carrying smaller values sleep longer, so a
  // large value should land first and then be clobbered by a late,
  // smaller writer).
  float old = *addr, assumed;
  if (old >= value) return old;
  do {
    assumed = old;
    if (assumed > value) break;

    // Sleep roughly inversely proportional to `value`.
    // NOTE(review): __nanosleep requires SM70+ — confirm target arch.
    __nanosleep(static_cast<unsigned>(1 / value * 100000));
    old = atomicCAS((unsigned int *)addr, __float_as_int(assumed),
                    __float_as_int(value));
  } while (old != assumed);

  return old;
In my expectation, the code will work like this (for example):
The old is 0 initially. Thread A with value 0.3 tries to write, but the __nanosleep makes it sleep for a long time. Meanwhile, thread B with 0.9 writes successfully (1/0.9 is smaller than 1/0.3, so it sleeps for a shorter time). Thread A then wakes up and proceeds to write 0.3. So we get 0.3 as the output, which is not the correct result (the maximum should be 0.9).

The link you provided is quite useful. I saw it before and did some coding. I know how can let float atomicMax works right. I just can’t figure out the problem above: why a wrong code still work.

It doesn’t seem to be that hard to make it break:

$ cat t1920.cu
#include <cstdio>
// Verbatim copy of the poster's first (broken) atomicMax — see the
// discussion above for its two defects (a larger concurrent value can be
// overwritten, and the atomicCAS result is numerically converted to
// float instead of bit-reinterpreted).
static inline __device__ float atomicMax(float *addr, float value) {
  float old = *addr, assumed;
  if (old >= value) return old;
  do {
    assumed = old;
    old = atomicCAS((unsigned int *)addr, __float_as_int(assumed),
                    __float_as_int(value));

  } while (old != assumed);

  return old;
}
// Global accumulator; every thread of the single warp competes to
// install its value with the custom atomicMax.
__device__ float x = 0.0f;
__global__ void k(){

  // Thread i contributes i * 0.01f, so the correct maximum over 32
  // threads is 0.31.
  float my_val = (threadIdx.x)*0.01f;
  atomicMax(&x, my_val);
  __syncthreads();
  if (threadIdx.x == 0) printf("%f\n", x);
}

int main(){

  // One block of 32 threads — a single warp.
  k<<<1,32>>>();
  cudaDeviceSynchronize();
}

$ nvcc -o t1920 t1920.cu
$ ./t1920
0.010000
$

CUDA 11.4, CentOS 7, Tesla V100, driver 470.57.02

The correct output would be 0.31. For example, using one of the realizations from the article I linked:

$ cat t1920.cu
#include <cstdio>
#if 0
static inline __device__ float atomicMax(float *addr, float value) {
  float old = *addr, assumed;
  if (old >= value) return old;
  do {
    assumed = old;
    old = atomicCAS((unsigned int *)addr, __float_as_int(assumed),
                    __float_as_int(value));

  } while (old != assumed);

  return old;
}
#endif
// Working float atomicMax (one of the implementations from the linked
// article): instead of blindly storing `val`, each CAS attempt stores
// fmaxf(val, current value), so a larger value installed concurrently by
// another thread can never be replaced by a smaller one. The retry test
// compares the raw int bits for equality, which is exact.
__device__ static float atomicMax(float* address, float val)
{
    int* address_as_i = (int*) address;
    int old = *address_as_i, assumed;
    do {
        assumed = old;
        old = ::atomicCAS(address_as_i, assumed,
            __float_as_int(::fmaxf(val, __int_as_float(assumed))));
    } while (assumed != old);
    return __int_as_float(old);
}
// Same test kernel as before, now resolving to the correct atomicMax.
__device__ float x = 0.0f;
__global__ void k(){

  // Correct answer for 32 threads is 0.31.
  float my_val = (threadIdx.x)*0.01f;
  atomicMax(&x, my_val);
  __syncthreads();
  if (threadIdx.x == 0) printf("%f\n", x);
}

int main(){

  // Single warp, as before.
  k<<<1,32>>>();
  cudaDeviceSynchronize();
}

$ nvcc -o t1920 t1920.cu
$ ./t1920
0.310000
$

In my first example (that produces the wrong answer) if you replace this:

float my_val = (threadIdx.x)*0.01f;

with this:

float my_val = (31-threadIdx.x)*0.01f;

it will produce the correct answer. As you’ve already pointed out, your broken example should be sensitive to ordering. I won’t be able to get into discussions of ordering of operations amongst threads in the same warp; that is unspecified.

I tried your code. It works for me. Thanks!
I provided another implementation here, just in case someone need that:

// Another implementation posted by the OP: returns void, and re-checks
// the current value (as a float) before every CAS attempt, bailing out
// as soon as *addr already holds something >= val — so a larger
// concurrent value is never clobbered. The retry test compares the raw
// unsigned bits for equality, which is exact.
// NOTE(review): like the other bit-cast variants in this thread, NaN
// operands are not handled — confirm inputs are NaN-free.
__device__ void atomicMax(float *const addr, const float val) {
  // Non-atomic fast path.
  if (*addr >= val) return;

  unsigned int *const addr_as_ui = (unsigned int *)addr;
  unsigned int old = *addr_as_ui, assumed;
  do {
    assumed = old;
    // Another thread already stored a value at least as large — done.
    if (__uint_as_float(assumed) >= val) break;
    old = atomicCAS(addr_as_ui, assumed, __float_as_uint(val));
  } while (assumed != old);
}

Notice my second version is still wrong. atomicCAS returns an int, so there should be a __int_as_float in front of it, since its result is compared with a float.

The following works for a single warp. And only if your use case can guarantee the hardware is CUDA.

Okay, it looks insane, but bear with me.

    int i = threadIdx.x;

    // One shared maximum, racily updated by every lane of the warp.
    // (Transcription fixes only: the post dropped the ';' after the
    // declaration, and the stray '0' suffixes on 'smax0'/'scores0' are
    // forum-mangled references to 'smax'/'scores' — the surrounding
    // prose consistently uses smax and scores[i].)
    __shared__ float smax;
    float *scores; // ... the 32 per-thread values
    smax = 0;

    __syncthreads();

    // Keep publishing my value while it beats the current shared max.
    // Relies on single-warp behavior; the author notes this is unsafe
    // across warps.
    while (smax < scores[i]) smax = scores[i];

    __syncthreads();

For 32 random values, this rather surprisingly averages 4 iterations (instead of log2(32)==5). It’s an interesting puzzle to see why.

I’m actually using this, tested on lots of data, and so far it hasn’t failed.

Explanation:

The usual response is “write collisions are bad, so don’t do that”.

Write collisions are write collisions. If you don’t assume they won’t happen, and you account for them precisely, then you are fine. Write collisions have well-defined behavior: one of the writes will succeed. The rest will fail. It’s that simple. For something like an Add reduction, you really need to use the usual reduction pattern. But what about max?

Each iteration, we expect write collisions. If the write succeeds, the next comparison will exit the loop. If the write fails due to a collision, or it gets overwritten by another thread, there are two possibilities: the other value is less than the current thread’s scores[i], in which case the while loop continues, or the other value is greater or equal, in which case we don’t mind exiting the loop.

This is problematic for multiple warps because one thread may swap out after the compare but before the write. If another thread does the compare, the write, another compare and exits the loop before the first one resumes, then we have a problem. In other words, write collisions are not a problem at all, but synchronization between warps is.

If you have more than 32 values in your array, you’d need an array of per-warp maxima, have each warp fill in its own entry, and then apply the miracle max on that array. Add another level for every power of 32.

Aside from generalities about how we’re supposed to do reductions, does anyone know an actual sequence of events that this will fail?

Floating-point types have their bits laid out in such a way that you can perform just a bit comparison, just as you would do with integers. (Assuming you don’t have NaN, Inf or denormalized values — not sure about these; also negative zero might be a problem.)

So it should be enough to reinterpret_cast the float/double to an int/long long (signed, because the sign bit is 1 when the value is negative), and then just use the integer version of the atomicMax function which CUDA already provides. Same with atomicMin.

did not check if it works, but this is the idea:

// Sketch: reinterpret the float bits as int and reuse the hardware
// integer atomicMax.
// NOTE: this is only correct when both operands are non-negative (int is
// two's complement while float is sign-magnitude), and it mishandles NaN
// and negative zero — the follow-up replies explain the details.
__device__ float atomicMax(float * addr, float val)
{
    int * addr_int = reinterpret_cast<int*>(addr);
    int * val_int = reinterpret_cast<int*>(&val);
    // Bug fix: the function is declared to return float, but the
    // original discarded atomicMax's result and fell off the end
    // (undefined behavior). Return the previous value, reinterpreted
    // back to float.
    int old = atomicMax(addr_int, *val_int);
    return __int_as_float(old);
}

Things are not quite that simple, because int uses two’s complement representation while float uses sign-magnitude representation. E.g. max (-2.0f, -4.0f) performed by casting to int and using an int-based max() computation returns -4.0f.

(1) If both float operands have a sign bit of 0, the int-based max() works fine for zero, subnormals, normals, and infinity. It does not work correctly if either operand is NaN.

(2) If the sign bits of the two float operands differ, the int-based max() computation works correctly for subnormals, normals, and infinities. It may fail for operands that are zeros (once cast to int positive and negative zero have two distinct values, but their value is equal in float), and it does not work correctly if either operand is NaN.

(3) If both float operands have the sign bit set, one has to reverse use of min/max on the operands cast to int, as demonstrated by the example I gave earlier.