Cuda atomicMax for float

Since cuda doesn’t provide atomicMax for float, I made my own:

static inline __device__ float atomicMax(float *addr, float value) {
  float old = *addr, assumed;
  if (old >= value) return old;
  do {
    assumed = old;
    old = atomicCAS((unsigned int *)addr, __float_as_int(assumed),
                    __float_as_int(value));

  } while (old != assumed);

  return old;
}

It works when I use this function for the block reduction. However, after some thoughts, I found this function should be like this:

static inline __device__ float atomicMax(float *addr, float value) {
  float old = *addr, assumed;
  if (old >= value) return old;
  do {
    assumed = old;
    if (assumed > value) break;
    old = atomicCAS((unsigned int *)addr, __float_as_int(assumed),
                    __float_as_int(value));

  } while (old != assumed);

  return old;
}

In the first version, after another block with a larger value modified the *addr, the block would still try to write its own value (if larger than the original *addr).
Why everything still goes normal in my first version code?

Here are the global function used to call the atomicMax:

template <typename T, ReduceType reduce_type>
__global__ void CudaReduceKernel(void *input_data, int lenth,
                                 void *output_data) {
  __shared__ T sdata[kBlockDim];
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= lenth) {
    return;
  }
  sdata[threadIdx.x] = static_cast<T *>(input_data)[idx];
  __syncthreads();

  for (int s = 1; threadIdx.x + s < lenth && threadIdx.x + s < blockDim.x;
       s *= 2) {
    if (threadIdx.x % (2 * s) == 0) {
      sdata[threadIdx.x] = CudaReduceOperate<T, reduce_type>(
          sdata[threadIdx.x], sdata[threadIdx.x + s]);
    }
    __syncthreads();
  }

  if (threadIdx.x == 0) {
    CudaReduceAtomicOperate<T, reduce_type>(static_cast<T *>(output_data),
                                            sdata[0]);
  }
  return;
}

Thanks in advances!

Hi @code1011,
Please note, this forum branch is dedicated to CUDA-GDB tool support. Your question might be better suited for CUDA Programming and Performance - NVIDIA Developer Forums branch.

I have moved your topic there.

Thanks! I was confused with these categories before.

Your question is unclear. It seems you have already pointed out that your first version is broken. it seems you have already identified that there are some scenarios that don’t work correctly with the first version of code.

I don’t know why “everything still goes normal in my first version code”.

Also, this may be of interest.

Thanks for the reply.
What confused me is that I know the first version is broken, however, it can output the correct results the same as my second version (the good one). I am here to search for help to construct a scenario that the first version cannot output the correct results.
I have tried to construct some scenarios, hoping to make the first version go error.

  float old = *addr, assumed;
  if (old >= value) return old;
  do {
    assumed = old;
    if (assumed > value) break;

    __nanosleep(static_cast<unsigned>(1 / value * 100000));
    old = atomicCAS((unsigned int *)addr, __float_as_int(assumed),
                    __float_as_int(value));
  } while (old != assumed);

  return old;

In my expectation, the code will work like this (for example):
The old is 0 initially. Thread A with value 0.3 tried to write in, but the _nanosleep let it sleep long. In the meanwhile, thread B with 0.9 successfully writes (1/0.9 is smaller than 1/0.3, it would sleep short). Thread A wake up and continue to write 0.3. So we can get 0.3 as output, which is not the corrent result (the maximum should be 0.9).

The link you provided is quite useful. I saw it before and did some coding. I know how can let float atomicMax works right. I just can’t figure out the problem above: why a wrong code still work.

It doesn’t seem to be that hard to make it break:

$ cat t1920.cu
#include <cstdio>
static inline __device__ float atomicMax(float *addr, float value) {
  float old = *addr, assumed;
  if (old >= value) return old;
  do {
    assumed = old;
    old = atomicCAS((unsigned int *)addr, __float_as_int(assumed),
                    __float_as_int(value));

  } while (old != assumed);

  return old;
}
__device__ float x = 0.0f;
__global__ void k(){

  float my_val = (threadIdx.x)*0.01f;
  atomicMax(&x, my_val);
  __syncthreads();
  if (threadIdx.x == 0) printf("%f\n", x);
}

int main(){

  k<<<1,32>>>();
  cudaDeviceSynchronize();
}

$ nvcc -o t1920 t1920.cu
$ ./t1920
0.010000
$

CUDA 11.4, CentOS 7, Tesla V100, driver 470.57.02

The correct output would be 0.31. For example, using one of the realizations from the article I linked:

$ cat t1920.cu
#include <cstdio>
#if 0
static inline __device__ float atomicMax(float *addr, float value) {
  float old = *addr, assumed;
  if (old >= value) return old;
  do {
    assumed = old;
    old = atomicCAS((unsigned int *)addr, __float_as_int(assumed),
                    __float_as_int(value));

  } while (old != assumed);

  return old;
}
#endif
__device__ static float atomicMax(float* address, float val)
{
    int* address_as_i = (int*) address;
    int old = *address_as_i, assumed;
    do {
        assumed = old;
        old = ::atomicCAS(address_as_i, assumed,
            __float_as_int(::fmaxf(val, __int_as_float(assumed))));
    } while (assumed != old);
    return __int_as_float(old);
}
__device__ float x = 0.0f;
__global__ void k(){

  float my_val = (threadIdx.x)*0.01f;
  atomicMax(&x, my_val);
  __syncthreads();
  if (threadIdx.x == 0) printf("%f\n", x);
}

int main(){

  k<<<1,32>>>();
  cudaDeviceSynchronize();
}

$ nvcc -o t1920 t1920.cu
$ ./t1920
0.310000
$

In my first example (that produces the wrong answer) if you replace this:

float my_val = (threadIdx.x)*0.01f;

with this:

float my_val = (31-threadIdx.x)*0.01f;

it will produce the correct answer. As you’ve already pointed out, your broken example should be sensitive to ordering. I won’t be able to get into discussions of ordering of operations amongst threads in the same warp; that is unspecified.

I tried your code. It works for me. Thanks!
I provided another implementation here, just in case someone need that:

__device__ void atomicMax(float *const addr, const float val) {
  if (*addr >= val) return;

  unsigned int *const addr_as_ui = (unsigned int *)addr;
  unsigned int old = *addr_as_ui, assumed;
  do {
    assumed = old;
    if (__uint_as_float(assumed) >= val) break;
    old = atomicCAS(addr_as_ui, assumed, __float_as_uint(val));
  } while (assumed != old);
}

Notice my second version is still wrong. The atomicCAS return int type, should be a __int_as_float in the front of it since it will compare with a float type.