Since cuda doesn’t provide atomicMax for float, I made my own:

```
static inline __device__ float atomicMax(float *addr, float value) {
float old = *addr, assumed;
if (old >= value) return old;
do {
assumed = old;
old = atomicCAS((unsigned int *)addr, __float_as_int(assumed),
__float_as_int(value));
} while (old != assumed);
return old;
}
```

It works when I use this function for the block reduction. However, after some thoughts, I found this function should be like this:

```
static inline __device__ float atomicMax(float *addr, float value) {
float old = *addr, assumed;
if (old >= value) return old;
do {
assumed = old;
if (assumed > value) break;
old = atomicCAS((unsigned int *)addr, __float_as_int(assumed),
__float_as_int(value));
} while (old != assumed);
return old;
}
```

In the first version, after another block with a larger value modified the *addr, the block would still try to write its own value (if larger than the original *addr).

Why everything still goes normal in my first version code?

Here are the global function used to call the atomicMax:

```
template <typename T, ReduceType reduce_type>
__global__ void CudaReduceKernel(void *input_data, int lenth,
void *output_data) {
__shared__ T sdata[kBlockDim];
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= lenth) {
return;
}
sdata[threadIdx.x] = static_cast<T *>(input_data)[idx];
__syncthreads();
for (int s = 1; threadIdx.x + s < lenth && threadIdx.x + s < blockDim.x;
s *= 2) {
if (threadIdx.x % (2 * s) == 0) {
sdata[threadIdx.x] = CudaReduceOperate<T, reduce_type>(
sdata[threadIdx.x], sdata[threadIdx.x + s]);
}
__syncthreads();
}
if (threadIdx.x == 0) {
CudaReduceAtomicOperate<T, reduce_type>(static_cast<T *>(output_data),
sdata[0]);
}
return;
}
```

Thanks in advances!