half calculation generates incorrect result

Hi,

I have code that performs a reduce-sum operation. The input shape is (2,3,4) and the reduction axis is -1. The code works for the float dtype, but when I replaced float with half, the output became incorrect. Could someone help me understand why it is wrong and how to fix it?
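For reference, the operation is equivalent to this plain CPU reduction over the last axis (a sketch for checking only; reduce_sum_ref is an illustrative name, with the (2,3) outer dims flattened to 6 rows):

void reduce_sum_ref(const float *in, float *out) {
  for (int r = 0; r < 2 * 3; r++) {                    // one output per (i,j) pair
    float acc = 0.0f;
    for (int c = 0; c < 4; c++) acc += in[r * 4 + c];  // sum over axis -1
    out[r] = acc;
  }
}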

The code is attached here:

extern "C" __global__ void fused_sum_3_kernel0( half* __restrict__ placeholder,  half* __restrict__ placeholder_red) {
   half placeholder_red_rf[1];
  __shared__ half red_buf0[1024];
  placeholder_red_rf[0] = __float2half_rn(0.000000e+00f);
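  // threads with y < 6 and x < 4 each load one element: 6 outputs, 4 inputs apiece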
  if (((int)threadIdx.y) < 6) {
    if (((int)threadIdx.x) < 4) {
      placeholder_red_rf[0] = (placeholder_red_rf[0] + placeholder[((((int)threadIdx.y) * 4) + ((int)threadIdx.x))]);
    }
  }
  __syncthreads();
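  // stage each thread's partial into shared memory, one 32-wide row per y
  // (the repeated "threadIdx.y < 6" tests below are redundant codegen artifacts)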
  ((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((((((int)threadIdx.y) < 6) && (((int)threadIdx.y) < 6)) && (((int)threadIdx.y) < 6)) && (((int)threadIdx.y) < 6)) ? placeholder_red_rf[0] : __float2half_rn(0.000000e+00f));
  __syncthreads();
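  // tree-reduce each 32-wide row: lanes x < 16 fold in strides 16, 8, 4, 2, 1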
  if (((int)threadIdx.x) < 16) {
    ((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 16)]);
    ((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 8)]);
    ((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 4)]);
    ((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 2)]);
    ((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 1)]);
  }
  __syncthreads();
  if (((int)threadIdx.y) < 6) {
    if (((int)threadIdx.x) == 0) {
      placeholder_red[((int)threadIdx.y)] = ((volatile __shared__ half*)red_buf0)[(((int)threadIdx.y) * 32)];
    }
  }
}

With an all-ones input, the result should be:
array([[[4.],
        [4.],
        [4.]],…

but I got:
x: array([[[3.],
           [3.],
           [3.]],…

My test of your code produces the expected output of 4 for all six elements:

$ cat t1538.cu
#include <cuda_fp16.h>
#include <iostream>

#ifndef USE_FLOAT
typedef half ft;
#else
typedef float ft;
#endif
__global__ void fused_sum_3_kernel0( ft* __restrict__ placeholder,  ft* __restrict__ placeholder_red) {
   ft placeholder_red_rf[1];
  __shared__ ft red_buf0[1024];
#ifndef USE_FLOAT
  const half my_zero = __float2half_rn(0.000000e+00f);
#else
  const float my_zero = 0.0f;
#endif
  placeholder_red_rf[0] = my_zero;
  if (((int)threadIdx.y) < 6) {
    if (((int)threadIdx.x) < 4) {
      placeholder_red_rf[0] = (placeholder_red_rf[0] + placeholder[((((int)threadIdx.y) * 4) + ((int)threadIdx.x))]);
    }
  }
  __syncthreads();
  ((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((((((int)threadIdx.y) < 6) && (((int)threadIdx.y) < 6)) && (((int)threadIdx.y) < 6)) && (((int)threadIdx.y) < 6)) ? placeholder_red_rf[0] : my_zero);
  __syncthreads();
  if (((int)threadIdx.x) < 16) {
    ((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 16)]));
    ((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 8)]));
    ((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 4)]));
    ((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 2)]));
    ((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 1)]));
  }
  __syncthreads();
  if (((int)threadIdx.y) < 6) {
    if (((int)threadIdx.x) == 0) {
      placeholder_red[((int)threadIdx.y)] = ((volatile __shared__ ft*)red_buf0)[(((int)threadIdx.y) * 32)];
    }
  }
}

int main(){
 const int ds = 1024;
 ft *h_i, *h_o, *d_i, *d_o;
 h_i = (ft *)malloc(ds * sizeof(ft));
 h_o = (ft *)malloc(ds * sizeof(ft));
 cudaMalloc(&d_i, ds * sizeof(ft));
 cudaMalloc(&d_o, ds * sizeof(ft));
 for (int i = 0; i < ds; i++) h_i[i] = (ft)1.0f;
 cudaMemset(d_o, 0, ds*sizeof(ft));
 cudaMemcpy(d_i, h_i, ds*sizeof(ft), cudaMemcpyHostToDevice);
 fused_sum_3_kernel0<<<1, dim3(32,32)>>>(d_i, d_o);
 cudaMemcpy(h_o, d_o, ds*sizeof(ft), cudaMemcpyDeviceToHost);
 for (int i = 0; i < 32; i++){
   for (int j = 0; j < 32; j++) std::cout << (float)h_o[i*32+j] << " ";
   std::cout << std::endl;}
 return 0;
}
$ nvcc -o t1538 t1538.cu -arch=sm_70
t1538.cu(24): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(36): warning: attribute "__shared__" does not apply here
t1538.cu(24): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(36): warning: attribute "__shared__" does not apply here

$ ./t1538
4 4 4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
$

I had to make some changes to get it to compile (a typedef for the element type, a hoisted zero constant, and explicit casts around the volatile half arithmetic), but nothing that should affect functionality.
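Incidentally, the "attribute __shared__ does not apply here" warnings are benign: __shared__ is a storage-class qualifier and has no meaning inside a cast, so nvcc ignores it (which is what the warning says). If you want a clean build, dropping it from the casts behaves identically; for example, the first line of the sweep becomes:

((volatile ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 16)]));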

I’m guessing you are calling this from something like CuPy, but that’s just a guess. My suspicion is that the problem lies elsewhere in your code, perhaps in the interface to this function. I also assume you are compiling for a Pascal or newer architecture; half arithmetic requires compute capability 5.3 or higher, so the target matters.
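If it helps track this down, wrapping the launch with runtime error checks will rule out silent launch or execution failures (a minimal sketch using the standard CUDA runtime API):

fused_sum_3_kernel0<<<1, dim3(32,32)>>>(d_i, d_o);
cudaError_t err = cudaGetLastError();   // configuration/launch errors
if (err != cudaSuccess) std::cout << "launch: " << cudaGetErrorString(err) << std::endl;
err = cudaDeviceSynchronize();          // errors during kernel execution
if (err != cudaSuccess) std::cout << "exec: " << cudaGetErrorString(err) << std::endl;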