half calculation generates incorrect result

Hi,

I have code that performs a reduce-sum operation. The input shape is (2,3,4) and the reduction axis is -1. The code works for the float dtype, but when I replaced float with half, the output became incorrect. Could someone help me understand why it is wrong and how to fix it?
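For reference, the operation is equivalent to this plain CPU reduction over the last axis (a sketch for checking only; reduce_sum_ref is an illustrative name, with the (2,3) outer dims flattened to 6 rows):

void reduce_sum_ref(const float *in, float *out) {
  for (int r = 0; r < 2 * 3; r++) {                    // one output per (i,j) pair
    float acc = 0.0f;
    for (int c = 0; c < 4; c++) acc += in[r * 4 + c];  // sum over axis -1
    out[r] = acc;
  }
}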

The code is attached here:

extern "C" __global__ void fused_sum_3_kernel0( half* __restrict__ placeholder,  half* __restrict__ placeholder_red) {
   half placeholder_red_rf[1];
  __shared__ half red_buf0[1024];
  placeholder_red_rf[0] = __float2half_rn(0.000000e+00f);
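  // threads with y < 6 and x < 4 each load one element: 6 outputs, 4 inputs apiece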
  if (((int)threadIdx.y) < 6) {
    if (((int)threadIdx.x) < 4) {
      placeholder_red_rf[0] = (placeholder_red_rf[0] + placeholder[((((int)threadIdx.y) * 4) + ((int)threadIdx.x))]);
    }
  }
  __syncthreads();
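  // stage each thread's partial into shared memory, one 32-wide row per y
  // (the repeated "threadIdx.y < 6" tests below are redundant codegen artifacts)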
  ((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((((((int)threadIdx.y) < 6) && (((int)threadIdx.y) < 6)) && (((int)threadIdx.y) < 6)) && (((int)threadIdx.y) < 6)) ? placeholder_red_rf[0] : __float2half_rn(0.000000e+00f));
  __syncthreads();
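  // tree-reduce each 32-wide row: lanes x < 16 fold in strides 16, 8, 4, 2, 1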
  if (((int)threadIdx.x) < 16) {
    ((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 16)]);
    ((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 8)]);
    ((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 4)]);
    ((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 2)]);
    ((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((volatile __shared__ half*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] + ((volatile __shared__ half*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 1)]);
  }
  __syncthreads();
  if (((int)threadIdx.y) < 6) {
    if (((int)threadIdx.x) == 0) {
      placeholder_red[((int)threadIdx.y)] = ((volatile __shared__ half*)red_buf0)[(((int)threadIdx.y) * 32)];
    }
  }
}

With an all-ones input, the result should be:
array([[[4.],
        [4.],
        [4.]],…

but I got:
x: array([[[3.],
           [3.],
           [3.]],…

My test of your code produces the expected output of 4 for all six elements:

$ cat t1538.cu
#include <cuda_fp16.h>
#include <iostream>

#ifndef USE_FLOAT
typedef half ft;
#else
typedef float ft;
#endif
__global__ void fused_sum_3_kernel0( ft* __restrict__ placeholder,  ft* __restrict__ placeholder_red) {
   ft placeholder_red_rf[1];
  __shared__ ft red_buf0[1024];
#ifndef USE_FLOAT
  const half my_zero = __float2half_rn(0.000000e+00f);
#else
  const float my_zero = 0.0f;
#endif
  placeholder_red_rf[0] = my_zero;
  if (((int)threadIdx.y) < 6) {
    if (((int)threadIdx.x) < 4) {
      placeholder_red_rf[0] = (placeholder_red_rf[0] + placeholder[((((int)threadIdx.y) * 4) + ((int)threadIdx.x))]);
    }
  }
  __syncthreads();
  ((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = (((((((int)threadIdx.y) < 6) && (((int)threadIdx.y) < 6)) && (((int)threadIdx.y) < 6)) && (((int)threadIdx.y) < 6)) ? placeholder_red_rf[0] : my_zero);
  __syncthreads();
  if (((int)threadIdx.x) < 16) {
    ((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 16)]));
    ((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 8)]));
    ((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 4)]));
    ((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 2)]));
    ((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile __shared__ ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile __shared__ ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 1)]));
  }
  __syncthreads();
  if (((int)threadIdx.y) < 6) {
    if (((int)threadIdx.x) == 0) {
      placeholder_red[((int)threadIdx.y)] = ((volatile __shared__ ft*)red_buf0)[(((int)threadIdx.y) * 32)];
    }
  }
}

int main(){
 const int ds = 1024;
 ft *h_i, *h_o, *d_i, *d_o;
 h_i = (ft *)malloc(ds * sizeof(ft));
 h_o = (ft *)malloc(ds * sizeof(ft));
 cudaMalloc(&d_i, ds * sizeof(ft));
 cudaMalloc(&d_o, ds * sizeof(ft));
 for (int i = 0; i < ds; i++) h_i[i] = (ft)1.0f;
 cudaMemset(d_o, 0, ds*sizeof(ft));
 cudaMemcpy(d_i, h_i, ds*sizeof(ft), cudaMemcpyHostToDevice);
 fused_sum_3_kernel0<<<1, dim3(32,32)>>>(d_i, d_o);
 cudaMemcpy(h_o, d_o, ds*sizeof(ft), cudaMemcpyDeviceToHost);
 for (int i = 0; i < 32; i++){
   for (int j = 0; j < 32; j++) std::cout << (float)h_o[i*32+j] << " ";
   std::cout << std::endl;}
 return 0;
}
$ nvcc -o t1538 t1538.cu -arch=sm_70
t1538.cu(24): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(36): warning: attribute "__shared__" does not apply here
t1538.cu(24): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(27): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(28): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(29): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(30): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(31): warning: attribute "__shared__" does not apply here
t1538.cu(36): warning: attribute "__shared__" does not apply here

$ ./t1538
4 4 4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
$

I had to make some changes to get it to compile (a typedef for the element type, a hoisted zero constant, and explicit casts around the volatile half arithmetic), but nothing that should affect functionality.
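Incidentally, the "attribute __shared__ does not apply here" warnings are benign: __shared__ is a storage-class qualifier and has no meaning inside a cast, so nvcc ignores it (which is what the warning says). If you want a clean build, dropping it from the casts behaves identically; for example, the first line of the sweep becomes:

((volatile ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))] = ((ft)(((volatile ft*)red_buf0)[((((int)threadIdx.y) * 32) + ((int)threadIdx.x))]) + (ft)(((volatile ft*)red_buf0)[(((((int)threadIdx.y) * 32) + ((int)threadIdx.x)) + 16)]));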

I’m guessing you are calling this from something like CuPy, but that’s just a guess. My suspicion is that the problem lies elsewhere in your code, perhaps in the interface to this function. I also assume you are compiling for a Pascal or newer architecture; half arithmetic requires compute capability 5.3 or higher, so the target matters.
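If it helps track this down, wrapping the launch with runtime error checks will rule out silent launch or execution failures (a minimal sketch using the standard CUDA runtime API):

fused_sum_3_kernel0<<<1, dim3(32,32)>>>(d_i, d_o);
cudaError_t err = cudaGetLastError();   // configuration/launch errors
if (err != cudaSuccess) std::cout << "launch: " << cudaGetErrorString(err) << std::endl;
err = cudaDeviceSynchronize();          // errors during kernel execution
if (err != cudaSuccess) std::cout << "exec: " << cudaGetErrorString(err) << std::endl;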