Half-precision (FP16) performance on A100

We are evaluating half-precision performance on an A100 using the artificial kernel provided in this post:
https://forums.developer.nvidia.com/t/poor-half-performance/111626

#include "cuda_fp16.h"
#define lc 1048576*2

// FP32 path: one fused multiply-add per call
__device__ __forceinline__ float computation(float a, float b, float c){
  return __fmaf_rn(a,b,c);
}

// FP16 path: one packed half2 FMA (two FP16 FMAs) per call
__device__ __forceinline__ half2 computation(half2 a, half2 b, half2 c){
  return __hfma2(a,b,c);
}

// Each thread loads one element per operand and performs lc dependent FMAs on it
template<typename T>
__global__ void add_comp_kernel(T* a, T* b, T* c, long int N){
  int idx = blockIdx.x*blockDim.x+threadIdx.x;
  if(idx < N){
    T a_d = a[idx];
    T b_d = b[idx];
    T c_d = c[idx];
    for(int i = 0; i<lc; ++i){
      c_d = computation(a_d,b_d, c_d);
    }
    c[idx]=c_d;
  }
}

int main(int argc, char** argv){

  long int N = 1048576ULL;
  int threads = 1024;

  { // FP32 case: N float elements

    float *d_a, *d_b, *d_c;

    cudaMalloc(&d_a, N*sizeof(float));
    cudaMalloc(&d_b, N*sizeof(float));
    cudaMalloc(&d_c, N*sizeof(float));
    add_comp_kernel<<<(int)ceil(float(N)/threads),threads>>>(d_a, d_b, d_c, N);
    cudaDeviceSynchronize();

  }
  { // FP16 case: N/2 half2 elements (each packs two half values)
    half2 *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, N/2*sizeof(half2));
    cudaMalloc(&d_b, N/2*sizeof(half2));
    cudaMalloc(&d_c, N/2*sizeof(half2));
    add_comp_kernel<<<(int)ceil(float(N/2)/threads),threads>>>(d_a, d_b, d_c, N/2);
    cudaDeviceSynchronize();

  }
}
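
As an aside (not part of the original repro), a minimal error-checking sketch like the one below could be wrapped around each launch to confirm both kernels really ran to completion before trusting the timings; the CHECK macro is just an illustrative helper name:

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative helper (not in the original code): report launch/runtime errors.
#define CHECK(call) do {                                            \
    cudaError_t err_ = (call);                                      \
    if (err_ != cudaSuccess)                                        \
      printf("CUDA error: %s (%s:%d)\n",                            \
             cudaGetErrorString(err_), __FILE__, __LINE__);         \
  } while (0)

// Usage after each launch in main():
//   add_comp_kernel<<<grid, threads>>>(d_a, d_b, d_c, N);
//   CHECK(cudaGetLastError());        // reports launch failures
//   CHECK(cudaDeviceSynchronize());   // reports execution failures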


nvcc -arch=sm_80 -o fp16 fp16.cu    
ncu -f --kernel-id ::add_comp_kernel: --metrics gpu__time_duration.sum ./fp16
==PROF== Connected to process 18194 (/home/wqwang/fp16/fp16)
==PROF== Profiling "add_comp_kernel": 0%....50%....100% - 1 pass
==PROF== Profiling "add_comp_kernel": 0%....50%....100% - 1 pass
==PROF== Disconnected from process 18194
[18194] fp16@127.0.0.1
  void add_comp_kernel<float>(float*, float*, float*, long), 2021-Jan-13 20:59:22, Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    gpu__time_duration.sum                                                         msecond                         306.45
    ---------------------------------------------------------------------- --------------- ------------------------------

  void add_comp_kernel<__half2>(__half2*, __half2*, __half2*, long), 2021-Jan-13 20:59:22, Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    gpu__time_duration.sum                                                         msecond                         144.70
    ---------------------------------------------------------------------- --------------- ------------------------------

Currently, we observe only about a 2x speedup for half versus float (306.45 ms vs. 144.70 ms). Yet, as mentioned in the A100 whitepaper, half (non-Tensor-Core) performance is expected to be around 4x that of float. Can someone give suggestions on how to achieve the expected half performance? Thank you in advance!
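
For reference, here is a quick back-of-the-envelope check of the achieved throughput implied by the ncu durations above (a minimal host-side sketch; the peak figures of 19.5 TFLOPS FP32 and 78 TFLOPS non-Tensor-Core FP16 are taken from the A100 whitepaper, and each __fmaf_rn is counted as 2 FLOPs, each __hfma2 as 4 FLOPs):

#include <cstdio>

int main(){
  const double N  = 1048576.0;       // elements per launch, as in the repro above
  const double lc = 2.0*1048576.0;   // inner-loop iterations per thread

  // FP32 kernel: N threads * lc FMAs * 2 FLOPs per FMA
  const double fp32_flops = N * lc * 2.0;
  // FP16 kernel: N/2 threads * lc HFMA2 * 4 FLOPs per HFMA2
  const double fp16_flops = (N / 2.0) * lc * 4.0;

  const double fp32_time = 306.45e-3; // seconds, from the ncu output
  const double fp16_time = 144.70e-3;

  printf("FP32: %.1f TFLOPS achieved (peak ~19.5)\n", fp32_flops / fp32_time * 1e-12);
  printf("FP16: %.1f TFLOPS achieved (peak ~78)\n",   fp16_flops / fp16_time * 1e-12);
  return 0;
}

Under these assumptions the float kernel lands at roughly 14 TFLOPS and the half2 kernel at roughly 30 TFLOPS, consistent with the ~2x ratio observed above rather than the expected 4x.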

[Screenshot 2021-01-13 9:00:57 PM]