Why is cublasHgemm slower than cublasSgemm in CUDA 11.1?

I am using CUDA 11.1. My GPU is a GTX 1660 with compute capability 7.5.
I use cublasSgemm and cublasHgemm to benchmark fp32 and fp16.
The GTX 1660 has a fast fp16 mode that should be about 2x faster than fp32, but when I run the test on the GTX 1660, fp16 is not faster:

When I run the same test in a Docker container with CUDA 10.2, the result is as expected:

When I use fp16 to test a 512 x 512 matrix multiply, CUDA 11.1 with cuDNN 8.0.5 and CUDA 10.2 with cuDNN 8.0.4 call different kernels to execute it:
CUDA 11.1 with cuDNN 8.0.5:

CUDA 10.2 with cuDNN 8.0.4:

Is this a bug in CUDA 11.1, or is something wrong with my code?
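
For reference, this is roughly how I confirm what compute capability the card reports (a minimal sketch; I simply query device 0):

#include <iostream>
#include <cuda_runtime_api.h>

int main() {
  cudaDeviceProp prop{};
  cudaGetDeviceProperties(&prop, 0);  // device 0 is the GTX 1660 in my case
  std::cout << prop.name << " compute capability "
            << prop.major << "." << prop.minor << std::endl;
  return 0;
}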

My code:

#include <iostream>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>

int main() {

  int min_m_k_n = 2;
  int max_m_k_n = 4096*4;
  int repeats = 5;

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  
  cublasHandle_t handle = nullptr;
  cublasCreate_v2(&handle);

  std::cout << "**************************** Test FP32 ****************************" << std::endl;
  float *ha_fp32 = new float[max_m_k_n * max_m_k_n];
  float *hb_fp32 = new float[max_m_k_n * max_m_k_n];
  
  // Generate input data on the CPU
  for (uint32_t i=0; i<max_m_k_n * max_m_k_n; ++i) {
    ha_fp32[i] = i;
    hb_fp32[i] = i / 10.f;
  }


  float *dA_fp32, *dB_fp32, *dC_fp32;
  cudaMallocManaged((void **)&dA_fp32, sizeof(float) * max_m_k_n * max_m_k_n);
  cudaMallocManaged((void **)&dB_fp32, sizeof(float) * max_m_k_n * max_m_k_n);
  cudaMallocManaged((void **)&dC_fp32, sizeof(float) * max_m_k_n * max_m_k_n);
  float alpha = 1.f;
  float beta = 0.f;

  cudaMemcpy(dA_fp32, ha_fp32, sizeof(float) * max_m_k_n * max_m_k_n, cudaMemcpyHostToDevice);
  cudaMemcpy(dB_fp32, hb_fp32, sizeof(float) * max_m_k_n * max_m_k_n, cudaMemcpyHostToDevice);

  for(int size = min_m_k_n; size <= max_m_k_n; size=size*2) {
    float sum = 0.0;
    for(int rep = 0; rep < repeats; ++rep) {
      cudaEventRecord(start, 0);
      cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size, &alpha, dA_fp32, size, dB_fp32, size, &beta, dC_fp32, size); 
      cudaEventRecord(stop, 0);
      cudaEventSynchronize(stop);
      float elapsed;
      cudaEventElapsedTime(&elapsed, start, stop);
      sum += elapsed / 1000.f;
    }
    std::cout << "FP32: Compute matrix size " << size << "x" << size << " use: " <<  sum / repeats << " s" << std::endl;
  }

  delete[] ha_fp32;
  delete[] hb_fp32;
  cudaFree(dA_fp32);
  cudaFree(dB_fp32);
  cudaFree(dC_fp32);

  std::cout << std::endl << std::endl;

  std::cout << "**************************** Test FP16 ****************************" << std::endl;
  __half *ha_fp16 = new __half[max_m_k_n * max_m_k_n];
  __half *hb_fp16 = new __half[max_m_k_n * max_m_k_n];
  
  // Generate input data on the CPU
  for (uint32_t i=0; i<max_m_k_n * max_m_k_n; ++i) {
    float data = i;
    ha_fp16[i] = __float2half(data);
    hb_fp16[i] = __float2half(data / 10.f);
  }


  __half *dA_fp16, *dB_fp16, *dC_fp16;
  cudaMallocManaged((void **)&dA_fp16, sizeof(__half) * max_m_k_n * max_m_k_n);
  cudaMallocManaged((void **)&dB_fp16, sizeof(__half) * max_m_k_n * max_m_k_n);
  cudaMallocManaged((void **)&dC_fp16, sizeof(__half) * max_m_k_n * max_m_k_n);
  __half alpha_h = __float2half(1.f);
  __half beta_h = __float2half(0.f);

  cudaMemcpy(dA_fp16, ha_fp16, sizeof(__half) * max_m_k_n * max_m_k_n, cudaMemcpyHostToDevice);
  cudaMemcpy(dB_fp16, hb_fp16, sizeof(__half) * max_m_k_n * max_m_k_n, cudaMemcpyHostToDevice);

  for(int size = min_m_k_n; size <= max_m_k_n; size=size*2) {
    float sum = 0.0;
    for(int rep = 0; rep < repeats; ++rep) {
      cudaEventRecord(start, 0);
      cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size, &alpha_h, dA_fp16, size, dB_fp16, size, &beta_h, dC_fp16, size); 
      cudaEventRecord(stop, 0);
      cudaEventSynchronize(stop);
      float elapsed;
      cudaEventElapsedTime(&elapsed, start, stop);
      sum += elapsed / 1000.f;
    }
    std::cout << "FP16: Compute matrix size " << size << "x" << size << " use: " <<  sum / repeats << " s" << std::endl;
  }

  delete[] ha_fp16;
  delete[] hb_fp16;
  cudaFree(dA_fp16);
  cudaFree(dB_fp16);
  cudaFree(dC_fp16);

  cublasDestroy_v2(handle);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  return 0;
}
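
As another variant to compare, I am thinking about calling cublasGemmEx with an explicit fp16 compute type instead of cublasHgemm, to see whether cuBLAS then picks a different kernel on CUDA 11.1. This is only a sketch I have not verified yet (the helper name is mine); it would replace the cublasHgemm call in the benchmark above:

// Sketch only: the same half-precision GEMM, but with the compute type
// spelled out (CUBLAS_COMPUTE_16F). Whether this changes the kernel that
// cuBLAS selects on CUDA 11.1 is exactly what I want to find out.
cublasStatus_t hgemm_explicit_fp16(cublasHandle_t handle, int size,
                                   const __half *dA, const __half *dB, __half *dC) {
  __half alpha = __float2half(1.f);
  __half beta  = __float2half(0.f);
  return cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      size, size, size,
                      &alpha,
                      dA, CUDA_R_16F, size,
                      dB, CUDA_R_16F, size,
                      &beta,
                      dC, CUDA_R_16F, size,
                      CUBLAS_COMPUTE_16F,   // fp16 accumulation, like cublasHgemm
                      CUBLAS_GEMM_DEFAULT);
}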

Recently, through testing, I found that calling the cuSPARSE library on a Tesla A100 or Tesla P100 with CUDA 11.1 is much slower than with CUDA 9.0. This looks like a bug in CUDA 11.1.