Why is cublasHgemm slower than cublasSgemm in CUDA 11.1?

I use CUDA 11.1. My GPU is a GTX 1660 with compute capability 7.5.
I use cublasHgemm and cublasSgemm to test fp16 and fp32.
The GTX 1660 has a fast fp16 mode that should be roughly 2x faster than fp32, but when I run the test on it, fp16 is not faster:

When I run it in a Docker container with CUDA 10.2, the result is as expected:

I use fp16 to test a 512 x 512 matrix multiply; CUDA 11.1 with cuDNN 8.0.5 and CUDA 10.2 with cuDNN 8.0.4 call different kernels to execute it:
CUDA 11.1 with cuDNN 8.0.5

CUDA 10.2 with cuDNN 8.0.4
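
For reference, a kernel-level trace should confirm which cuBLAS kernel each toolkit dispatches for the same call; assuming Nsight Systems is installed and the compiled benchmark is named gemm_test (a placeholder name), something like:

nsys profile --stats=true ./gemm_test

prints the GPU kernels by name and time for each toolkit version.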

Is this a bug in CUDA 11.1, or is something wrong with my code?

My code:

#include <iostream>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>

int main() {

  int min_m_k_n = 2;
  int max_m_k_n = 4096*4;
  int repeats = 5;

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  
  cublasHandle_t handle = nullptr;
  cublasCreate_v2(&handle);

  std::cout << "**************************** Test FP32 ****************************" << std::endl;
  float *ha_fp32 = new float[max_m_k_n * max_m_k_n];
  float *hb_fp32 = new float[max_m_k_n * max_m_k_n];
  
  // Generate test data on the CPU
  for (uint32_t i=0; i<max_m_k_n * max_m_k_n; ++i) {
    ha_fp32[i] = i;
    hb_fp32[i] = i / 10.f;
  }


  float *dA_fp32, *dB_fp32, *dC_fp32;
  cudaMallocManaged((void **)&dA_fp32, sizeof(float) * max_m_k_n * max_m_k_n);
  cudaMallocManaged((void **)&dB_fp32, sizeof(float) * max_m_k_n * max_m_k_n);
  cudaMallocManaged((void **)&dC_fp32, sizeof(float) * max_m_k_n * max_m_k_n);
  float alpha = 1.f;
  float beta = 0.f;

  cudaMemcpy(dA_fp32, ha_fp32, sizeof(float) * max_m_k_n * max_m_k_n, cudaMemcpyHostToDevice);
  cudaMemcpy(dB_fp32, hb_fp32, sizeof(float) * max_m_k_n * max_m_k_n, cudaMemcpyHostToDevice);

  for(int size = min_m_k_n; size <= max_m_k_n; size=size*2) {
    float sum = 0.0;
    for(int rep = 0; rep < repeats; ++rep) {
      cudaEventRecord(start, 0);
      cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size, &alpha, dA_fp32, size, dB_fp32, size, &beta, dC_fp32, size); 
      cudaEventRecord(stop, 0);
      cudaEventSynchronize(stop);
      float elapsed;
      cudaEventElapsedTime(&elapsed, start, stop);
      sum += elapsed / 1000.f;
    }
    std::cout << "FP32: Compute matrix size " << size << "x" << size << " use: " <<  sum / repeats << " s" << std::endl;
  }

  delete[] ha_fp32;
  delete[] hb_fp32;
  cudaFree(dA_fp32);
  cudaFree(dB_fp32);
  cudaFree(dC_fp32);

  std::cout << std::endl << std::endl;

  std::cout << "**************************** Test FP16 ****************************" << std::endl;
  __half *ha_fp16 = new __half[max_m_k_n * max_m_k_n];
  __half *hb_fp16 = new __half[max_m_k_n * max_m_k_n];
  
  // Generate test data on the CPU
  for (uint32_t i=0; i<max_m_k_n * max_m_k_n; ++i) {
    float data = i;
    ha_fp16[i] = __float2half(data);
    hb_fp16[i] = __float2half(data / 10.f);
  }


  __half *dA_fp16, *dB_fp16, *dC_fp16;
  cudaMallocManaged((void **)&dA_fp16, sizeof(__half) * max_m_k_n * max_m_k_n);
  cudaMallocManaged((void **)&dB_fp16, sizeof(__half) * max_m_k_n * max_m_k_n);
  cudaMallocManaged((void **)&dC_fp16, sizeof(__half) * max_m_k_n * max_m_k_n);
  __half alpha_h = __float2half(1.f);
  __half beta_h = __float2half(0.f);

  cudaMemcpy(dA_fp16, ha_fp16, sizeof(__half) * max_m_k_n * max_m_k_n, cudaMemcpyHostToDevice);
  cudaMemcpy(dB_fp16, hb_fp16, sizeof(__half) * max_m_k_n * max_m_k_n, cudaMemcpyHostToDevice);

  for(int size = min_m_k_n; size <= max_m_k_n; size=size*2) {
    float sum = 0.0;
    for(int rep = 0; rep < repeats; ++rep) {
      cudaEventRecord(start, 0);
      cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size, &alpha_h, dA_fp16, size, dB_fp16, size, &beta_h, dC_fp16, size); 
      cudaEventRecord(stop, 0);
      cudaEventSynchronize(stop);
      float elapsed;
      cudaEventElapsedTime(&elapsed, start, stop);
      sum += elapsed / 1000.f;
    }
    std::cout << "fp16: Compute matrix size " << size << "x" << size << " use: " <<  sum / repeats << " s" << std::endl;
  }

  delete[] ha_fp16;
  delete[] hb_fp16;
  cudaFree(dA_fp16);
  cudaFree(dB_fp16);
  cudaFree(dC_fp16);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cublasDestroy_v2(handle);

  return 0;
}
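
Side note on the benchmark itself: the very first timed call per run also pays one-time cuBLAS setup and kernel-load cost, which is probably why the smallest sizes look inflated. A variant of the FP32 loop that does one untimed warm-up call per size and also reports effective TFLOPS (2*N^3 FLOPs per GEMM) could look like this; it reuses the variables from the code above and is only a sketch:

  for (int size = min_m_k_n; size <= max_m_k_n; size = size * 2) {
    // Untimed warm-up call so one-time setup cost is excluded from the average.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size, &alpha, dA_fp32, size, dB_fp32, size, &beta, dC_fp32, size);
    cudaDeviceSynchronize();

    float sum = 0.f;
    for (int rep = 0; rep < repeats; ++rep) {
      cudaEventRecord(start, 0);
      cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size, &alpha, dA_fp32, size, dB_fp32, size, &beta, dC_fp32, size);
      cudaEventRecord(stop, 0);
      cudaEventSynchronize(stop);
      float elapsed;
      cudaEventElapsedTime(&elapsed, start, stop);
      sum += elapsed / 1000.f;
    }
    double avg_s = sum / repeats;
    double tflops = 2.0 * size * size * size / avg_s / 1e12;  // 2*N^3 FLOPs per GEMM
    std::cout << "FP32: " << size << "x" << size << " avg " << avg_s << " s, " << tflops << " TFLOPS" << std::endl;
  }

The same change applies to the fp16 loop; it does not change the fp16/fp32 ratio for the large sizes, only the small ones.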

Recently, through testing, I found that calling the cuSPARSE library on a Tesla A100 or Tesla P100 with CUDA 11.1 is much slower than with CUDA 9.0. This looks like a bug in CUDA 11.1.

Hi ZJQ007,

I’ve tested your code above on my TITAN RTX (CC 7.5) with both CUDA 11.1 and CUDA 10.2.
Results are as expected.

CUDA 11.1

**************************** Test FP32 ****************************
FP32: Compute matrix size 2x2 use: 3.9776e-05 s
FP32: Compute matrix size 4x4 use: 4.5888e-06 s
FP32: Compute matrix size 8x8 use: 4.7808e-06 s
FP32: Compute matrix size 16x16 use: 4.3392e-06 s
FP32: Compute matrix size 32x32 use: 8.3584e-06 s
FP32: Compute matrix size 64x64 use: 7.4624e-06 s
FP32: Compute matrix size 128x128 use: 1.14368e-05 s
FP32: Compute matrix size 256x256 use: 1.85664e-05 s
FP32: Compute matrix size 512x512 use: 4.15424e-05 s
FP32: Compute matrix size 1024x1024 use: 0.000174003 s
FP32: Compute matrix size 2048x2048 use: 0.00115537 s
FP32: Compute matrix size 4096x4096 use: 0.00873704 s
FP32: Compute matrix size 8192x8192 use: 0.0733965 s
FP32: Compute matrix size 16384x16384 use: 0.596964 s


**************************** Test FP16 ****************************
fp16: Compute matrix size 2x2 use: 3.64096e-05 s
fp16: Compute matrix size 4x4 use: 5.4208e-06 s
fp16: Compute matrix size 8x8 use: 5.9392e-06 s
fp16: Compute matrix size 16x16 use: 4.8512e-06 s
fp16: Compute matrix size 32x32 use: 5.6512e-06 s
fp16: Compute matrix size 64x64 use: 5.1392e-06 s
fp16: Compute matrix size 128x128 use: 1.2e-05 s
fp16: Compute matrix size 256x256 use: 1.12512e-05 s
fp16: Compute matrix size 512x512 use: 2.6368e-05 s
fp16: Compute matrix size 1024x1024 use: 3.9168e-05 s
fp16: Compute matrix size 2048x2048 use: 0.000229235 s
fp16: Compute matrix size 4096x4096 use: 0.00135612 s
fp16: Compute matrix size 8192x8192 use: 0.00954235 s
fp16: Compute matrix size 16384x16384 use: 0.0764096 s

CUDA 10.2

**************************** Test FP32 ****************************
FP32: Compute matrix size 2x2 use: 0.000609478 s
FP32: Compute matrix size 4x4 use: 4.2688e-06 s
FP32: Compute matrix size 8x8 use: 4.2368e-06 s
FP32: Compute matrix size 16x16 use: 7.7824e-05 s
FP32: Compute matrix size 32x32 use: 6.6432e-06 s
FP32: Compute matrix size 64x64 use: 6.79936e-05 s
FP32: Compute matrix size 128x128 use: 7.4688e-06 s
FP32: Compute matrix size 256x256 use: 1.93792e-05 s
FP32: Compute matrix size 512x512 use: 3.76256e-05 s
FP32: Compute matrix size 1024x1024 use: 0.00018768 s
FP32: Compute matrix size 2048x2048 use: 0.00116396 s
FP32: Compute matrix size 4096x4096 use: 0.0087014 s
FP32: Compute matrix size 8192x8192 use: 0.0727864 s
FP32: Compute matrix size 16384x16384 use: 0.576091 s


**************************** Test FP16 ****************************
fp16: Compute matrix size 2x2 use: 0.000232378 s
fp16: Compute matrix size 4x4 use: 9.4976e-06 s
fp16: Compute matrix size 8x8 use: 9.9392e-06 s
fp16: Compute matrix size 16x16 use: 1.03552e-05 s
fp16: Compute matrix size 32x32 use: 1.20192e-05 s
fp16: Compute matrix size 64x64 use: 1.49504e-05 s
fp16: Compute matrix size 128x128 use: 9.41376e-05 s
fp16: Compute matrix size 256x256 use: 2.70272e-05 s
fp16: Compute matrix size 512x512 use: 0.000175955 s
fp16: Compute matrix size 1024x1024 use: 0.000125894 s
fp16: Compute matrix size 2048x2048 use: 0.000808128 s
fp16: Compute matrix size 4096x4096 use: 0.00580365 s
fp16: Compute matrix size 8192x8192 use: 0.0355324 s
fp16: Compute matrix size 16384x16384 use: 0.263431 s

Note: It’s not uncommon for cuBLAS to call new/different kernels between versions.
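
If you want to take the dispatch heuristics out of the picture, one thing worth trying is calling cublasGemmEx with an explicit FP16 compute type instead of cublasHgemm. A minimal sketch, drop-in for the cublasHgemm call in the code above (same buffers and scalars, error checking omitted):

  cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
               size, size, size,
               &alpha_h,
               dA_fp16, CUDA_R_16F, size,
               dB_fp16, CUDA_R_16F, size,
               &beta_h,
               dC_fp16, CUDA_R_16F, size,
               CUBLAS_COMPUTE_16F,    // request all-FP16 compute (CUDA 11 API); alpha/beta are __half here
               CUBLAS_GEMM_DEFAULT);

If that still picks a slow kernel on the GTX 1660 under CUDA 11.1, it would point more toward a library regression than toward the benchmark code.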