I use CUDA 11.1, My GPU is GTX1660 with capability 7.5.
I use cublasSgemm and cublasHgemm to test fp16 and fp32.
GTX 1660 has a fast FP16 mode that should be about 2x faster than FP32, but when I run the test on the GTX 1660 with CUDA 11.1, FP16 is not faster:
When I run the same test in a Docker container with CUDA 10.2, the result is as expected (FP16 is roughly 2x faster):
When I use FP16 to test a 512 x 512 matrix multiply, CUDA 11.1 with cuDNN 8.0.5 and CUDA 10.2 with cuDNN 8.0.4 call different kernels to execute it:
cuda11.1 with cudnn8.0.5
cuda10.2 with cudnn8.0.4
Is this a bug in CUDA 11.1, or is there something wrong with my code?
My code:
#include <iostream>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>
int main() {
    // Benchmark cublasSgemm (FP32) against cublasHgemm (FP16) on square
    // matrices from min_m_k_n up to max_m_k_n, doubling the size each step
    // and averaging `repeats` timed runs per size.
    //
    // Fixes vs. the original:
    //  * warm-up GEMM before each timed loop, so one-time cuBLAS
    //    initialization / kernel-load cost is not charged to the first size;
    //  * FP16 input values kept within half-precision range (the original
    //    `i`-valued fill overflowed to inf above 65504, leaving almost all
    //    inputs infinite);
    //  * events and the cuBLAS handle are destroyed (they leaked before);
    //  * cublasCreate_v2 result is checked;
    //  * element counts computed in size_t.
    const int min_m_k_n = 2;
    const int max_m_k_n = 4096 * 4;
    const int repeats = 5;
    // size_t so byte-size products below cannot overflow int arithmetic.
    const size_t n_elems = static_cast<size_t>(max_m_k_n) * max_m_k_n;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cublasHandle_t handle = nullptr;
    if (cublasCreate_v2(&handle) != CUBLAS_STATUS_SUCCESS) {
        std::cerr << "cublasCreate failed" << std::endl;
        return 1;
    }

    std::cout << "**************************** Test FP32 ****************************" << std::endl;
    float *ha_fp32 = new float[n_elems];
    float *hb_fp32 = new float[n_elems];
    // Generated data in cpu
    for (size_t i = 0; i < n_elems; ++i) {
        ha_fp32[i] = static_cast<float>(i);
        hb_fp32[i] = static_cast<float>(i) / 10.f;
    }

    float *dA_fp32, *dB_fp32, *dC_fp32;
    cudaMallocManaged((void **)&dA_fp32, sizeof(float) * n_elems);
    cudaMallocManaged((void **)&dB_fp32, sizeof(float) * n_elems);
    cudaMallocManaged((void **)&dC_fp32, sizeof(float) * n_elems);
    float alpha = 1.f;
    float beta = 0.f;
    cudaMemcpy(dA_fp32, ha_fp32, sizeof(float) * n_elems, cudaMemcpyHostToDevice);
    cudaMemcpy(dB_fp32, hb_fp32, sizeof(float) * n_elems, cudaMemcpyHostToDevice);

    // Warm-up: the first cuBLAS call pays one-time library-init and
    // kernel-load cost; run one untimed GEMM so it is excluded from timing.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, min_m_k_n, min_m_k_n, min_m_k_n,
                &alpha, dA_fp32, min_m_k_n, dB_fp32, min_m_k_n, &beta, dC_fp32, min_m_k_n);
    cudaDeviceSynchronize();

    for (int size = min_m_k_n; size <= max_m_k_n; size = size * 2) {
        float sum = 0.f;
        for (int rep = 0; rep < repeats; ++rep) {
            cudaEventRecord(start, 0);
            cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size,
                        &alpha, dA_fp32, size, dB_fp32, size, &beta, dC_fp32, size);
            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);  // block until the GEMM has finished
            float elapsed;
            cudaEventElapsedTime(&elapsed, start, stop);  // milliseconds
            sum += elapsed / 1000.f;     // accumulate in seconds
        }
        std::cout << "FP32: Compute matrix size " << size << "x" << size
                  << " use: " << sum / repeats << " s" << std::endl;
    }
    delete[] ha_fp32;
    delete[] hb_fp32;
    cudaFree(dA_fp32);
    cudaFree(dB_fp32);
    cudaFree(dC_fp32);

    std::cout << std::endl << std::endl;
    std::cout << "**************************** Test FP16 ****************************" << std::endl;
    __half *ha_fp16 = new __half[n_elems];
    __half *hb_fp16 = new __half[n_elems];
    // Generated data in cpu. Values are wrapped modulo 1024 so they stay
    // inside half-precision range (__half overflows to inf above 65504;
    // the original i-valued fill produced almost entirely inf inputs).
    for (size_t i = 0; i < n_elems; ++i) {
        float data = static_cast<float>(i % 1024);
        ha_fp16[i] = __float2half(data);
        hb_fp16[i] = __float2half(data / 10.f);
    }

    __half *dA_fp16, *dB_fp16, *dC_fp16;
    cudaMallocManaged((void **)&dA_fp16, sizeof(__half) * n_elems);
    cudaMallocManaged((void **)&dB_fp16, sizeof(__half) * n_elems);
    cudaMallocManaged((void **)&dC_fp16, sizeof(__half) * n_elems);
    __half alpha_h = __float2half(1.f);
    __half beta_h = __float2half(0.f);
    cudaMemcpy(dA_fp16, ha_fp16, sizeof(__half) * n_elems, cudaMemcpyHostToDevice);
    cudaMemcpy(dB_fp16, hb_fp16, sizeof(__half) * n_elems, cudaMemcpyHostToDevice);

    // Warm-up for the FP16 path (a different set of kernels than Sgemm).
    cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, min_m_k_n, min_m_k_n, min_m_k_n,
                &alpha_h, dA_fp16, min_m_k_n, dB_fp16, min_m_k_n, &beta_h, dC_fp16, min_m_k_n);
    cudaDeviceSynchronize();

    for (int size = min_m_k_n; size <= max_m_k_n; size = size * 2) {
        float sum = 0.f;
        for (int rep = 0; rep < repeats; ++rep) {
            cudaEventRecord(start, 0);
            cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size,
                        &alpha_h, dA_fp16, size, dB_fp16, size, &beta_h, dC_fp16, size);
            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);  // block until the GEMM has finished
            float elapsed;
            cudaEventElapsedTime(&elapsed, start, stop);  // milliseconds
            sum += elapsed / 1000.f;     // accumulate in seconds
        }
        std::cout << "fp16: Compute matrix size " << size << "x" << size
                  << " use: " << sum / repeats << " s" << std::endl;
    }
    delete[] ha_fp16;
    delete[] hb_fp16;
    cudaFree(dA_fp16);
    cudaFree(dB_fp16);
    cudaFree(dC_fp16);

    // Release CUDA/cuBLAS resources (the original leaked these).
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cublasDestroy_v2(handle);
    return 0;
}