We are evaluating FP16 (half) performance on an A100 using the artificial kernel provided in this forum post:
https://forums.developer.nvidia.com/t/poor-half-performance/111626
#include "cuda_fp16.h"
#define lc 1048576*2
// Single-precision fused multiply-add (round-to-nearest): a*b + c in one
// rounding step. Kept as a tiny overload so the benchmark kernel can be
// templated over the element type.
__device__ __forceinline__ float computation(float a, float b, float c){
    float result = __fmaf_rn(a, b, c);
    return result;
}
// Packed half-precision fused multiply-add: performs two FP16 FMAs per call
// (one per lane of the half2 vector). Overload matching the float version
// above so add_comp_kernel<T> can dispatch on element type.
__device__ __forceinline__ half2 computation(half2 a, half2 b, half2 c){
    half2 result = __hfma2(a, b, c);
    return result;
}
// Compute-bound benchmark kernel: each in-range thread executes `lc`
// serially dependent FMAs (the loop-carried dependence on c_d prevents the
// compiler from collapsing the loop), so runtime measures FMA issue rate,
// not memory bandwidth.
//
// Expects a 1-D launch with at least N total threads; excess threads exit
// via the bounds check. a and b are read once per thread, c is
// read-modify-written.
template<typename T>
__global__ void add_comp_kernel(const T* __restrict__ a,
                                const T* __restrict__ b,
                                T* __restrict__ c, long int N){
    // 64-bit index: blockIdx.x * blockDim.x overflows a 32-bit int once the
    // grid covers more than ~2^31 elements, and N is declared long int.
    long int idx = (long int)blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        T a_d = a[idx];
        T b_d = b[idx];
        T c_d = c[idx];
        for (int i = 0; i < lc; ++i) {
            c_d = computation(a_d, b_d, c_d);
        }
        // Write the result back so the loop is not dead-code-eliminated.
        c[idx] = c_d;
    }
}
// Fail-fast CUDA error check: errors are sticky and otherwise silent, so
// report the failing call and its message, then abort.
static void check(cudaError_t err, const char* what){
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver: launches the benchmark kernel once over N floats and once
// over N/2 half2 elements (N half values). Device buffers are deliberately
// left uninitialized -- this is a throughput benchmark, so the numeric
// values do not matter.
int main(int argc, char** argv){
    long int N = 1048576ULL;
    int threads = 1024;
    {
        float *d_a, *d_b, *d_c;
        check(cudaMalloc(&d_a, N*sizeof(float)), "cudaMalloc d_a (float)");
        check(cudaMalloc(&d_b, N*sizeof(float)), "cudaMalloc d_b (float)");
        check(cudaMalloc(&d_c, N*sizeof(float)), "cudaMalloc d_c (float)");
        // Integer ceil-division: exact for any N, unlike float(N)/threads
        // which can round for large N.
        int blocks = (int)((N + threads - 1) / threads);
        add_comp_kernel<<<blocks, threads>>>(d_a, d_b, d_c, N);
        check(cudaGetLastError(), "float kernel launch");
        check(cudaDeviceSynchronize(), "float kernel execution");
        check(cudaFree(d_a), "cudaFree d_a (float)");
        check(cudaFree(d_b), "cudaFree d_b (float)");
        check(cudaFree(d_c), "cudaFree d_c (float)");
    }
    {
        // half2 packs two half values per element, so N/2 elements cover
        // the same number of scalar values as the float run.
        long int M = N / 2;
        half2 *d_a, *d_b, *d_c;
        check(cudaMalloc(&d_a, M*sizeof(half2)), "cudaMalloc d_a (half2)");
        check(cudaMalloc(&d_b, M*sizeof(half2)), "cudaMalloc d_b (half2)");
        check(cudaMalloc(&d_c, M*sizeof(half2)), "cudaMalloc d_c (half2)");
        int blocks = (int)((M + threads - 1) / threads);
        add_comp_kernel<<<blocks, threads>>>(d_a, d_b, d_c, M);
        check(cudaGetLastError(), "half2 kernel launch");
        check(cudaDeviceSynchronize(), "half2 kernel execution");
        check(cudaFree(d_a), "cudaFree d_a (half2)");
        check(cudaFree(d_b), "cudaFree d_b (half2)");
        check(cudaFree(d_c), "cudaFree d_c (half2)");
    }
    return 0;
}
nvcc -arch=sm_80 -o fp16 fp16.cu
ncu -f --kernel-id ::add_comp_kernel: --metrics gpu__time_duration.sum ./fp16
==PROF== Connected to process 18194 (/home/wqwang/fp16/fp16)
==PROF== Profiling "add_comp_kernel": 0%....50%....100% - 1 pass
==PROF== Profiling "add_comp_kernel": 0%....50%....100% - 1 pass
==PROF== Disconnected from process 18194
[18194] fp16@127.0.0.1
void add_comp_kernel<float>(float*, float*, float*, long), 2021-Jan-13 20:59:22, Context 1, Stream 7
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
gpu__time_duration.sum msecond 306.45
---------------------------------------------------------------------- --------------- ------------------------------
void add_comp_kernel<__half2>(__half2*, __half2*, __half2*, long), 2021-Jan-13 20:59:22, Context 1, Stream 7
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
gpu__time_duration.sum msecond 144.70
---------------------------------------------------------------------- --------------- ------------------------------
Currently, we observe roughly 2x half performance versus float (306.45 ms vs. 144.70 ms). Yet, according to the A100 whitepaper, non-tensor-core FP16 peak throughput is expected to be around 4x that of FP32. Can someone give suggestions on how to achieve the expected half performance? Thank you in advance!