Almost no performance improvement with half2

Hi,

In one of my programs I'm using mixed-precision calculations. The program works fine, but I wanted to improve the performance further, so I changed from half to the half2 vector data type.
The main program is written in FORTRAN, but half2 is only available in the C/C++ part of CUDA.

I developed the kernel in C++ and integrated it into the FORTRAN program. The program works correctly; however, there is almost no performance improvement when going from half to half2, and I wonder whether the kernel I developed is correct.
Below is a small benchmark program containing both the half and the half2 kernel; the performance improvement I observe on our DGX-2 cluster is around 12%. I replaced all operations inside the half2 kernel with intrinsic functions. Is the usage of these functions correct? Is there anything I can do to improve the performance further?

Thank you for your help

makefile

ARCH = -ta=tesla:cc70
CC   = mpic++
CCU  = nvcc -ccbin=mpic++

RM   = /bin/rm
PROG = run

OBJS = main.o
#-mp
OPTS =  ${ARCH} -acc -Minfo=accel -Minfo -Mcuda=ptxinfo

%.o : %.c
        ${CC}  ${OPTS} -c ${CFLAGS} $<
%.o : %.cu
        ${CCU} -arch=sm_70 --ptxas-options=-v -Xcompiler "${OPTS}" -c ${CUFLAGS} $<

all : ${PROG}
${PROG} : ${OBJS}
        mpic++ ${OPTS} -Mcuda -o $@ ${OBJS} ${LDFLAGS} ${CFPMODEL} ${libs}

clean :
        ${RM} -f ${PROG} *.o *~

main.cu

#include <iostream>
#include <cuda_fp16.h>
#include <mpi.h>
using namespace std;

__global__
void kernel_half2(int iam0, int nx, int ny, int nz, int nv,
                              int nb, int nv2,
                              half2 omega1_,
                              half2 omega2_,
                              const half2*  __restrict__ vxyzv_all,
                              const half* __restrict__ vzl_to_vzl2_p,
                              const half2*  __restrict__ f,
                                    half2*  __restrict__ df,
                              int icolor)
{
   const int nb1  = 2*nb;

   const int nblock = 2;
   const int ncolor = 2;
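   // dfc_ accumulates the update; df_kn2 .. df_kp2 keep the df values at k-2 .. k+2 in registers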
   half2  dfc_, tmp1, tmp2, df_kn2,df_kn1, df_k0,df_kp1,df_kp2;

   const int l  = threadIdx.x; //  get the id
   const int jj = blockIdx.y*nblock;
   const int ii = ((jj/nblock + (icolor-1))%2 ) * nblock + (blockIdx.x)*nblock*ncolor;

   for(int i = ii;i<min(ii+nblock,nx);i++)
   {
     const half2 vzl_to_vzl2 = __half2half2(vzl_to_vzl2_p[i]);
     for(int j = jj;j<min(jj+nblock,ny);j++)
     {
        df_kn2 = half2(0.0,0.0);
        df_kn1 = half2(0.0,0.0);
        const size_t index = 1 + l + ((nb + 0 + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv2));
        const size_t index1= 1 + l + ((nb + 1 + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv2));
        df_k0  = df[index];
        df_kp1 = df[index1];

        for(int k = 0;k<nz-2;k++)
        {
            const size_t index2= 1 + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv2));
           dfc_ = f[index2];
           const size_t index3= 1 + l + ((nb + k+2 + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv2));
           df_kp2 = df[index3];

           tmp1 = __hmul2(vzl_to_vzl2 ,__hsub2(df_kp2 , df_kn2));
           tmp2 = __hsub2(df_kn1,df_kp1);
           tmp2 = __hadd2(tmp1 , tmp2);
           const size_t index4= l + (8 + j*15 + i*ny*15)*nv;
           dfc_ = __hfma2(vxyzv_all[index4],  tmp2, dfc_);

           const size_t index5 = l + (0 + j*15 + i*ny*15)*nv;
           const size_t index6 = l + (1 + j*15 + i*ny*15)*nv;
           const size_t index7 = l + (2 + j*15 + i*ny*15)*nv;
           const size_t index8 = l + (3 + j*15 + i*ny*15)*nv;

           const size_t index9 = 1 + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i-1)))*(nv2));
           const size_t index10= 1 + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i+1)))*(nv2));
           const size_t index11= 1 + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i-2)))*(nv2));
           const size_t index12= 1 + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i+2)))*(nv2));
           dfc_ =  __hfma2(vxyzv_all[index5], df[index9], dfc_);
           dfc_ =  __hsub2(dfc_ , __hmul2(vxyzv_all[index6]  , df[index10]));
           dfc_ =  __hsub2(dfc_ , __hmul2(vxyzv_all[index7]  , df[index11]));
           dfc_ =  __hfma2(vxyzv_all[index8], df[index12], dfc_);

           const size_t index13 = l + (4 + j*15 + i*ny*15)*nv;
           const size_t index14 = l + (5 + j*15 + i*ny*15)*nv;
           const size_t index15 = l + (6 + j*15 + i*ny*15)*nv;
           const size_t index16 = l + (7 + j*15 + i*ny*15)*nv;

           const size_t index17= 1 + l + ((nb + k + (nz + nb1)*(nb + j-1 + (ny + nb1)*(nb+i)))*(nv2));
           const size_t index18= 1 + l + ((nb + k + (nz + nb1)*(nb + j+1 + (ny + nb1)*(nb+i)))*(nv2));
           const size_t index19= 1 + l + ((nb + k + (nz + nb1)*(nb + j-2 + (ny + nb1)*(nb+i)))*(nv2));
           const size_t index20= 1 + l + ((nb + k + (nz + nb1)*(nb + j+2 + (ny + nb1)*(nb+i)))*(nv2));
           dfc_ = __hfma2(vxyzv_all[index13]  , df[index17], dfc_);
           dfc_ = __hsub2(dfc_ ,  __hmul2(vxyzv_all[index14]  , df[index18]));
           dfc_ = __hsub2(dfc_ ,  __hmul2(vxyzv_all[index15]  , df[index19]));
           dfc_ = __hfma2(vxyzv_all[index16]  , df[index20], dfc_);

           const size_t index21 = l + (14 + j*15 + i*ny*15)*nv;
           dfc_ = __hmul2(dfc_   ,vxyzv_all[index21]);
           tmp1 = __hmul2(omega2_, df_k0);
           tmp2 = __hmul2(dfc_   , omega1_);
           const size_t index22= 1 + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv2));
           df[index22] =  __hadd2(tmp1 , tmp2);

           // shift the k-direction stencil registers for the next iteration
           df_kn2 = df_kn1;
           df_k0  = df_kp1;
           df_kp1 = df_kp2;
           df_kn1 = df[index22];
        }

       }
    }
}

__global__
void kernel_half(int iam0, int nx, int ny, int nz, int nv, int nb,
                 half omega1_,
                 half omega2_,
                 const half* __restrict__ vxyzv_all,
                 const half* __restrict__ vzl_to_vzl2_p,
                 const half* __restrict__ f,
                       half* __restrict__ df,
                 int icolor)
{
   const int nb1  = 2*nb;

   const int nblock = 2;
   const int ncolor = 2;
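   // dfc_ accumulates the update; df_kn2 .. df_kp2 keep the df values at k-2 .. k+2 in registers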
   half vzl_to_vzl2, dfc_, tmp1, tmp2, df_kn2,df_kn1, df_k0,df_kp1,df_kp2;

   const int l = threadIdx.x; //  get the id
   const int jj = blockIdx.y*nblock;
   const int ii = ((jj/nblock + (icolor-1))%2 ) * nblock + (blockIdx.x)*nblock*ncolor;

   for(int i = ii;i<min(ii+nblock,nx);i++)
   {
     vzl_to_vzl2 = vzl_to_vzl2_p[i];
     for(int j = jj;j<min(jj+nblock,ny);j++)
     {
        df_kn2 = 0.0;
        df_kn1 = 0.0;
        const size_t index = nb + l + ((nb + 0 + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv + nb1));
        const size_t index1= nb + l + ((nb + 1 + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv + nb1));
        df_k0  = df[index];
        df_kp1 = df[index1];

        for(int k = 0;k<nz-2;k++)
        {
           const size_t index2= nb + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv + nb1));
           dfc_ = f[index2];
           const size_t index3= nb + l + ((nb + k+2 + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv + nb1));
           df_kp2 = df[index3];

           tmp1 = vzl_to_vzl2 * (df_kp2 - df_kn2);
           tmp2 = df_kn1 - df_kp1 ;
           tmp2 = tmp1 + tmp2;
           // vxyzv_all_low(nv,15,ny,nx)
           const size_t index4= l + (8 + j*15 + i*ny*15)*nv;
           dfc_ = dfc_ + vxyzv_all[index4] * tmp2;

           const size_t index5 = l + (0 + j*15 + i*ny*15)*nv;
           const size_t index6 = l + (1 + j*15 + i*ny*15)*nv;
           const size_t index7 = l + (2 + j*15 + i*ny*15)*nv;
           const size_t index8 = l + (3 + j*15 + i*ny*15)*nv;

           const size_t index9 = nb + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i-1)))*(nv + nb1));
           const size_t index10= nb + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i+1)))*(nv + nb1));
           const size_t index11= nb + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i-2)))*(nv + nb1));
           const size_t index12= nb + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i+2)))*(nv + nb1));
           dfc_ = dfc_ + vxyzv_all[index5]  * df[index9];
           dfc_ = dfc_ - vxyzv_all[index6]  * df[index10];
           dfc_ = dfc_ - vxyzv_all[index7]  * df[index11];
           dfc_ = dfc_ + vxyzv_all[index8]  * df[index12];

           const size_t index13 = l + (4 + j*15 + i*ny*15)*nv;
           const size_t index14 = l + (5 + j*15 + i*ny*15)*nv;
           const size_t index15 = l + (6 + j*15 + i*ny*15)*nv;
           const size_t index16 = l + (7 + j*15 + i*ny*15)*nv;

           const size_t index17= nb + l + ((nb + k + (nz + nb1)*(nb + j-1 + (ny + nb1)*(nb+i)))*(nv + nb1));
           const size_t index18= nb + l + ((nb + k + (nz + nb1)*(nb + j+1 + (ny + nb1)*(nb+i)))*(nv + nb1));
           const size_t index19= nb + l + ((nb + k + (nz + nb1)*(nb + j-2 + (ny + nb1)*(nb+i)))*(nv + nb1));
           const size_t index20= nb + l + ((nb + k + (nz + nb1)*(nb + j+2 + (ny + nb1)*(nb+i)))*(nv + nb1));
           dfc_ = dfc_ + vxyzv_all[index13]  * df[index17];
           dfc_ = dfc_ - vxyzv_all[index14]  * df[index18];
           dfc_ = dfc_ - vxyzv_all[index15]  * df[index19];
           dfc_ = dfc_ + vxyzv_all[index16]  * df[index20];

           const size_t index21 = l + (14 + j*15 + i*ny*15)*nv;
           dfc_ = dfc_ * vxyzv_all[index21];
           tmp1 = omega2_*df_k0;
           tmp2 = dfc_ * omega1_;
           const size_t index22= nb + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv + nb1));
           df[index22] =  tmp1 + tmp2;

           df_kn2 = df_kn1;
           df_k0  = df_kp1;
           df_kp1 = df_kp2;
           df_kn1 = df[index22];
        }

       }
    }
}


int main()
{
  const int nx  = 80;
  const int ny  = 80;
  const int nz  = 32;
  const int nv  = 96;
  const int nb  = 2;
  const int nb1 = 2*nb;
  const half omega1 = 0.05;
  const half omega2 = 0.06;
  const size_t vec_size = (nb1+nv)*(nb1+nz)*(nb1+ny)*(nb1+nx);
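  // f/df layout: (nv+2*nb) x (nz+2*nb) x (ny+2*nb) x (nx+2*nb) with the velocity index fastest;
  // "data" (vxyzv_all) layout: nv x 15 x ny x nx, no halo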
  half *f, *df, *d1, *data;
  cudaMalloc((void**)&f, vec_size*sizeof(half));
  cudaMalloc((void**)&df, vec_size*sizeof(half));
  cudaMalloc((void**)&d1, nx*sizeof(half));
  cudaMalloc((void**)&data, nv*15*ny*nx*sizeof(half));

  const int ncolor = 2;
  const int ss_iter = 1000;
  double k1 = 0.0;
  double k2 = 0.0;
  dim3 tBlock;
  dim3 grid  ;
  double t;
  int iicolor_no[ncolor*2] = { 1 , 2 , 3, 4};
  for(int ss = 0;ss<ss_iter;ss++){
         cout << "Iteration no: " << ss << "\n";
         for(int  iicolor = 0;iicolor<ncolor*2;iicolor++){
            int icolor  = iicolor_no[iicolor];

            tBlock  = dim3(nv,1,1);
            grid    = dim3(nx/4, ny/2,1);
            t = MPI_Wtime();
            kernel_half<<<grid,tBlock>>>(0,nx,ny,nz,nv,nb,
                                                omega1,omega2,
                                                data,
                                                d1,
                                                f,df, icolor);
            cudaDeviceSynchronize();
            k1 += MPI_Wtime() - t;

            // half2 version: each thread processes one half2 (two half values), so launch nv/2 threads
            tBlock = dim3(nv/2,1,1);
            grid   = dim3(nx/4, ny/2,1);
            const half2 o1 = half2(omega1,omega1);
            const half2 o2 = half2(omega2,omega2);
            t = MPI_Wtime();
            kernel_half2<<<grid,tBlock>>>(0,nx,ny,nz,nv/2,nb,(nv+2*nb)/2,
                                                      o1,o2,
                                                      (half2*) data,
                                                      d1,
                                                      (half2*) f,
                                                      (half2*) df,
                                                      icolor);
            cudaDeviceSynchronize();
            k2 += MPI_Wtime() - t;

     }

  }

  cout << "total half  kernel time: " << k1 << " s\n";
  cout << "total half2 kernel time: " << k2 << " s\n";

  cudaFree(f); cudaFree(df); cudaFree(d1); cudaFree(data);
  return 0;
}

In some of the CUDA Fortran package examples, we show how you can use a half2 vector type, FYI.

You may not see that much of a gain with half2 because the half arithmetic is not a substantial majority of the work in your kernel. There are still loops, memory accesses, address calculations, etc. going on, all of which make up part of the overall execution time.

In the simplest model, you can think of the GPU as having many 32-bit ALUs. A 16-bit half operation only uses half of that resource, and two of them packed in vector form can fill the 32-bit ALU. But that same ALU might also be used for indexing, address calculations, etc., or some time might be spent waiting on memory. You can look at the generated PTX to see what proportion of the generated instructions are half2 instructions, and you can use a low-level profiler such as Nsight Compute to get instruction counts and memory stalls.
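To see roughly how much of the factor of two is available from the math alone, it can help to time a kernel that does almost nothing but half arithmetic. The sketch below is only an illustration of that idea, not part of the benchmark above; the file and kernel names (fma_half, fma_half2) and the constants N and REPS are made up. Each thread runs a chain of dependent FMAs, once with scalar half and once with packed half2; since there is hardly any address arithmetic or memory traffic, the half2 version should come much closer to 2x, and the gap to the 12% seen above indicates how much of the real kernel is spent on indexing and loads.

// fp16_fma_bench.cu -- arithmetic-only comparison (illustrative sketch)
#include <cstdio>
#include <cuda_fp16.h>

constexpr int N    = 1 << 22;   // number of half elements
constexpr int REPS = 256;       // dependent FMAs per element

// One half FMA per instruction.
__global__ void fma_half(const half* __restrict__ x, half* __restrict__ y, half a)
{
   const int i = blockIdx.x*blockDim.x + threadIdx.x;
   if (i >= N) return;
   half v = x[i];
   for (int r = 0; r < REPS; ++r)
      v = __hfma(a, v, a);
   y[i] = v;
}

// Two half FMAs per instruction via packed half2.
__global__ void fma_half2(const half2* __restrict__ x, half2* __restrict__ y, half a)
{
   const int i = blockIdx.x*blockDim.x + threadIdx.x;
   if (i >= N/2) return;
   const half2 a2 = __half2half2(a);   // broadcast the scalar to both lanes
   half2 v = x[i];
   for (int r = 0; r < REPS; ++r)
      v = __hfma2(a2, v, a2);
   y[i] = v;
}

int main()
{
   half *x, *y;
   cudaMalloc((void**)&x, N*sizeof(half));
   cudaMalloc((void**)&y, N*sizeof(half));
   cudaMemset(x, 0, N*sizeof(half));

   const half a = __float2half(1.0001f);
   cudaEvent_t t0, t1;
   cudaEventCreate(&t0);
   cudaEventCreate(&t1);
   float ms_half, ms_half2;

   // warm-up launch so one-time start-up cost does not distort the first timing
   fma_half<<<(N + 255)/256, 256>>>(x, y, a);
   cudaDeviceSynchronize();

   cudaEventRecord(t0);
   fma_half<<<(N + 255)/256, 256>>>(x, y, a);
   cudaEventRecord(t1);
   cudaEventSynchronize(t1);
   cudaEventElapsedTime(&ms_half, t0, t1);

   cudaEventRecord(t0);
   fma_half2<<<(N/2 + 255)/256, 256>>>((half2*)x, (half2*)y, a);
   cudaEventRecord(t1);
   cudaEventSynchronize(t1);
   cudaEventElapsedTime(&ms_half2, t0, t1);

   printf("half : %.3f ms\nhalf2: %.3f ms\n", ms_half, ms_half2);
   cudaFree(x);
   cudaFree(y);
   return 0;
}

Building this with nvcc -arch=sm_70 and dumping the PTX with nvcc -arch=sm_70 -ptx is also a quick way to do the instruction check suggested above: the half2 loop should compile to fma.rn.f16x2 instructions while the scalar loop uses fma.rn.f16, and in your full kernel those are diluted by the integer index arithmetic and the loads from df and vxyzv_all.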

Thank you for the information! I have never seen an example of half2 in FORTRAN. Where exactly can I access the CUDA FORTRAN package examples?

Where exactly can I access the CUDA FORTRAN package examples?

I believe Brent is referring to the TensorCore examples he wrote that can be found in:
“$PGI/linux86-64-llvm/20[19|20]/examples/CUDA-Fortran/TensorCores/”

Thank you for the information! I will check it.