Hi,
In one my programs I’m using mixed precision calculation. The program works fine, but I wanted to further improve the performance. I changed from half to the half2 vector datatype.
The main program is written in FORTRAN but half2 is only available in the C/C++ part of CUDA.
I’ developed the kernel in C++ and integrated it into the FORTRAN program, the program works correctly, however there is almost no performance improvement when going from half to half2. I wonder if the developed kernel is correct?
Below is a small test benchmark program containing both - the half and half2 kernel – the performance improvement I observe on our DGX2 cluster is around 12%. I replaced all operations inside the half2 kernel with intrinsic functions. Is the usage of these functions correct? Is there anything I can do to further improve the performance?
Thank you for your help
makefile
ARCH = -ta=tesla:cc70
CC = mpic++
CCU = nvcc -ccbin=mpic++
RM = /bin/rm
PROG = run
OBJS = main.o
#-mp
OPTS = ${ARCH} -acc -Minfo=accel -Minfo -Mcuda=ptxinfo
%.o : %.c
${CC} ${OPTS} -c ${CFLAGS} $<
%.o : %.cu
${CCU} -arch=sm_70 --ptxas-options=-v -Xcompiler "${OPTS}" -c ${CUFLAGS} $<
all : ${PROG}
${PROG} : ${OBJS}
mpic++ ${OPTS} -Mcuda -o $@ ${OBJS} ${LDFLAGS} ${CFPMODEL} ${libs}
clean :
${RM} -f ${PROG} *.o *~
main.cu
#include <iostream>
#include <cuda_fp16.h>
#include <mpi.h>
using namespace std;
__global__
void kernel_half2(int iam0, int nx, int ny, int nz, int nv,
int nb, int nv2,
half2 omega1_,
half2 omega2_,
const half2* __restrict__ vxyzv_all,
const half* __restrict__ vzl_to_vzl2_p,
const half2* __restrict__ f,
half2* __restrict__ df,
int icolor)
{
const int nb1 = 2*nb;
const int nblock = 2;
const int ncolor = 2;
half2 dfc_, tmp1, tmp2, df_kn2,df_kn1, df_k0,df_kp1,df_kp2;
const int l = threadIdx.x; // get the id
const int jj = blockIdx.y*nblock;
const int ii = ((jj/nblock + (icolor-1))%2 ) * nblock + (blockIdx.x)*nblock*ncolor;
for(int i = ii;i<min(ii+nblock,nx);i++)
{
const half2 vzl_to_vzl2 = __half2(vzl_to_vzl2_p[i],vzl_to_vzl2_p[i]);
for(int j = jj;j<min(jj+nblock,ny);j++)
{
df_kn2 = half2(0.0,0.0);
df_kn1 = half2(0.0,0.0);
const size_t index = 1 + l + ((nb + 0 + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv2));
const size_t index1= 1 + l + ((nb + 1 + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv2));
df_k0 = df[index];
df_kp1 = df[index1];
for(int k = 0;k<nz-2;k++)
{
const size_t index2= 1 + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv2));
dfc_ = f[index2];
const size_t index3= 1 + l + ((nb + k+2 + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv2));
df_kp2 = df[index3];
tmp1 = __hmul2(vzl_to_vzl2 ,__hsub2(df_kp2 , df_kn2));
tmp2 = __hsub2(df_kn1,df_kp1);
tmp2 = __hadd2(tmp1 , tmp2);
const size_t index4= l + (8 + j*15 + i*ny*15)*nv;
dfc_ = __hfma2(vxyzv_all[index4], tmp2, dfc_);
const size_t index5 = l + (0 + j*15 + i*ny*15)*nv;
const size_t index6 = l + (1 + j*15 + i*ny*15)*nv;
const size_t index7 = l + (2 + j*15 + i*ny*15)*nv;
const size_t index8 = l + (3 + j*15 + i*ny*15)*nv;
const size_t index9 = 1 + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i-1)))*(nv2));
const size_t index10= 1 + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i+1)))*(nv2));
const size_t index11= 1 + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i-2)))*(nv2));
const size_t index12= 1 + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i+2)))*(nv2));
dfc_ = __hfma2(vxyzv_all[index5], df[index9], dfc_);
dfc_ = __hsub2(dfc_ , __hmul2(vxyzv_all[index6] , df[index10]));
dfc_ = __hsub2(dfc_ , __hmul2(vxyzv_all[index7] , df[index11]));
dfc_ = __hfma2(vxyzv_all[index8], df[index12], dfc_);
const size_t index13 = l + (4 + j*15 + i*ny*15)*nv;
const size_t index14 = l + (5 + j*15 + i*ny*15)*nv;
const size_t index15 = l + (6 + j*15 + i*ny*15)*nv;
const size_t index16 = l + (7 + j*15 + i*ny*15)*nv;
const size_t index17= 1 + l + ((nb + k + (nz + nb1)*(nb + j-1 + (ny + nb1)*(nb+i)))*(nv2));
const size_t index18= 1 + l + ((nb + k + (nz + nb1)*(nb + j+1 + (ny + nb1)*(nb+i)))*(nv2));
const size_t index19= 1 + l + ((nb + k + (nz + nb1)*(nb + j-2 + (ny + nb1)*(nb+i)))*(nv2));
const size_t index20= 1 + l + ((nb + k + (nz + nb1)*(nb + j+2 + (ny + nb1)*(nb+i)))*(nv2));
dfc_ = __hfma2(vxyzv_all[index13] , df[index17], dfc_);
dfc_ = __hsub2(dfc_ , __hmul2(vxyzv_all[index14] , df[index18]));
dfc_ = __hsub2(dfc_ , __hmul2(vxyzv_all[index15] , df[index19]));
dfc_ = __hfma2(vxyzv_all[index16] , df[index20], dfc_);
const size_t index21 = l + (14 + j*15 + i*ny*15)*nv;
dfc_ = __hmul2(dfc_ ,vxyzv_all[index21]);
tmp1 = __hmul2(omega2_, df_k0);
tmp2 = __hmul2(dfc_ , omega1_);
const size_t index22= 1 + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv2));
df[index22] = __hadd2(tmp1 , tmp2);
df_kn2 = df_kn1;
df_k0 = df_kp1;
df_kp1 = df_kp2;
df_kn1 = df[index22];
}
}
}
}
__global__
void kernel_half(int iam0, int nx, int ny, int nz, int nv, int nb,
half omega1_,
half omega2_,
const half* __restrict__ vxyzv_all,
const half* __restrict__ vzl_to_vzl2_p,
const half* __restrict__ f,
half* __restrict__ df,
int icolor)
{
const int nb1 = 2*nb;
const int nblock = 2;
const int ncolor = 2;
half vzl_to_vzl2, dfc_, tmp1, tmp2, df_kn2,df_kn1, df_k0,df_kp1,df_kp2;
const int l = threadIdx.x; // get the id
const int jj = blockIdx.y*nblock;
const int ii = ((jj/nblock + (icolor-1))%2 ) * nblock + (blockIdx.x)*nblock*ncolor;
for(int i = ii;i<min(ii+nblock,nx);i++)
{
vzl_to_vzl2 = vzl_to_vzl2_p[i];
for(int j = jj;j<min(jj+nblock,ny);j++)
{
df_kn2 = 0.0;
df_kn1 = 0.0;
const size_t index = nb + l + ((nb + 0 + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv + nb1));
const size_t index1= nb + l + ((nb + 1 + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv + nb1));
df_k0 = df[index];
df_kp1 = df[index1];
for(int k = 0;k<nz-2;k++)
{
const size_t index2= nb + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv + nb1));
dfc_ = f[index2];
const size_t index3= nb + l + ((nb + k+2 + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv + nb1));
df_kp2 = df[index3];
tmp1 = vzl_to_vzl2 * (df_kp2 - df_kn2);
tmp2 = df_kn1 - df_kp1 ;
tmp2 = tmp1 + tmp2;
// vxyzv_all_low(nv,15,ny,nx)
const size_t index4= l + (8 + j*15 + i*ny*15)*nv;
dfc_ = dfc_ + vxyzv_all[index4] * tmp2;
const size_t index5 = l + (0 + j*15 + i*ny*15)*nv;
const size_t index6 = l + (1 + j*15 + i*ny*15)*nv;
const size_t index7 = l + (2 + j*15 + i*ny*15)*nv;
const size_t index8 = l + (3 + j*15 + i*ny*15)*nv;
const size_t index9 = nb + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i-1)))*(nv + nb1));
const size_t index10= nb + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i+1)))*(nv + nb1));
const size_t index11= nb + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i-2)))*(nv + nb1));
const size_t index12= nb + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i+2)))*(nv + nb1));
dfc_ = dfc_ + vxyzv_all[index5] * df[index9];
dfc_ = dfc_ - vxyzv_all[index6] * df[index10];
dfc_ = dfc_ - vxyzv_all[index7] * df[index11];
dfc_ = dfc_ + vxyzv_all[index8] * df[index12];
const size_t index13 = l + (4 + j*15 + i*ny*15)*nv;
const size_t index14 = l + (5 + j*15 + i*ny*15)*nv;
const size_t index15 = l + (6 + j*15 + i*ny*15)*nv;
const size_t index16 = l + (7 + j*15 + i*ny*15)*nv;
const size_t index17= nb + l + ((nb + k + (nz + nb1)*(nb + j-1 + (ny + nb1)*(nb+i)))*(nv + nb1));
const size_t index18= nb + l + ((nb + k + (nz + nb1)*(nb + j+1 + (ny + nb1)*(nb+i)))*(nv + nb1));
const size_t index19= nb + l + ((nb + k + (nz + nb1)*(nb + j-2 + (ny + nb1)*(nb+i)))*(nv + nb1));
const size_t index20= nb + l + ((nb + k + (nz + nb1)*(nb + j+2 + (ny + nb1)*(nb+i)))*(nv + nb1));
dfc_ = dfc_ + vxyzv_all[index13] * df[index17];
dfc_ = dfc_ - vxyzv_all[index14] * df[index18];
dfc_ = dfc_ - vxyzv_all[index15] * df[index19];
dfc_ = dfc_ + vxyzv_all[index16] * df[index20];
const size_t index21 = l + (14 + j*15 + i*ny*15)*nv;
dfc_ = dfc_ * vxyzv_all[index21];
tmp1 = omega2_*df_k0;
tmp2 = dfc_ * omega1_;
const size_t index22= nb + l + ((nb + k + (nz + nb1)*(nb + j + (ny + nb1)*(nb+i)))*(nv + nb1));
df[index22] = tmp1 + tmp2;
df_kn2 = df_kn1;
df_k0 = df_kp1;
df_kp1 = df_kp2;
df_kn1 = df[index22];
}
}
}
}
int main()
{
const int nx = 80;
const int ny = 80;
const int nz = 32;
const int nv = 96;
const int nb = 2;
const int nb1 = 2*nb;
const half omega1 = 0.05;
const half omega2 = 0.06;
const size_t vec_size = (nb1+nv)*(nb1+nz)*(nb1+ny)*(nb1+nx);
half *f, *df, *d1, *data;
cudaMalloc((void**)&f, vec_size*sizeof(half));
cudaMalloc((void**)&df, vec_size*sizeof(half));
cudaMalloc((void**)&d1, nx*sizeof(half));
cudaMalloc((void**)&data, nv*15*ny*nx*sizeof(half));
const int ncolor = 2;
const int ss_iter = 1000;
double k1 = 0.0;
double k2 = 0.0;
dim3 tBlock;
dim3 grid ;
double t;
int iicolor_no[ncolor*2] = { 1 , 2 , 3, 4};
for(int ss = 0;ss<ss_iter;ss++){
cout << "Iteration no: " << ss << "\n";
for(int iicolor = 0;iicolor<ncolor*2;iicolor++){
int icolor = iicolor_no[iicolor];
tBlock = dim3(nv,1,1);
grid = dim3(nx/4, ny/2,1);
t = MPI_Wtime();
kernel_half<<<grid,tBlock>>>(0,nx,ny,nz,nv,nb,
omega1,omega2,
data,
d1,
f,df, icolor);
cudaDeviceSynchronize();
k1 += MPI_Wtime() - t;
tBlock = dim3(nv/2,1,1);
grid = dim3(nx/4, ny/2,1);
const half2 o1 = half2(omega1,omega1);
const half2 o2 = half2(omega2,omega2);
t = MPI_Wtime();
kernel_half2<<<grid,tBlock>>>(0,nx,ny,nz,nv/2,nb,(nv+2*nb)/2,
o1,o2,
(half2*) data,
d1,
(half2*) f,
(half2*) df,
icolor);
cudaDeviceSynchronize();
k2 += MPI_Wtime() - t;
}
}