I have a kernel that is running slower than I would like. I believe it has to do with its use of registers, which is limiting occupancy. Here is the output from the compiler:
ptxas info : Compiling entry function '_Z21fcttrc_ijksw_Kernel_2iiiifiiPiPfS0_S0_S0_iiifffS0_S0S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0'
ptxas info : Used 60 registers, 232+224 bytes smem, 20 bytes cmem[1], 20 bytes cmem[14]
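(For what it's worth: I know register use can be capped, either with nvcc's --maxrregcount option or with a __launch_bounds__ qualifier on the kernel; a minimal sketch with made-up numbers is below. But capping tends to trade registers for local-memory spills, so I would rather understand why so many registers are live in the first place.)

// Minimal sketch of the register-capping knobs. This is not my real kernel, and the
// bounds (256 threads/block, 4 blocks/SM) are made-up placeholders, not tuned values.
// Roughly the same effect comes from compiling with: nvcc --maxrregcount=N ...
__global__ void __launch_bounds__(256, 4)
capped_kernel(float *out, const float *in, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        out[idx] = in[idx] + fabsf(in[idx]);   // dummy body just so this compiles
}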
Now, maybe 58 registers is simply what this code needs, but I don't believe it; I don't think the compiler is reusing registers properly. The full code is below, but the structure is basically:
for (ns = 1; ns <= ntr; ns++) {
    // chunk of code 1
    // chunk of code 2
    // chunk of code 3
    // chunk of code 4
    // chunk of code 5
    // chunk of code 6
}
The only thing shared between the chunks is a single running sum (fsl_temp in the code below) that each chunk adds to.
I commented out parts of the code to see how the registers were being used; this is what I came up with:
Parts included    Register use
1                 22
1-2               33
1-3               40
1-4               41
1-5               49
1-6               58
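Register use goes up almost every time I add a chunk, which is what makes me think the chunk-local temporaries are being kept alive across chunk boundaries. One experiment I had in mind (a toy sketch below with hypothetical arrays a and b, not my real code) is to give each chunk its own brace scope, so that everything except the running sum is provably dead at the closing brace, and then see whether ptxas reuses the registers:

// Hypothetical toy, not my kernel: each "chunk" gets its own scope so its
// temporaries die at the closing brace; only `running` crosses chunk boundaries.
__device__ float chunk_scope_test(const float *a, const float *b, int i)
{
    float running = 0.0f;                      // the only value carried between chunks
    {                                          // stand-in for chunk 1
        float v1 = a[i] + fabsf(a[i]);
        float v2 = b[i] + fabsf(b[i]);
        running += 0.5f * (v1 - v2);
    }
    {                                          // stand-in for chunk 2
        float v1 = a[i + 1] + fabsf(a[i + 1]);
        float v2 = b[i + 1] + fabsf(b[i + 1]);
        running += 0.5f * (v1 - v2);
    }
    return running;
}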
I think the problem has something to do with the macros I am using to index multi-dimensional arrays. However, other kernels have not shown such poor register use. If I comment out the calculation of sflxph and sflxpl, then register use does not increase as I include more code chunks.
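For reference, this is what a single FTNREF4D lookup turns into after preprocessing. It is a host-side check with made-up sizes (nx = 16 and npp = 6 are placeholders, not my real dimensions), just to make the index arithmetic visible:

// Host-side check of the macro expansion; sizes and indices below are placeholders.
#include <assert.h>
#include <stdio.h>

#define FTNREF4D(i_index,j_index,k_index,l_index,i_size,j_size,k_size,i_lb,j_lb,k_lb,l_lb) \
  (i_size)*(j_size)*(k_size)*(l_index-l_lb)+(i_size)*(j_size)*(k_index-k_lb)+(i_size)*(j_index-j_lb)+i_index-i_lb

int main(void)
{
    int nx = 16, npp = 6;                 /* placeholder sizes, not the real ones */
    int i = 3, j = 5, p1 = 1, ivl = 2;

    /* The lookup used in chunk 1 ... */
    int idx_macro = FTNREF4D(i, j, p1, ivl, nx, nx, npp, 1, 1, 1, 1);

    /* ... is literally this arithmetic once the preprocessor is done: */
    int idx_by_hand = nx*nx*npp*(ivl-1) + nx*nx*(p1-1) + nx*(j-1) + i - 1;

    assert(idx_macro == idx_by_hand);
    printf("index = %d\n", idx_macro);
    return 0;
}

Every chunk does three FTNREF4D lookups like this plus five of the longer FTNREF5D ones.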
Any ideas on exploring this?
Thanks,
Craig
Code snippet:
#define FTNREF4D(i_index,j_index,k_index,l_index,i_size,j_size,k_size,i_lb,j_lb,k_lb,l_lb) \
  (i_size)*(j_size)*(k_size)*(l_index-l_lb)+(i_size)*(j_size)*(k_index-k_lb)+(i_size)*(j_index-j_lb)+i_index-i_lb

#define FTNREF5D(i_index,j_index,k_index,l_index,m_index,i_size,j_size,k_size,l_size,i_lb,j_lb,k_lb,l_lb,m_lb) \
  (i_size)*(j_size)*(k_size)*(l_size)*(m_index-m_lb)+(i_size)*(j_size)*(k_size)*(l_index-l_lb)+(i_size)*(j_size)*(k_index-k_lb)+(i_size)*(j_index-j_lb)+i_index-i_lb
// Code Chunk 1: edge p1=1, neighbour cell (i-1,j)
p1=1;
p2=(p1+3-1)%6+1;
flxp_ij_v1=flxp_ij[FTNREF4D(i,j,p1,ivl,nx,nx,npp,1,1,1,1)];
flxp_ij_v2=flxp_ij[FTNREF4D(i - 1,j,p2,ivl,nx,nx,npp,1,1,1,1)];
v1=flxp_ij_v1+fabs(flxp_ij_v1);
v2=flxp_ij_v2+fabs(flxp_ij_v2);
sflxph = 0.5 * ((v1) * (trce_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)])
- (v2) * (trce_ij[FTNREF5D(i - 1,j,p2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i - 1,j,p2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)]));
sflxpl = 0.5 * ((v1) * trc1
- (v2) * trc_ij[FTNREF4D(i - 1,j,ivl,ns,nx,nx,nvl,1,1,1,1)]);
adfs_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] = sflxph - sflxpl;
fsl_temp += sflxpl;
// Code Chunk 2: edge p1=2, neighbour cell (i,j-1)
p1=2;
p2=(p1+3-1)%6+1;
flxp_ij_v1=flxp_ij[FTNREF4D(i,j,p1,ivl,nx,nx,npp,1,1,1,1)];
flxp_ij_v2=flxp_ij[FTNREF4D(i,j-1,p2,ivl,nx,nx,npp,1,1,1,1)];
v1=flxp_ij_v1+fabs(flxp_ij_v1);
v2=flxp_ij_v2+fabs(flxp_ij_v2);
sflxph = 0.5 * ((v1) * (trce_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)])
- (v2) * (trce_ij[FTNREF5D(i,j-1,p2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j-1,p2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)]));
sflxpl = 0.5 * ((v1) * trc1
- (v2) * trc_ij[FTNREF4D(i,j-1,ivl,ns,nx,nx,nvl,1,1,1,1)]);
adfs_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] = sflxph - sflxpl;
fsl_temp += sflxpl;
// Code Chunk 3: edge p1=3, neighbour cell (i+1,j-1)
p1=3;
p2=(p1+3-1)%6+1;
flxp_ij_v1=flxp_ij[FTNREF4D(i,j,p1,ivl,nx,nx,npp,1,1,1,1)];
flxp_ij_v2=flxp_ij[FTNREF4D(i+1,j-1,p2,ivl,nx,nx,npp,1,1,1,1)];
v1=flxp_ij_v1+fabs(flxp_ij_v1);
v2=flxp_ij_v2+fabs(flxp_ij_v2);
sflxph = 0.5 * ((v1) * (trce_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)])
- (v2) * (trce_ij[FTNREF5D(i+1,j-1,p2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i+1,j-1,p2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)]));
sflxpl = 0.5 * ((v1) * trc1
- (v2) * trc_ij[FTNREF4D(i+1,j-1,ivl,ns,nx,nx,nvl,1,1,1,1)]);
adfs_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] = sflxph - sflxpl;
fsl_temp += sflxpl;
// Code Chunk 4: edge p1=4, neighbour cell (i+1,j)
p1=4;
p2=(p1+3-1)%6+1;
flxp_ij_v1=flxp_ij[FTNREF4D(i,j,p1,ivl,nx,nx,npp,1,1,1,1)];
flxp_ij_v2=flxp_ij[FTNREF4D(i+1,j,p2,ivl,nx,nx,npp,1,1,1,1)];
v1=flxp_ij_v1+fabs(flxp_ij_v1);
v2=flxp_ij_v2+fabs(flxp_ij_v2);
sflxph = 0.5 * ((v1) * (trce_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)])
- (v2) * (trce_ij[FTNREF5D(i+1,j,p2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i+1,j,p2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)]));
sflxpl = 0.5 * ((v1) * trc1
- (v2) * trc_ij[FTNREF4D(i+1,j,ivl,ns,nx,nx,nvl,1,1,1,1)]);
adfs_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] = sflxph - sflxpl;
fsl_temp += sflxpl;
// Code Chunk 5: edge p1=5, neighbour cell (i,j+1)
p1=5;
p2=(p1+3-1)%6+1;
flxp_ij_v1=flxp_ij[FTNREF4D(i,j,p1,ivl,nx,nx,npp,1,1,1,1)];
flxp_ij_v2=flxp_ij[FTNREF4D(i,j+1,p2,ivl,nx,nx,npp,1,1,1,1)];
v1=flxp_ij_v1+fabs(flxp_ij_v1);
v2=flxp_ij_v2+fabs(flxp_ij_v2);
sflxph = 0.5 * ((v1) * (trce_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)])
- (v2) * (trce_ij[FTNREF5D(i,j+1,p2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j+1,p2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)]));
sflxpl = 0.5 * ((v1) * trc1
- (v2) * trc_ij[FTNREF4D(i,j+1,ivl,ns,nx,nx,nvl,1,1,1,1)]);
adfs_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] = sflxph - sflxpl;
fsl_temp += sflxpl;
// Code Chunk 6: edge p1=6, neighbour cell (i-1,j+1)
p1=6;
p2=(p1+3-1)%6+1;
flxp_ij_v1=flxp_ij[FTNREF4D(i,j,p1,ivl,nx,nx,npp,1,1,1,1)];
flxp_ij_v2=flxp_ij[FTNREF4D(i-1,j+1,p2,ivl,nx,nx,npp,1,1,1,1)];
v1=flxp_ij_v1+fabs(flxp_ij_v1);
v2=flxp_ij_v2+fabs(flxp_ij_v2);
sflxph = 0.5 * ((v1) * (trce_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)])
- (v2) * (trce_ij[FTNREF5D(i-1,j+1,p2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i-1,j+1,p2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)]));
sflxpl = 0.5 * ((v1) * trc1
- (v2) * trc_ij[FTNREF4D(i-1,j+1,ivl,ns,nx,nx,nvl,1,1,1,1)]);
adfs_ij[FTNREF5D(i,j,p1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] = sflxph - sflxpl;
fsl_temp += sflxpl;
fsl_ij[FTNREF5D(i,j,ivl,nf,ns,nx,nx,nvl,nabl,1,1,1,1,1)] = fsl_temp;
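For completeness: the six chunks differ only in p1 and in the neighbour (i,j) offsets, so one restructuring I have been wondering about is folding them back into a loop over the edges, with the offsets in a small constant table. The sketch below uses placeholder flat indexing (stride/plane) instead of FTNREF4D/FTNREF5D and only reproduces the v1/v2 part, so it is the shape of the idea rather than a drop-in replacement:

// Sketch only: placeholder indexing, not the real FTNREF macros or the full tracer math.
// The (di,dj) pairs are the neighbour offsets taken from chunks 1-6 above.
__constant__ int edge_di[6] = { -1,  0, +1, +1,  0, -1 };
__constant__ int edge_dj[6] = {  0, -1, -1,  0, +1, +1 };

__device__ float sum_edges_sketch(const float *flxp, int i, int j,
                                  int stride, int plane)
{
    float fsl_temp = 0.0f;                    // the only value carried across edges
    for (int p1 = 1; p1 <= 6; ++p1) {
        int p2 = (p1 + 3 - 1) % 6 + 1;        // opposite edge, as in each chunk
        int ni = i + edge_di[p1 - 1];         // neighbour cell for this edge
        int nj = j + edge_dj[p1 - 1];
        float f1 = flxp[(p1 - 1) * plane + j  * stride + i ];
        float f2 = flxp[(p2 - 1) * plane + nj * stride + ni];
        fsl_temp += 0.5f * ((f1 + fabsf(f1)) - (f2 + fabsf(f2)));
    }
    return fsl_temp;
}

If the compiler only keeps one edge's temporaries live inside such a loop, that would also help confirm whether the register growth really comes from the unrolled chunks.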