Bug in register usage, CUDA 2.1

I want to report what I believe is a bug in register usage. I am running CUDA 2.1. I applied for a registered developer account but have not heard back in a while, and the FAQ said to post here, so here goes.

I asked about this problem at the end of March but never found a resolution. Other CUDA users I talked to at the NCSA workshop agreed that what I was seeing shouldn't happen.

In the attached code there are six blocks of very similar code, each guarded by its own preprocessor define. There shouldn't be any implicit register usage between the blocks, yet as I enable each successive block, the register usage goes up:

[ctierney@wgpu3 test_kernels]$ nvcc -O3 -DBLOCK1 --ptxas-options="-v -O3" -c fcttrc_ijksw_GPU.debug.cu

ptxas info : Compiling entry function '_Z21fcttrc_ijksw_Kernel_2iiiifiiPiPfS0_S0_S0_iiifffS0_S0S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0

ptxas info : Used 26 registers, 232+224 bytes smem, 20 bytes cmem[1], 20 bytes cmem[14]

[ctierney@wgpu3 test_kernels]$ nvcc -O3 -DBLOCK1 -DBLOCK2 --ptxas-options="-v -O3" -c fcttrc_ijksw_GPU.debug.cu

ptxas info : Compiling entry function '_Z21fcttrc_ijksw_Kernel_2iiiifiiPiPfS0_S0_S0_iiifffS0_S0S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0

ptxas info : Used 38 registers, 232+224 bytes smem, 20 bytes cmem[1], 20 bytes cmem[14]

[ctierney@wgpu3 test_kernels]$ nvcc -O3 -DBLOCK1 -DBLOCK2 -DBLOCK3 --ptxas-options="-v -O3" -c fcttrc_ijksw_GPU.debug.cu

ptxas info : Compiling entry function '_Z21fcttrc_ijksw_Kernel_2iiiifiiPiPfS0_S0_S0_iiifffS0_S0S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0

ptxas info : Used 44 registers, 232+224 bytes smem, 20 bytes cmem[1], 20 bytes cmem[14]

[ctierney@wgpu3 test_kernels]$ nvcc -O3 -DBLOCK1 -DBLOCK2 -DBLOCK3 -DBLOCK4 --ptxas-options="-v -O3" -c fcttrc_ijksw_GPU.debug.cu

ptxas info : Compiling entry function '_Z21fcttrc_ijksw_Kernel_2iiiifiiPiPfS0_S0_S0_iiifffS0_S0S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0

ptxas info : Used 46 registers, 232+224 bytes smem, 20 bytes cmem[1], 20 bytes cmem[14]

[ctierney@wgpu3 test_kernels]$ nvcc -O3 -DBLOCK1 -DBLOCK2 -DBLOCK3 -DBLOCK4 -DBLOCK5 --ptxas-options="-v -O3" -c fcttrc_ijksw_GPU.debug.cu

ptxas info : Compiling entry function '_Z21fcttrc_ijksw_Kernel_2iiiifiiPiPfS0_S0_S0_iiifffS0_S0S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0

ptxas info : Used 54 registers, 232+224 bytes smem, 20 bytes cmem[1], 20 bytes cmem[14]

[ctierney@wgpu3 test_kernels]$ nvcc -O3 -DBLOCK1 -DBLOCK2 -DBLOCK3 -DBLOCK4 -DBLOCK5 -DBLOCK6 --ptxas-options="-v -O3" -c fcttrc_ijksw_GPU.debug.cu

ptxas info : Compiling entry function '_Z21fcttrc_ijksw_Kernel_2iiiifiiPiPfS0_S0_S0_iiifffS0_S0S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0_S0

ptxas info : Used 60 registers, 232+224 bytes smem, 20 bytes cmem[1], 20 bytes cmem[14]

Since each block's temporaries are dead by the end of that block, the allocator should be able to reuse the same registers in every block: if the first block alone needs only 26 registers, the whole kernel should need roughly 26, not 60.
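For reference, the pattern boils down to something like the following stripped-down sketch (a hypothetical kernel I'm writing here for illustration, not the attached code). Each #ifdef block's temporaries are dead by the end of that block, so nothing should force the allocator to give them distinct registers:

[codebox]// minimal_regs.cu -- hypothetical reduced example of the pattern
__global__ void two_blocks(const float *in, float *out)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    float acc = 0.0f;

#ifdef BLOCK1
    {
        float a = in[i];               // temporary live only inside this block
        acc += 0.5f * (a + fabsf(a));
    }
#endif

#ifdef BLOCK2
    {
        float b = in[i + 1];           // independent work; the allocator could
        acc += 0.5f * (b + fabsf(b));  // reuse the registers BLOCK1 used
    }
#endif

    out[i] = acc;
}[/codebox]

Compiling that with -DBLOCK1 and then with -DBLOCK1 -DBLOCK2 under --ptxas-options=-v is the same experiment as above, just on a trivially small kernel.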

[codebox]#include <stdio.h>

#ifndef FTOC_MACROS

#define FTOC_MACROS

/* Map Fortran-style (column-major, arbitrary lower bound) subscripts to flat C offsets */

#define FTNREF1D(i_index,i_lb) i_index-i_lb

#define FTNREF2D(i_index,j_index,i_size,i_lb,j_lb) (i_size)*(j_index-j_lb)+i_index-i_lb

#define FTNREF3D(i_index,j_index,k_index,i_size,j_size,i_lb,j_lb,k_lb) (i_size)*(j_size)*(k_index-k_lb)+(i_size)*(j_index-j_lb)+i_index-i_lb

#define FTNREF4D(i_index,j_index,k_index,l_index,i_size,j_size,k_size,i_lb,j_lb,k_lb,l_lb) (i_size)*(j_size)*(k_size)*(l_index-l_lb)+(i_size)*(j_size)*(k_index-k_lb)+(i_size)*(j_index-j_lb)+i_index-i_lb

#define FTNREF5D(i_index,j_index,k_index,l_index,m_index,i_size,j_size,k_size,l_size,i_lb,j_lb,k_lb,l_lb,m_lb) (i_size)*(j_size)*(k_size)*(l_size)*(m_index-m_lb)+(i_size)*(j_size)*(k_size)*(l_index-l_lb)+(i_size)*(j_size)*(k_index-k_lb)+(i_size)*(j_index-j_lb)+i_index-i_lb

#define MAX(a,b) ((a) >= (b) ? (a) : (b))

#define MIN(a,b) ((a) >= (b) ? (b) : (a))

#define SIGN(a,b) ((((b) < 0 && (a) > 0) || ((b) > 0 && (a) < 0)) ? (-(a)) : (a))

#define SQR(a) ((a)*(a))

#endif /* FTOC_MACROS */

#define BLOCK_SIZE 64

#define NX 32

#define NVL 50

#define NPP 6

#define NTR 4

__device__ float s_plus_ij[(NX)*(NX)*(NVL)*(NTR)];

__device__ float s_mnus_ij[(NX)*(NX)*(NVL)*(NTR)];

__device__ float r_plus_ij[(NX)*(NX)*(NVL)*(NTR)];

__device__ float r_mnus_ij[(NX)*(NX)*(NVL)*(NTR)];

__device__ float deltrc_ij[(NX)*(NX)*(NPP)*(NVL)*(NTR)];

//!********************************************************


//! fcttrc_ijksw

//! fcttrc = flux corrected transport for mass field tracers

//! J. Lee Author September, 2005

//! R. Bleck Designer November, 2005

//! A. E. MacDonald Documentor November, 2005

//! T. B. Henderson May, 2008

//! serial ijk version, Rhombus #1 interior only, isn,ivl on outside

//! TBH: NOTE: "*_ij" variable names have been retained for easier

//! TBH: comparison with _ij code.

//!

//! This routine is based on Zalesak, JOURNAL OF COMPUTATIONAL

//! PHYSICS, 31, 335-362, 1979. Dale Durran provides an

//! excellent discussion of flux corrected transport in his book

//! NUMERICAL METHODS FOR WAVE EQUATIONS IN GEOPHYSICAL FLUID DYNAMICS.

//!********************************************************


//------------------------------------------------------------------------------

__global__ void fcttrc_ijksw_Kernel_2(int its,int nvl,int npp,int nabl,float dt,int ntr,int nx,int *nprox_ij,float *rsideln_ij,float *rprox_ln_ij,float *rarea_ij,float *area_ij,int nf,int of,int vof,float ab1,float ab2,float ab3,float *ue_ij,float *ve_ij,float *trce_ij,float *trc_ij,float *trcp_ij,float *trcpl_ij,float *fs_ij,float *fsl_ij,float *flxp_ij,float *adfs_ij,float *s_plus_ij, float *s_mnus_ij,float *r_plus_ij,float *r_mnus_ij,float *deltrc_ij) {

//! Local variables:

int ivl=blockIdx.y+1;

int ns;

float sflxph;

float sflxpl;

int i = (blockIdx.x*BLOCK_SIZE + threadIdx.x) % nx + 1;

int j = (blockIdx.x*BLOCK_SIZE + threadIdx.x) / nx + 1;

float trc1;

float v1,v2;

float flxp_ij_v1, flxp_ij_v2, fsl_temp;

//!GPU$KERNEL(<64>,<nx*nx/64,nvl>)

//! TBH: ij equivalent for entire rhombus #1

if ((i < 3) || (i > nx-2) || (j < 2) || (j > nx-1)) {

// printf("blockIdx = %d %d threadIdx = %d %d i: %d j: %d\n",blockIdx.x,blockIdx.y,

// threadIdx.x,threadIdx.y,i,j);

return;

}

float rarea_temp=rarea_ij[FTNREF2D(i,j,nx,1,1)];

for (ns=1;ns<=ntr;ns++) {

// fsl_ij(:,:,:,nf,ns) = 0. ! forcing for tracer, low order; initialized

fsl_temp = 0.;

trc1=trc_ij[FTNREF4D(i,j,ivl,ns,nx,nx,nvl,1,1,1,1)];

#ifdef BLOCK1

      flxp_ij_v1=flxp_ij[FTNREF4D(i,j,1,ivl,nx,nx,npp,1,1,1,1)];

      flxp_ij_v2=flxp_ij[FTNREF4D(i - 1,j,4,ivl,nx,nx,npp,1,1,1,1)];

      v1=flxp_ij_v1+fabs(flxp_ij_v1);

      v2=flxp_ij_v2+fabs(flxp_ij_v2);

sflxph = 0.5f * ((v1) * (trce_ij[FTNREF5D(i,j,1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j,1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)])

                    - (v2) * (trce_ij[FTNREF5D(i - 1,j,4,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i - 1,j,4,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)]));

      sflxpl = 0.5f * ((v1) * trc1

                    - (v2) * trc_ij[FTNREF4D(i - 1,j,ivl,ns,nx,nx,nvl,1,1,1,1)]);

      adfs_ij[FTNREF5D(i,j,1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] = sflxph - sflxpl;

      fsl_temp += sflxpl;

#endif

#ifdef BLOCK2

      flxp_ij_v1=flxp_ij[FTNREF4D(i,j,2,ivl,nx,nx,npp,1,1,1,1)];

      flxp_ij_v2=flxp_ij[FTNREF4D(i,j-1,5,ivl,nx,nx,npp,1,1,1,1)];

      v1=flxp_ij_v1+fabs(flxp_ij_v1);

      v2=flxp_ij_v2+fabs(flxp_ij_v2);

sflxph = 0.5f * ((v1) * (trce_ij[FTNREF5D(i,j,2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j,2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)])

                    - (v2) * (trce_ij[FTNREF5D(i,j-1,5,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j-1,5,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)]));

      sflxpl = 0.5f * ((v1) * trc1

                    - (v2) * trc_ij[FTNREF4D(i,j-1,ivl,ns,nx,nx,nvl,1,1,1,1)]);

      adfs_ij[FTNREF5D(i,j,2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] = sflxph - sflxpl;

      fsl_temp += sflxpl;

#endif

#ifdef BLOCK3

      flxp_ij_v1=flxp_ij[FTNREF4D(i,j,3,ivl,nx,nx,npp,1,1,1,1)];

      flxp_ij_v2=flxp_ij[FTNREF4D(i+1,j-1,6,ivl,nx,nx,npp,1,1,1,1)];

      v1=flxp_ij_v1+fabs(flxp_ij_v1);

      v2=flxp_ij_v2+fabs(flxp_ij_v2);

sflxph = 0.5f * ((v1) * (trce_ij[FTNREF5D(i,j,3,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j,3,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)])

                    - (v2) * (trce_ij[FTNREF5D(i+1,j-1,6,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i+1,j-1,6,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)]));

      sflxpl = 0.5f * ((v1) * trc1

                    - (v2) * trc_ij[FTNREF4D(i+1,j-1,ivl,ns,nx,nx,nvl,1,1,1,1)]);

      adfs_ij[FTNREF5D(i,j,3,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] = sflxph - sflxpl;

      fsl_temp += sflxpl;

#endif

#ifdef BLOCK4

      flxp_ij_v1=flxp_ij[FTNREF4D(i,j,4,ivl,nx,nx,npp,1,1,1,1)];

      flxp_ij_v2=flxp_ij[FTNREF4D(i+1,j,1,ivl,nx,nx,npp,1,1,1,1)];

      v1=flxp_ij_v1+fabs(flxp_ij_v1);

      v2=flxp_ij_v2+fabs(flxp_ij_v2);

sflxph = 0.5f * ((v1) * (trce_ij[FTNREF5D(i,j,4,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j,4,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)])

                    - (v2) * (trce_ij[FTNREF5D(i+1,j,1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i+1,j,1,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)]));

      sflxpl = 0.5f * ((v1) * trc1

                    - (v2) * trc_ij[FTNREF4D(i+1,j,ivl,ns,nx,nx,nvl,1,1,1,1)]);

      adfs_ij[FTNREF5D(i,j,4,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] = sflxph - sflxpl;

      fsl_temp += sflxpl;

#endif

#ifdef BLOCK5

      flxp_ij_v1=flxp_ij[FTNREF4D(i,j,5,ivl,nx,nx,npp,1,1,1,1)];

      flxp_ij_v2=flxp_ij[FTNREF4D(i,j+1,2,ivl,nx,nx,npp,1,1,1,1)];

      v1=flxp_ij_v1+fabs(flxp_ij_v1);

      v2=flxp_ij_v2+fabs(flxp_ij_v2);

sflxph = 0.5f * ((v1) * (trce_ij[FTNREF5D(i,j,5,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j,5,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)])

                    - (v2) * (trce_ij[FTNREF5D(i,j+1,2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j+1,2,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)]));

      sflxpl = 0.5f * ((v1) * trc1

                    - (v2) * trc_ij[FTNREF4D(i,j+1,ivl,ns,nx,nx,nvl,1,1,1,1)]);

      adfs_ij[FTNREF5D(i,j,5,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] = sflxph - sflxpl;

      fsl_temp += sflxpl;

#endif

#ifdef BLOCK6

      flxp_ij_v1=flxp_ij[FTNREF4D(i,j,6,ivl,nx,nx,npp,1,1,1,1)];

      flxp_ij_v2=flxp_ij[FTNREF4D(i-1,j+1,3,ivl,nx,nx,npp,1,1,1,1)];

      v1=flxp_ij_v1+fabs(flxp_ij_v1);

      v2=flxp_ij_v2+fabs(flxp_ij_v2);

sflxph = 0.5f * ((v1) * (trce_ij[FTNREF5D(i,j,6,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i,j,6,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)])

                    - (v2) * (trce_ij[FTNREF5D(i-1,j+1,3,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] - deltrc_ij[FTNREF5D(i-1,j+1,3,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)]));

      sflxpl = 0.5f * ((v1) * trc1

                    - (v2) * trc_ij[FTNREF4D(i-1,j+1,ivl,ns,nx,nx,nvl,1,1,1,1)]);

      adfs_ij[FTNREF5D(i,j,6,ivl,ns,nx,nx,npp,nvl,1,1,1,1,1)] = sflxph - sflxpl;

      fsl_temp += sflxpl;

#endif

      fsl_ij[ FTNREF5D(i,j,ivl,nf,ns,nx,nx,nvl,nabl,1,1,1,1,1) ] = fsl_temp;

fsl_ij[ FTNREF5D(i,j,ivl,nf,ns,nx,nx,nvl,nabl,1,1,1,1,1) ] = -fsl_ij[FTNREF5D(i,j,ivl,nf,ns,nx,nx,nvl,nabl,1,1,1,1,1)] * rarea_temp;

      trcpl_ij[ FTNREF4D(i,j,ivl,ns,nx,nx,nvl,1,1,1,1) ] = trcp_ij[FTNREF4D(i,j,ivl,ns,nx,nx,nvl,1,1,1,1)]

           + ab1 * fsl_ij[FTNREF5D(i,j,ivl,nf,ns,nx,nx,nvl,nabl,1,1,1,1,1)]

           + ab2 * fsl_ij[FTNREF5D(i,j,ivl,of,ns,nx,nx,nvl,nabl,1,1,1,1,1)]

           + ab3 * fsl_ij[FTNREF5D(i,j,ivl,vof,ns,nx,nx,nvl,nabl,1,1,1,1,1)];

}

return;

}

[/codebox]
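As a stopgap I can cap the register count with nvcc's --maxrregcount flag, e.g.

[codebox]nvcc -O3 -DBLOCK1 -DBLOCK2 -DBLOCK3 -DBLOCK4 -DBLOCK5 -DBLOCK6 --maxrregcount=32 --ptxas-options="-v -O3" -c fcttrc_ijksw_GPU.debug.cu[/codebox]

but as far as I can tell that just trades register pressure for local-memory spills; it doesn't explain why the allocator isn't reusing registers across the blocks.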