You would want to look at the generateg machine code to find answers to you questions. Your code
#include <stdlib.h>
#include <stdio.h>
__global__ void runWeightingFrameDelta(
float* pDevInitFrameThenDelta,
int nFramePixelNumU,
int iWeightingFirstU,
int iWeightingLastU,
int iActiveBorderV1,
int iActiveBorderV2)
{
int iPixel1U = blockIdx.x * blockDim.x + threadIdx.x;
int iPixel1V = blockIdx.y * blockDim.y + threadIdx.y;
if (iPixel1V > iActiveBorderV2 || iPixel1V < iActiveBorderV1 || ( iPixel1U - iWeightingLastU ) * (iPixel1U - iWeightingFirstU) > 0) {
return;
}
float theta = 1.0f *(iPixel1U - iWeightingFirstU) / (iWeightingLastU - iWeightingFirstU);
float weight = 0.5f + 0.5f * tanhf(6.0f * (theta - 0.5f));
int iPixelIndOfDevFrame = iPixel1V * nFramePixelNumU + iPixel1U;
pDevInitFrameThenDelta[iPixelIndOfDevFrame] *= weight;
}
int main (void)
{
runWeightingFrameDelta<<<1,1>>>(0,0,0,0,0,0);
}
when compiled with nvcc -arch=sm_35 -use_fast_math -Xptxas -v -o testkernel.exe testkernel.cu
, produces this ptxas
output
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z22runWeightingFrameDeltaPfiiiii' for 'sm_35'
ptxas info : Function properties for _Z22runWeightingFrameDeltaPfiiiii
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 6 registers, 348 bytes cmem[0], 32 bytes cmem[2]
and when disassembled with cuobjdump --dump-sass
shows the following machine code:
code for sm_35
Function : _Z22runWeightingFrameDeltaPfiiiii
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/*0008*/ MOV R1, c[0x0][0x44];
/*0010*/ S2R R0, SR_CTAID.Y;
/*0018*/ S2R R4, SR_TID.Y;
/*0020*/ S2R R3, SR_CTAID.X;
/*0028*/ S2R R2, SR_TID.X;
/*0030*/ IMAD R0, R0, c[0x0][0x2c], R4;
/*0038*/ IMAD R3, R3, c[0x0][0x28], R2;
/*0048*/ ISETP.LT.AND P0, PT, R0, c[0x0][0x154], PT;
/*0050*/ ISUB R2, R3, c[0x0][0x150];
/*0058*/ ISUB R4, R3, c[0x0][0x14c];
/*0060*/ ISETP.GT.OR P0, PT, R0, c[0x0][0x158], P0;
/*0068*/ IMUL R2, R2, R4;
/*0070*/ ISETP.GT.OR P0, PT, R2, RZ, P0;
/*0078*/ @P0 EXIT;
/*0088*/ MOV R2, c[0x0][0x150];
/*0090*/ I2F.F32.S32 R4, R4;
/*0098*/ ISUB R2, R2, c[0x0][0x14c];
/*00a0*/ SSY 0x1e0;
/*00a8*/ I2F.F32.S32 R2, R2;
/*00b0*/ MUFU.RCP R2, R2;
/*00b8*/ FFMA.FTZ R4, R4, R2, c[0x2][0x0];
/*00c8*/ FMUL.FTZ R4, R4, 6;
/*00d0*/ FSETP.LTU.FTZ.AND P0, PT, |R4|, c[0x2][0x4], PT;
/*00d8*/ @P0 BRA 0x180;
/*00e0*/ FADD.FTZ R2, |R4|, |R4|;
/*00e8*/ FSETP.LTU.FTZ.AND P0, PT, |R4|, 88, PT;
/*00f0*/ FMUL32I.FTZ R5, R2, 1.4426950216293334961;
/*00f8*/ LOP32I.AND R4, R4, 0x80000000;
/*0108*/ F2F.FTZ.F32.F32.TRUNC R5, R5;
/*0110*/ FFMA.FTZ R2, R5, c[0x2][0x8], R2;
/*0118*/ FFMA.FTZ R2, R5, c[0x2][0xc], R2;
/*0120*/ FMUL32I.FTZ R2, R2, 1.4426950216293334961;
/*0128*/ RRO.EX2 R5, R5;
/*0130*/ RRO.EX2 R2, R2;
/*0138*/ MUFU.EX2 R5, R5;
/*0148*/ MUFU.EX2 R2, R2;
/*0150*/ FFMA.FTZ R2, R2, R5, c[0x2][0x10];
/*0158*/ MOV32I R5, 0x3f800000;
/*0160*/ MUFU.RCP R2, R2;
/*0168*/ FFMA.FTZ R2, R2, -2, R5;
/*0170*/ SEL R2, R2, c[0x2][0x10], P0;
/*0178*/ LOP.OR.S R5, R2, R4;
/*0188*/ FMUL.FTZ R2, R4, R4;
/*0190*/ FSETP.EQ.AND P0, PT, R4, RZ, PT;
/*0198*/ MOV32I R5, 0x3d57be66;
/*01a0*/ FFMA.FTZ R5, R2, c[0x2][0x14], -R5;
/*01a8*/ FFMA.FTZ R5, R2, R5, c[0x2][0x18];
/*01b0*/ FFMA.FTZ R5, R2, R5, c[0x2][0x1c];
/*01b8*/ FMUL.FTZ R2, R2, R5;
/*01c8*/ FFMA.FTZ R5, R4, R2, R4;
/*01d0*/ @P0 FADD.FTZ R5, R4, R4;
/*01d8*/ NOP.S;
/*01e0*/ IMAD R0, R0, c[0x0][0x148], R3;
/*01e8*/ MOV32I R3, 0x4;
/*01f0*/ ISCADD R2.CC, R0, c[0x0][0x140], 0x2;
/*01f8*/ MOV32I R4, 0x3f000000;
/*0208*/ IMAD.HI.X R3, R0, R3, c[0x0][0x144];
/*0210*/ FFMA.FTZ R5, R5, 0.5, R4;
/*0218*/ LD.E R0, [R2];
/*0220*/ FMUL.FTZ R5, R5, R0;
/*0228*/ ST.E [R2], R5;
/*0230*/ EXIT;
/*0238*/ BRA 0x238;
Constant bank 0 (c[0]
) appears to be used for passing kernel arguments, while constant bank 2 (c[2]
) appears to be used for literal constants from the source code, in particular floating-point constants for the expf()
computation inside tanhf()
. You can also see that not all floating-point literals are stored in c[2]
, e.g. 0.5f
is loaded as an immediate in MOV32I R4, 0x3f000000
, while some floating-point constants can be encoded directly into the instruction as in the case of FFMA.FTZ R2, R2, -2, R5
.
Note that GPU architectures do not have binary compatibility, so they have different instruction set architectures and are being targeted by machine-specific backends inside ptxas
. When different instruction sets and different compiler components are being used it is not surprising that there will be differences in register and constant memory bank usage.
For what it’s worth, I am unable to reproduce your findings when building for sm_60
. When using CUDA 11.1, compiling with nvcc -arch=sm_35 -use_fast_math -Xptxas -v -o testkernel.exe testkernel.cu
, produces this ptxas
output:
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function ‘_Z22runWeightingFrameDeltaPfiiiii’ for ‘sm_60’
ptxas info : Function properties for _Z22runWeightingFrameDeltaPfiiiii
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 7 registers, 348 bytes cmem[0], 32 bytes cmem[2]
A quick look at the disassembly seems to indicate that the slight increase in register usage is due to the fact that 32-bit integer multiplies are emulated via the XMAD
instruction.
code for sm_60
Function : _Z22runWeightingFrameDeltaPfiiiii
.headerflags @"EF_CUDA_SM60 EF_CUDA_PTX_SM(EF_CUDA_SM60)"
/*0008*/ MOV R1, c[0x0][0x20] ;
/*0010*/ S2R R3, SR_CTAID.Y ;
/*0018*/ S2R R4, SR_TID.Y ;
/*0028*/ S2R R2, SR_CTAID.X ;
/*0030*/ S2R R0, SR_TID.X ;
/*0038*/ XMAD.MRG R6, R3.reuse, c[0x0] [0xc].H1, RZ ;
/*0048*/ XMAD R5, R3, c[0x0] [0xc], R4 ;
/*0050*/ XMAD R4, R2, c[0x0] [0x8], R0 ;
/*0058*/ XMAD.PSL.CBCC R0, R3.H1, R6.H1, R5 ;
/*0068*/ XMAD.MRG R3, R2.reuse, c[0x0] [0x8].H1, RZ ;
/*0070*/ ISETP.GE.AND P0, PT, R0.reuse, c[0x0][0x154], PT ;
/*0078*/ XMAD.PSL.CBCC R2, R2.H1, R3.H1, R4 ;
/*0088*/ IADD R3, R2.reuse, -c[0x0][0x150] ;
/*0090*/ IADD R4, R2, -c[0x0][0x14c] ;
/*0098*/ ISETP.GT.OR P0, PT, R0, c[0x0][0x158], !P0 ;
/*00a8*/ XMAD R5, R3.reuse, R4.reuse, RZ ;
/*00b0*/ XMAD.MRG R6, R3.reuse, R4.H1, RZ ;
/*00b8*/ XMAD.PSL.CBCC R3, R3.H1, R6.H1, R5 ;
/*00c8*/ ISETP.GT.OR P0, PT, R3, RZ, P0 ;
/*00d0*/ @P0 EXIT ;
/*00d8*/ { MOV R3, c[0x0][0x150] ;
/*00e8*/ I2F.F32.S32 R4, R4 }
/*00f0*/ { IADD R3, R3, -c[0x0][0x14c] ;
/*00f8*/ SSY 0x218 }
/*0108*/ I2F.F32.S32 R3, R3 ;
/*0110*/ MUFU.RCP R5, R3 ;
/*0118*/ FFMA.FTZ R5, R4, R5, c[0x2][0x0] ;
/*0128*/ FMUL.FTZ R5, R5, 6 ;
/*0130*/ FSETP.GE.FTZ.AND P0, PT, |R5|, c[0x2][0x4], PT ;
/*0138*/ @!P0 BRA 0x1c0 ;
/*0148*/ FADD R3, |R5|, -RZ ;
/*0150*/ FMUL32I.FTZ R3, R3, 2.8853900432586669922 ;
/*0158*/ RRO.EX2 R6, R3 ;
/*0168*/ MUFU.EX2 R3, R6 ;
/*0170*/ FSETP.GE.FTZ.AND P0, PT, |R5|, c[0x2][0x8], PT ;
/*0178*/ MOV32I R4, 0x3f800000 ;
/*0188*/ FADD.FTZ R3, R3, 1 ;
/*0190*/ MUFU.RCP R3, R3 ;
/*0198*/ FFMA.FTZ R4, R3, -2, R4 ;
/*01a8*/ SEL R4, R4, c[0x2][0xc], !P0 ;
/*01b0*/ { LOP3.LUT R5, R4, c[0x2][0x1c], R5, 0xf8 ;
/*01b8*/ SYNC }
/*01c8*/ FMUL.FTZ R3, R5, R5 ;
/*01d0*/ MOV32I R4, 0x3d563cae ;
/*01d8*/ FFMA.FTZ R4, R3.reuse, c[0x2][0x10], -R4 ;
/*01e8*/ FFMA.FTZ R4, R3.reuse, R4, c[0x2][0x14] ;
/*01f0*/ FFMA.FTZ R4, R3.reuse, R4, c[0x2][0x18] ;
/*01f8*/ FFMA.FTZ R4, R3, R4, RZ ;
/*0208*/ { FFMA.FTZ R5, R5, R4, R5 ;
/*0210*/ SYNC }
/*0218*/ XMAD R2, R0.reuse, c[0x0] [0x148], R2 ;
/*0228*/ XMAD.MRG R3, R0.reuse, c[0x0] [0x148].H1, RZ ;
/*0230*/ XMAD.PSL.CBCC R0, R0.H1, R3.H1, R2 ;
/*0238*/ SHR R3, R0.reuse, 0x1e ;
/*0248*/ ISCADD R2.CC, R0, c[0x0][0x140], 0x2 ;
/*0250*/ IADD.X R3, R3, c[0x0][0x144] ;
/*0258*/ LDG.E R0, [R2] ;
/*0268*/ MOV32I R4, 0x3f000000 ;
/*0270*/ FFMA.FTZ R5, R5, 0.5, R4 ;
/*0278*/ FMUL.FTZ R0, R5, R0 ;
/*0288*/ STG.E [R2], R0 ;
/*0290*/ EXIT ;
/*0298*/ BRA 0x298 ;
/*02a8*/ NOP;
/*02b0*/ NOP;
/*02b8*/ NOP;