Hi
For a simple kernel that increments every element of a vector (my test program is called `vec_add`),
// Adds 1 to a[threadIdx.x] for every thread whose in-block index is below N.
// The index is taken from threadIdx.x only (there is no blockIdx term), so in
// a multi-block launch every block addresses the same elements of a.
__global__ void increment( int *a, int N )
{
    const int tid = threadIdx.x;

    // Threads at or beyond the bound have nothing to do.
    if ( tid >= N )
        return;

    a[ tid ] += 1;
}
...
// First command-line argument: used below as the grid size (number of blocks).
int M = atoi( argv[1] );
// Second command-line argument: used below as the block size (threads per block)
// and also passed to the kernel as its bound.
int N = atoi( argv[2] );
...
// One-dimensional launch: M blocks of N threads each.
dim3 grid_size( M ); dim3 block_size( N );
// N is both the block size and the kernel's bound argument, so every launched
// thread satisfies threadIdx.x < N and takes the increment path.
increment<<< grid_size, block_size >>>( d_a, N );
...
I see the following SASS (disassembled machine code) for the kernel:
code for sm_86
Function : _Z9incrementPii
.headerflags @"EF_CUDA_SM86 EF_CUDA_PTX_SM(EF_CUDA_SM86)"
/*0000*/ IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ; /* 0x00000a00ff017624 */
/* 0x000fc400078e00ff */
/*0010*/ S2R R2, SR_TID.X ; /* 0x0000000000027919 */
/* 0x000e240000002100 */
/*0020*/ ISETP.GE.AND P0, PT, R2, c[0x0][0x168], PT ; /* 0x00005a0002007a0c */
/* 0x001fda0003f06270 */
/*0030*/ @P0 EXIT ; /* 0x000000000000094d */
/* 0x000fea0003800000 */
/*0040*/ MOV R3, 0x4 ; /* 0x0000000400037802 */
/* 0x000fe20000000f00 */
/*0050*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fc80000000a00 */
/*0060*/ IMAD.WIDE R2, R2, R3, c[0x0][0x160] ; /* 0x0000580002027625 */
/* 0x000fca00078e0203 */
/*0070*/ LDG.E R0, [R2.64] ; /* 0x0000000402007981 */
/* 0x000ea4000c1e1900 */
/*0080*/ IADD3 R5, R0, 0x1, RZ ; /* 0x0000000100057810 */
/* 0x004fca0007ffe0ff */
/*0090*/ STG.E [R2.64], R5 ; /* 0x0000000502007986 */
/* 0x000fe2000c101904 */
/*00a0*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*00b0*/ BRA 0xb0; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*00c0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*00d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*00e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*00f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0100*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0110*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0120*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0130*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0140*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0150*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0160*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0170*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
In order to see how many instructions were executed, I ran the following profiler command:
$ nv-nsight-cu-cli --metrics smsp__inst_executed.sum ./vec_add 1 1024
==PROF== Connected to process 137482 (/home/mahmood/cuTest/vec_add)
==PROF== Profiling "increment(int*, int)" - 1: 0%....50%....100% - 1 pass
==PROF== Disconnected from process 137482
[137482] vec_add@127.0.0.1
increment(int*, int), 2022-Jun-17 10:11:33, Context 1, Stream 7
Section: Command line profiler metrics
----------------------------------------------- --------------- ------------------------------
smsp__inst_executed.sum inst 352
----------------------------------------------- --------------- ------------------------------
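Excluding the trailing NOP padding, the listing contains 12 instructions (addresses 0x0000 through 0x00b0). A block of 1024 threads is 32 warps, so I expected 32 * 12 = 384 warp-level instructions, but the profiler reports 352. That is exactly 32 * 11, i.e. one instruction per warp fewer than the listing suggests. Can someone explain which instruction is not being counted, and why?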