I had written the following code for study. According to my observation on CUDA 11.4 it repeats the observed MOV sequence approximately as indicated in OP’s post, more or less independent of the arch target, although there are some code gen variations from one arch target to the other. I haven’t gotten around to checking the SASS on CUDA 12.0 yet.
#include <cstdint>
__device__ __constant__ uint16_t LUT[256] = {0};
__global__ void k(uint16_t *d){
__shared__ uint16_t sh_LUT[256][32];
uint4 tmp0;
uint32_t tmp1;
tmp1 = LUT[threadIdx.x] | (LUT[threadIdx.x] << 16);
tmp0.x = tmp1;
tmp0.y = tmp1;
tmp0.z = tmp1;
tmp0.w = tmp1;
for(int i = 0; i < 4; i++){
reinterpret_cast<uint4*>(sh_LUT[threadIdx.x])[i] = tmp0;
}
for (int i = threadIdx.x; i < 256; i++)
d[threadIdx.x] += sh_LUT[i][i];
}
Example:
$ nvcc -c t2181.cu -arch=sm_61
$ cuobjdump -sass ./t2181.o
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_61
Function : _Z1kPt
.headerflags @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/* 0x083fd000e3e007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ S2R R8, SR_TID.X ; /* 0xf0c8000002170008 */
/*0018*/ SHL R2, R8.reuse, 0x1 ; /* 0x3848000000170802 */
/* 0x003fc400e3e007ed */
/*0028*/ ISETP.GT.AND P0, PT, R8, 0xff, PT ; /* 0x366903800ff70807 */
/*0030*/ LDC.U16 R0, c[0x3][R2] ; /* 0xef92003000070200 */
/*0038*/ PRMT R4, R0, 0x1054, R0 ; /* 0x36c0000105470004 */
/* 0x101fc480fe2007f5 */
/*0048*/ SHL R0, R8, 0x6 ; /* 0x3848000000670800 */
/*0050*/ MOV R12, R4.reuse ; /* 0x5c9807800047000c */
/*0058*/ MOV R13, R4.reuse ; /* 0x5c9807800047000d */
/* 0x101fc080fe8007e1 */
/*0068*/ MOV R14, R4 ; /* 0x5c9807800047000e */
/*0070*/ MOV R15, R4.reuse ; /* 0x5c9807800047000f */
/*0078*/ { MOV R16, R4.reuse ; /* 0x5c98078000470010 */
/* 0x101fc480fe2000f1 */
/*0088*/ STS.128 [R0+0x10], R12 }
/* 0xef5e00000107000c */
/*0090*/ MOV R17, R4.reuse ; /* 0x5c98078000470011 */
/*0098*/ MOV R18, R4.reuse ; /* 0x5c98078000470012 */
/* 0x0003c480fe0407f4 */
/*00a8*/ MOV R19, R4.reuse ; /* 0x5c98078000470013 */
/*00b0*/ { MOV R5, R4.reuse ; /* 0x5c98078000470005 */
/*00b8*/ STS.128 [R0+0x20], R16 }
/* 0xef5e000002070010 */
/* 0x101fc080fe8007e1 */
/*00c8*/ MOV R6, R4 ; /* 0x5c98078000470006 */
/*00d0*/ MOV R7, R4.reuse ; /* 0x5c98078000470007 */
/*00d8*/ { MOV R20, R4.reuse ; /* 0x5c98078000470014 */
/* 0x101fc480fe2000f1 */
/*00e8*/ STS.128 [R0], R4 }
/* 0xef5e000000070004 */
/*00f0*/ MOV R21, R4.reuse ; /* 0x5c98078000470015 */
/*00f8*/ MOV R22, R4.reuse ; /* 0x5c98078000470016 */
/* 0x001ff4001e2007f4 */
/*0108*/ MOV R23, R4 ; /* 0x5c98078000470017 */
/*0110*/ STS.128 [R0+0x30], R20 ; /* 0xef5e000003070014 */
/*0118*/ @P0 EXIT ; /* 0xe30000000000000f */
/* 0x001f8800fec207f0 */
/*0128*/ { LEA R2.CC, R8.reuse, c[0x0][0x140], 0x1 ; /* 0x4bd7808005070802 */
/*0130*/ SSY 0x218 }
/* 0xe29000000e000000 */
/*0138*/ LEA.HI.X R3, R8, c[0x0][0x144], RZ, 0x1 ; /* 0x1a0f7f8005170803 */
/* 0x001f8400fcc008b1 */
/*0148*/ LDG.E.U16 R5, [R2] ; /* 0xeed2200000070205 */
/*0150*/ IADD R0, -R8, RZ ; /* 0x5c1200000ff70800 */
/*0158*/ LOP32I.AND R0, R0, 0x3 ; /* 0x0400000000370000 */
/* 0x001fb000fe2007f5 */
/*0168*/ IADD32I R4, -R8, 0xff ; /* 0x1d0000000ff70804 */
/*0170*/ ISETP.NE.AND P1, PT, R0, RZ, PT ; /* 0x5b6b03800ff7000f */
/*0178*/ ISETP.GE.U32.AND P0, PT, R4, 0x3, PT ; /* 0x366c038000370407 */
/* 0x083fc400fec007f0 */
/*0188*/ { MOV R6, R8 ; /* 0x5c98078000870006 */
/*0190*/ @!P1 SYNC }
/* 0xf0f800000009000f */
/*0198*/ XMAD R7, R8.reuse, 0x42, RZ ; /* 0x36007f8004270807 */
/* 0x001fc400fec007f5 */
/*01a8*/ MOV R6, R8 ; /* 0x5c98078000870006 */
/*01b0*/ XMAD.PSL R4, R8.H1, 0x42, R7 ; /* 0x3620039004270804 */
/*01b8*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001fd400062007f0 */
/*01c8*/ { IADD32I R0, R0, -0x1 ; /* 0x1c0ffffffff70000 */
/*01d0*/ LDS.U.U16 R7, [R4] }
/* 0xef4a100000070407 */
/*01d8*/ IADD32I R6, R6, 0x1 ; /* 0x1c00000000170606 */
/* 0x045fc001fd8007f1 */
/*01e8*/ ISETP.NE.AND P1, PT, R0, RZ, PT ; /* 0x5b6b03800ff7000f */
/*01f0*/ IADD32I R4, R4, 0x42 ; /* 0x1c00000004270404 */
/*01f8*/ { IADD R5, R7, R5 ; /* 0x5c10000000570705 */
/* 0x003fc400ffa007fd */
/*0208*/ @P1 BRA 0x1c0 }
/* 0xe2400ffffb01000f */
/*0210*/ SYNC ; /* 0xf0f800000007000f */
/*0218*/ SSY 0x510 ; /* 0xe29000002f000000 */
/* 0x001fc440fe0007fd */
/*0228*/ @!P0 SYNC ; /* 0xf0f800000008000f */
/*0230*/ { IADD32I R4, R6.reuse, -0x4 ; /* 0x1c0fffffffc70604 */
/*0238*/ SSY 0x3d0 }
/* 0xe290000019000000 */
/* 0x001fc400fe8007f1 */
/*0248*/ MOV32I R0, 0x84 ; /* 0x010000000847f000 */
/*0250*/ PSETP.AND.AND P0, PT, PT, PT, PT ; /* 0x50900380e0077007 */
/*0258*/ IADD32I R7, -R4, 0xfc ; /* 0x1d0000000fc70407 */
/* 0x001fc000fda007f5 */
/*0268*/ XMAD R0, R6, 0x42, R0 ; /* 0x3600000004270600 */
/*0270*/ ISETP.GT.AND P1, PT, R7, 0xc, PT ; /* 0x3669038000c7070f */
/*0278*/ { XMAD.PSL R0, R6.H1, 0x42, R0 ; /* 0x3620001004270600 */
/* 0x001fc000fda007fd */
/*0288*/ @!P1 SYNC }
/* 0xf0f800000009000f */
/*0290*/ PSETP.AND.AND P0, PT, !PT, PT, PT ; /* 0x50900380e007f007 */
/*0298*/ { IADD32I R4, R4, 0x10 ; /* 0x1c00000001070404 */
/* 0x001c4400fe0007f6 */
/*02a8*/ LDS.U.U16 R6, [R0+-0x84] }
/* 0xef4a1ffff7c70006 */
/*02b0*/ { ISETP.GE.AND P1, PT, R4, 0xf0, PT ; /* 0x366d03800f07040f */
/*02b8*/ LDS.U.U16 R7, [R0+-0x42] }
/* 0xef4a1ffffbe70007 */
/* 0x001fc400e62007f1 */
/*02c8*/ LDS.U.U16 R8, [R0] ; /* 0xef4a100000070008 */
/*02d0*/ LDS.U.U16 R9, [R0+0x42] ; /* 0xef4a100004270009 */
/*02d8*/ LDS.U.U16 R11, [R0+0x84] ; /* 0xef4a10000847000b */
/* 0x001dc400fe200751 */
/*02e8*/ LDS.U.U16 R12, [R0+0xc6] ; /* 0xef4a10000c67000c */
/*02f0*/ LDS.U.U16 R13, [R0+0x108] ; /* 0xef4a10001087000d */
/*02f8*/ LDS.U.U16 R15, [R0+0x14a] ; /* 0xef4a100014a7000f */
/* 0x005fc000fec10ff0 */
/*0308*/ { IADD3 R6, R7, R5, R6 ; /* 0x5cc0030000570706 */
/*0310*/ LDS.U.U16 R16, [R0+0x18c] }
/* 0xef4a100018c70010 */
/*0318*/ { IADD3 R6, R9, R6, R8 ; /* 0x5cc0040000670906 */
/* 0x001ff404fe000716 */
/*0328*/ LDS.U.U16 R17, [R0+0x1ce] }
/* 0xef4a10001ce70011 */
/*0330*/ { IADD3 R6, R12, R6, R11 ; /* 0x5cc0058000670c06 */
/*0338*/ LDS.U.U16 R19, [R0+0x210] }
/* 0xef4a100021070013 */
/* 0x003fc000e7a047f0 */
/*0348*/ { IADD3 R6, R15, R6, R13 ; /* 0x5cc0068000670f06 */
/*0350*/ LDS.U.U16 R5, [R0+0x252] }
/* 0xef4a100025270005 */
/*0358*/ { IADD3 R6, R17, R6, R16 ; /* 0x5cc0080000671106 */
/* 0x001fc400e22007f1 */
/*0368*/ LDS.U.U16 R7, [R0+0x294] }
/* 0xef4a100029470007 */
/*0370*/ LDS.U.U16 R8, [R0+0x2d6] ; /* 0xef4a10002d670008 */
/*0378*/ LDS.U.U16 R10, [R0+0x318] ; /* 0xef4a10003187000a */
/* 0x009ff402fc200273 */
/*0388*/ LDS.U.U16 R11, [R0+0x35a] ; /* 0xef4a100035a7000b */
/*0390*/ IADD3 R5, R5, R6, R19 ; /* 0x5cc0098000670505 */
/*0398*/ IADD32I R0, R0, 0x420 ; /* 0x1c00000042070000 */
/* 0x001ff408fe000ffd */
/*03a8*/ IADD3 R5, R8, R5, R7 ; /* 0x5cc0038000570805 */
/*03b0*/ { IADD3 R5, R11, R5, R10 ; /* 0x5cc0050000570b05 */
/*03b8*/ @!P1 BRA 0x298 }
/* 0xe2400fffed89000f */
/* 0x001fd800fe0007fd */
/*03c8*/ SYNC ; /* 0xf0f800000007000f */
/*03d0*/ { IADD32I R6, -R4, 0xfc ; /* 0x1d0000000fc70406 */
/*03d8*/ SSY 0x4b0 }
/* 0xe29000000d000000 */
/* 0x001fc000ffa007ed */
/*03e8*/ ISETP.GT.AND P1, PT, R6, 0x4, PT ; /* 0x366903800047060f */
/*03f0*/ @!P1 SYNC ; /* 0xf0f800000009000f */
/*03f8*/ { PSETP.AND.AND P0, PT, !PT, PT, PT ; /* 0x50900380e007f007 */
/* 0x001c4400fe0007f1 */
/*0408*/ LDS.U.U16 R6, [R0+-0x84] }
/* 0xef4a1ffff7c70006 */
/*0410*/ { IADD32I R4, R4, 0x8 ; /* 0x1c00000000870404 */
/*0418*/ LDS.U.U16 R7, [R0+-0x42] }
/* 0xef4a1ffffbe70007 */
/* 0x001fc400e62007f1 */
/*0428*/ LDS.U.U16 R8, [R0] ; /* 0xef4a100000070008 */
/*0430*/ LDS.U.U16 R9, [R0+0x42] ; /* 0xef4a100004270009 */
/*0438*/ LDS.U.U16 R11, [R0+0x84] ; /* 0xef4a10000847000b */
/* 0x000e4400fe200751 */
/*0448*/ LDS.U.U16 R12, [R0+0xc6] ; /* 0xef4a10000c67000c */
/*0450*/ LDS.U.U16 R13, [R0+0x108] ; /* 0xef4a10001087000d */
/*0458*/ LDS.U.U16 R15, [R0+0x14a] ; /* 0xef4a100014a7000f */
/* 0x009f8402ffa10fe6 */
/*0468*/ IADD3 R6, R7, R5, R6 ; /* 0x5cc0030000570706 */
/*0470*/ IADD3 R6, R9, R6, R8 ; /* 0x5cc0040000670906 */
/*0478*/ IADD3 R6, R12, R6, R11 ; /* 0x5cc0058000670c06 */
/* 0x001fc010fcc047f5 */
/*0488*/ IADD32I R0, R0, 0x210 ; /* 0x1c00000021070000 */
/*0490*/ IADD3 R6, R15, R6, R13 ; /* 0x5cc0068000670f06 */
/*0498*/ { XMAD.PSL.CLO R5, R5.H1, 0x1, R6 ; /* 0x3624031000170505 */
/* 0x001fc400fda007fd */
/*04a8*/ SYNC }
/* 0xf0f800000007000f */
/*04b0*/ ISETP.LT.OR P0, PT, R4, 0xfc, P0 ; /* 0x366320000fc70407 */
/*04b8*/ @P0 LDS.U.U16 R4, [R0+-0x84] ; /* 0xef4a1ffff7c00004 */
/* 0x001cc400fe200711 */
/*04c8*/ @P0 LDS.U.U16 R6, [R0+-0x42] ; /* 0xef4a1ffffbe00006 */
/*04d0*/ @P0 LDS.U.U16 R7, [R0] ; /* 0xef4a100000000007 */
/*04d8*/ @P0 LDS.U.U16 R9, [R0+0x42] ; /* 0xef4a100004200009 */
/* 0x001fc002fcc10ff6 */
/*04e8*/ @P0 IADD3 R4, R6, R5, R4 ; /* 0x5cc0020000500604 */
/*04f0*/ @P0 IADD3 R4, R9, R4, R7 ; /* 0x5cc0038000400904 */
/*04f8*/ { @P0 XMAD.PSL.CLO R5, R5.H1, 0x1, R4 ; /* 0x3624021000100505 */
/* 0x001f8420fe2007fd */
/*0508*/ SYNC }
/* 0xf0f800000007000f */
/*0510*/ STG.E.U16 [R2], R5 ; /* 0xeeda200000070205 */
/*0518*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001f8000ffe007ff */
/*0528*/ EXIT ; /* 0xe30000000007000f */
/*0530*/ BRA 0x530 ; /* 0xe2400fffff87000f */
/*0538*/ NOP; /* 0x50b0000000070f00 */
..........
Fatbin ptx code:
================
arch = sm_61
code version = [7,4]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
$
(Yes, the code as written has illegal behavior in it. I don’t think that is important, and can be gotten rid of. I wrote it just to study the generated SASS.)