I had written the following code for study.  According to my observation on CUDA 11.4 it repeats the observed MOV sequence approximately as indicated in OP’s post, more or less independent of the arch target, although there are some code gen variations from one arch target to the other.  I haven’t gotten around to checking the SASS on CUDA 12.0 yet.
#include <cstdint>
    __device__ __constant__ uint16_t LUT[256] = {0};
    __global__ void k(uint16_t *d){
      __shared__  uint16_t sh_LUT[256][32];
                uint4     tmp0;
                uint32_t  tmp1;
                tmp1 = LUT[threadIdx.x] | (LUT[threadIdx.x] << 16);
                tmp0.x = tmp1;
                tmp0.y = tmp1;
                tmp0.z = tmp1;
                tmp0.w = tmp1;
                for(int i = 0; i < 4; i++){
                        reinterpret_cast<uint4*>(sh_LUT[threadIdx.x])[i] = tmp0;
                }
                for (int i = threadIdx.x; i <  256; i++)
                  d[threadIdx.x]  += sh_LUT[i][i];
                }
Example:
$ nvcc -c t2181.cu -arch=sm_61
$ cuobjdump -sass ./t2181.o
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
        code for sm_61
                Function : _Z1kPt
        .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                              /* 0x083fd000e3e007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20] ;                     /* 0x4c98078000870001 */
        /*0010*/                   S2R R8, SR_TID.X ;                         /* 0xf0c8000002170008 */
        /*0018*/                   SHL R2, R8.reuse, 0x1 ;                    /* 0x3848000000170802 */
                                                                              /* 0x003fc400e3e007ed */
        /*0028*/                   ISETP.GT.AND P0, PT, R8, 0xff, PT ;        /* 0x366903800ff70807 */
        /*0030*/                   LDC.U16 R0, c[0x3][R2] ;                   /* 0xef92003000070200 */
        /*0038*/                   PRMT R4, R0, 0x1054, R0 ;                  /* 0x36c0000105470004 */
                                                                              /* 0x101fc480fe2007f5 */
        /*0048*/                   SHL R0, R8, 0x6 ;                          /* 0x3848000000670800 */
        /*0050*/                   MOV R12, R4.reuse ;                        /* 0x5c9807800047000c */
        /*0058*/                   MOV R13, R4.reuse ;                        /* 0x5c9807800047000d */
                                                                              /* 0x101fc080fe8007e1 */
        /*0068*/                   MOV R14, R4 ;                              /* 0x5c9807800047000e */
        /*0070*/                   MOV R15, R4.reuse ;                        /* 0x5c9807800047000f */
        /*0078*/         {         MOV R16, R4.reuse ;                        /* 0x5c98078000470010 */
                                                                              /* 0x101fc480fe2000f1 */
        /*0088*/                   STS.128 [R0+0x10], R12         }
                                                                              /* 0xef5e00000107000c */
        /*0090*/                   MOV R17, R4.reuse ;                        /* 0x5c98078000470011 */
        /*0098*/                   MOV R18, R4.reuse ;                        /* 0x5c98078000470012 */
                                                                              /* 0x0003c480fe0407f4 */
        /*00a8*/                   MOV R19, R4.reuse ;                        /* 0x5c98078000470013 */
        /*00b0*/         {         MOV R5, R4.reuse ;                         /* 0x5c98078000470005 */
        /*00b8*/                   STS.128 [R0+0x20], R16         }
                                                                              /* 0xef5e000002070010 */
                                                                              /* 0x101fc080fe8007e1 */
        /*00c8*/                   MOV R6, R4 ;                               /* 0x5c98078000470006 */
        /*00d0*/                   MOV R7, R4.reuse ;                         /* 0x5c98078000470007 */
        /*00d8*/         {         MOV R20, R4.reuse ;                        /* 0x5c98078000470014 */
                                                                              /* 0x101fc480fe2000f1 */
        /*00e8*/                   STS.128 [R0], R4         }
                                                                              /* 0xef5e000000070004 */
        /*00f0*/                   MOV R21, R4.reuse ;                        /* 0x5c98078000470015 */
        /*00f8*/                   MOV R22, R4.reuse ;                        /* 0x5c98078000470016 */
                                                                              /* 0x001ff4001e2007f4 */
        /*0108*/                   MOV R23, R4 ;                              /* 0x5c98078000470017 */
        /*0110*/                   STS.128 [R0+0x30], R20 ;                   /* 0xef5e000003070014 */
        /*0118*/               @P0 EXIT ;                                     /* 0xe30000000000000f */
                                                                              /* 0x001f8800fec207f0 */
        /*0128*/         {         LEA R2.CC, R8.reuse, c[0x0][0x140], 0x1 ;  /* 0x4bd7808005070802 */
        /*0130*/                   SSY 0x218         }
                                                                              /* 0xe29000000e000000 */
        /*0138*/                   LEA.HI.X R3, R8, c[0x0][0x144], RZ, 0x1 ;  /* 0x1a0f7f8005170803 */
                                                                              /* 0x001f8400fcc008b1 */
        /*0148*/                   LDG.E.U16 R5, [R2] ;                       /* 0xeed2200000070205 */
        /*0150*/                   IADD R0, -R8, RZ ;                         /* 0x5c1200000ff70800 */
        /*0158*/                   LOP32I.AND R0, R0, 0x3 ;                   /* 0x0400000000370000 */
                                                                              /* 0x001fb000fe2007f5 */
        /*0168*/                   IADD32I R4, -R8, 0xff ;                    /* 0x1d0000000ff70804 */
        /*0170*/                   ISETP.NE.AND P1, PT, R0, RZ, PT ;          /* 0x5b6b03800ff7000f */
        /*0178*/                   ISETP.GE.U32.AND P0, PT, R4, 0x3, PT ;     /* 0x366c038000370407 */
                                                                              /* 0x083fc400fec007f0 */
        /*0188*/         {         MOV R6, R8 ;                               /* 0x5c98078000870006 */
        /*0190*/              @!P1 SYNC         }
                                                                              /* 0xf0f800000009000f */
        /*0198*/                   XMAD R7, R8.reuse, 0x42, RZ ;              /* 0x36007f8004270807 */
                                                                              /* 0x001fc400fec007f5 */
        /*01a8*/                   MOV R6, R8 ;                               /* 0x5c98078000870006 */
        /*01b0*/                   XMAD.PSL R4, R8.H1, 0x42, R7 ;             /* 0x3620039004270804 */
        /*01b8*/                   NOP ;                                      /* 0x50b0000000070f00 */
                                                                              /* 0x001fd400062007f0 */
        /*01c8*/         {         IADD32I R0, R0, -0x1 ;                     /* 0x1c0ffffffff70000 */
        /*01d0*/                   LDS.U.U16 R7, [R4]         }
                                                                              /* 0xef4a100000070407 */
        /*01d8*/                   IADD32I R6, R6, 0x1 ;                      /* 0x1c00000000170606 */
                                                                              /* 0x045fc001fd8007f1 */
        /*01e8*/                   ISETP.NE.AND P1, PT, R0, RZ, PT ;          /* 0x5b6b03800ff7000f */
        /*01f0*/                   IADD32I R4, R4, 0x42 ;                     /* 0x1c00000004270404 */
        /*01f8*/         {         IADD R5, R7, R5 ;                          /* 0x5c10000000570705 */
                                                                              /* 0x003fc400ffa007fd */
        /*0208*/               @P1 BRA 0x1c0         }
                                                                              /* 0xe2400ffffb01000f */
        /*0210*/                   SYNC ;                                     /* 0xf0f800000007000f */
        /*0218*/                   SSY 0x510 ;                                /* 0xe29000002f000000 */
                                                                              /* 0x001fc440fe0007fd */
        /*0228*/              @!P0 SYNC ;                                     /* 0xf0f800000008000f */
        /*0230*/         {         IADD32I R4, R6.reuse, -0x4 ;               /* 0x1c0fffffffc70604 */
        /*0238*/                   SSY 0x3d0         }
                                                                              /* 0xe290000019000000 */
                                                                              /* 0x001fc400fe8007f1 */
        /*0248*/                   MOV32I R0, 0x84 ;                          /* 0x010000000847f000 */
        /*0250*/                   PSETP.AND.AND P0, PT, PT, PT, PT ;         /* 0x50900380e0077007 */
        /*0258*/                   IADD32I R7, -R4, 0xfc ;                    /* 0x1d0000000fc70407 */
                                                                              /* 0x001fc000fda007f5 */
        /*0268*/                   XMAD R0, R6, 0x42, R0 ;                    /* 0x3600000004270600 */
        /*0270*/                   ISETP.GT.AND P1, PT, R7, 0xc, PT ;         /* 0x3669038000c7070f */
        /*0278*/         {         XMAD.PSL R0, R6.H1, 0x42, R0 ;             /* 0x3620001004270600 */
                                                                              /* 0x001fc000fda007fd */
        /*0288*/              @!P1 SYNC         }
                                                                              /* 0xf0f800000009000f */
        /*0290*/                   PSETP.AND.AND P0, PT, !PT, PT, PT ;        /* 0x50900380e007f007 */
        /*0298*/         {         IADD32I R4, R4, 0x10 ;                     /* 0x1c00000001070404 */
                                                                              /* 0x001c4400fe0007f6 */
        /*02a8*/                   LDS.U.U16 R6, [R0+-0x84]         }
                                                                              /* 0xef4a1ffff7c70006 */
        /*02b0*/         {         ISETP.GE.AND P1, PT, R4, 0xf0, PT ;        /* 0x366d03800f07040f */
        /*02b8*/                   LDS.U.U16 R7, [R0+-0x42]         }
                                                                              /* 0xef4a1ffffbe70007 */
                                                                              /* 0x001fc400e62007f1 */
        /*02c8*/                   LDS.U.U16 R8, [R0] ;                       /* 0xef4a100000070008 */
        /*02d0*/                   LDS.U.U16 R9, [R0+0x42] ;                  /* 0xef4a100004270009 */
        /*02d8*/                   LDS.U.U16 R11, [R0+0x84] ;                 /* 0xef4a10000847000b */
                                                                              /* 0x001dc400fe200751 */
        /*02e8*/                   LDS.U.U16 R12, [R0+0xc6] ;                 /* 0xef4a10000c67000c */
        /*02f0*/                   LDS.U.U16 R13, [R0+0x108] ;                /* 0xef4a10001087000d */
        /*02f8*/                   LDS.U.U16 R15, [R0+0x14a] ;                /* 0xef4a100014a7000f */
                                                                              /* 0x005fc000fec10ff0 */
        /*0308*/         {         IADD3 R6, R7, R5, R6 ;                     /* 0x5cc0030000570706 */
        /*0310*/                   LDS.U.U16 R16, [R0+0x18c]         }
                                                                              /* 0xef4a100018c70010 */
        /*0318*/         {         IADD3 R6, R9, R6, R8 ;                     /* 0x5cc0040000670906 */
                                                                              /* 0x001ff404fe000716 */
        /*0328*/                   LDS.U.U16 R17, [R0+0x1ce]         }
                                                                              /* 0xef4a10001ce70011 */
        /*0330*/         {         IADD3 R6, R12, R6, R11 ;                   /* 0x5cc0058000670c06 */
        /*0338*/                   LDS.U.U16 R19, [R0+0x210]         }
                                                                              /* 0xef4a100021070013 */
                                                                              /* 0x003fc000e7a047f0 */
        /*0348*/         {         IADD3 R6, R15, R6, R13 ;                   /* 0x5cc0068000670f06 */
        /*0350*/                   LDS.U.U16 R5, [R0+0x252]         }
                                                                              /* 0xef4a100025270005 */
        /*0358*/         {         IADD3 R6, R17, R6, R16 ;                   /* 0x5cc0080000671106 */
                                                                              /* 0x001fc400e22007f1 */
        /*0368*/                   LDS.U.U16 R7, [R0+0x294]         }
                                                                              /* 0xef4a100029470007 */
        /*0370*/                   LDS.U.U16 R8, [R0+0x2d6] ;                 /* 0xef4a10002d670008 */
        /*0378*/                   LDS.U.U16 R10, [R0+0x318] ;                /* 0xef4a10003187000a */
                                                                              /* 0x009ff402fc200273 */
        /*0388*/                   LDS.U.U16 R11, [R0+0x35a] ;                /* 0xef4a100035a7000b */
        /*0390*/                   IADD3 R5, R5, R6, R19 ;                    /* 0x5cc0098000670505 */
        /*0398*/                   IADD32I R0, R0, 0x420 ;                    /* 0x1c00000042070000 */
                                                                              /* 0x001ff408fe000ffd */
        /*03a8*/                   IADD3 R5, R8, R5, R7 ;                     /* 0x5cc0038000570805 */
        /*03b0*/         {         IADD3 R5, R11, R5, R10 ;                   /* 0x5cc0050000570b05 */
        /*03b8*/              @!P1 BRA 0x298         }
                                                                              /* 0xe2400fffed89000f */
                                                                              /* 0x001fd800fe0007fd */
        /*03c8*/                   SYNC ;                                     /* 0xf0f800000007000f */
        /*03d0*/         {         IADD32I R6, -R4, 0xfc ;                    /* 0x1d0000000fc70406 */
        /*03d8*/                   SSY 0x4b0         }
                                                                              /* 0xe29000000d000000 */
                                                                              /* 0x001fc000ffa007ed */
        /*03e8*/                   ISETP.GT.AND P1, PT, R6, 0x4, PT ;         /* 0x366903800047060f */
        /*03f0*/              @!P1 SYNC ;                                     /* 0xf0f800000009000f */
        /*03f8*/         {         PSETP.AND.AND P0, PT, !PT, PT, PT ;        /* 0x50900380e007f007 */
                                                                              /* 0x001c4400fe0007f1 */
        /*0408*/                   LDS.U.U16 R6, [R0+-0x84]         }
                                                                              /* 0xef4a1ffff7c70006 */
        /*0410*/         {         IADD32I R4, R4, 0x8 ;                      /* 0x1c00000000870404 */
        /*0418*/                   LDS.U.U16 R7, [R0+-0x42]         }
                                                                              /* 0xef4a1ffffbe70007 */
                                                                              /* 0x001fc400e62007f1 */
        /*0428*/                   LDS.U.U16 R8, [R0] ;                       /* 0xef4a100000070008 */
        /*0430*/                   LDS.U.U16 R9, [R0+0x42] ;                  /* 0xef4a100004270009 */
        /*0438*/                   LDS.U.U16 R11, [R0+0x84] ;                 /* 0xef4a10000847000b */
                                                                              /* 0x000e4400fe200751 */
        /*0448*/                   LDS.U.U16 R12, [R0+0xc6] ;                 /* 0xef4a10000c67000c */
        /*0450*/                   LDS.U.U16 R13, [R0+0x108] ;                /* 0xef4a10001087000d */
        /*0458*/                   LDS.U.U16 R15, [R0+0x14a] ;                /* 0xef4a100014a7000f */
                                                                              /* 0x009f8402ffa10fe6 */
        /*0468*/                   IADD3 R6, R7, R5, R6 ;                     /* 0x5cc0030000570706 */
        /*0470*/                   IADD3 R6, R9, R6, R8 ;                     /* 0x5cc0040000670906 */
        /*0478*/                   IADD3 R6, R12, R6, R11 ;                   /* 0x5cc0058000670c06 */
                                                                              /* 0x001fc010fcc047f5 */
        /*0488*/                   IADD32I R0, R0, 0x210 ;                    /* 0x1c00000021070000 */
        /*0490*/                   IADD3 R6, R15, R6, R13 ;                   /* 0x5cc0068000670f06 */
        /*0498*/         {         XMAD.PSL.CLO R5, R5.H1, 0x1, R6 ;          /* 0x3624031000170505 */
                                                                              /* 0x001fc400fda007fd */
        /*04a8*/                   SYNC         }
                                                                              /* 0xf0f800000007000f */
        /*04b0*/                   ISETP.LT.OR P0, PT, R4, 0xfc, P0 ;         /* 0x366320000fc70407 */
        /*04b8*/               @P0 LDS.U.U16 R4, [R0+-0x84] ;                 /* 0xef4a1ffff7c00004 */
                                                                              /* 0x001cc400fe200711 */
        /*04c8*/               @P0 LDS.U.U16 R6, [R0+-0x42] ;                 /* 0xef4a1ffffbe00006 */
        /*04d0*/               @P0 LDS.U.U16 R7, [R0] ;                       /* 0xef4a100000000007 */
        /*04d8*/               @P0 LDS.U.U16 R9, [R0+0x42] ;                  /* 0xef4a100004200009 */
                                                                              /* 0x001fc002fcc10ff6 */
        /*04e8*/               @P0 IADD3 R4, R6, R5, R4 ;                     /* 0x5cc0020000500604 */
        /*04f0*/               @P0 IADD3 R4, R9, R4, R7 ;                     /* 0x5cc0038000400904 */
        /*04f8*/         {     @P0 XMAD.PSL.CLO R5, R5.H1, 0x1, R4 ;          /* 0x3624021000100505 */
                                                                              /* 0x001f8420fe2007fd */
        /*0508*/                   SYNC         }
                                                                              /* 0xf0f800000007000f */
        /*0510*/                   STG.E.U16 [R2], R5 ;                       /* 0xeeda200000070205 */
        /*0518*/                   NOP ;                                      /* 0x50b0000000070f00 */
                                                                              /* 0x001f8000ffe007ff */
        /*0528*/                   EXIT ;                                     /* 0xe30000000007000f */
        /*0530*/                   BRA 0x530 ;                                /* 0xe2400fffff87000f */
        /*0538*/                   NOP;                                       /* 0x50b0000000070f00 */
                ..........
Fatbin ptx code:
================
arch = sm_61
code version = [7,4]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
$
(Yes, the code as written has illegal behavior in it.  I don’t think that is important, and can be gotten rid of.  I wrote it just to study the generated SASS.)