In case anybody cares, you can also get the SASS with:
nvcc -arch=sm_86 -cubin -o kernel.cubin kernel.ptx
cuobjdump -sass kernel.cubin
OP has also provided a method that I assume is correct.
I’m not sure why extracting the SASS from the JIT cache is needed, unless you are testing the theory that the driver JIT compiler produces something different from what the offline compiler (ptxas) produces. If that is the case, then the driver version certainly matters.
Using the method I indicated above, with CUDA 11.4, the SASS seems to be:
code for sm_86
Function : __inference_reduction_1d_parallel_2d_contraction_25_Sum_kernel
.headerflags @"EF_CUDA_SM86 EF_CUDA_PTX_SM(EF_CUDA_SM86)"
/* Reviewer annotations: comment lines that carry no address or encoding were added to explain the listing. */
/* They describe only what the instructions show. Anything that depends on the unseen PTX or host code is marked as an assumption. */
/* Phase 1 (0x0060-0x0940): threads with TID.X <= 0xff run a loop that is peeled once and then unrolled x4, x2 and x1. */
/* Each loop step loads three consecutive 32-bit values via the base address in c[0x0][0x168], adds them with FADD, */
/* and stores the sum to shared memory (STS). The first element index is (CTAID.X * 256 + TID.X) * 3 and the first */
/* shared byte offset is TID.X * 4. Each step advances the element index by 0x300 and the shared offset by 0x400. */
/* Phase 2 (0x0960-0x0b00): threads with TID.X <= 0 load one value via the base address in c[0x0][0x198] at index CTAID.X, */
/* add the words at shared byte offsets 0x0-0x3ff to it in address order, and store the result back to the same address. */
/* NOTE(review): c[0x0][0x168] and c[0x0][0x198] are presumably kernel pointer parameters (input and output) - confirm against the PTX. */
/* Prologue. R1 = c[0x0][0x28] and is not referenced again in this listing. R0 = thread index x. */
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */
/* 0x000e220000002100 */
/* UR4:UR5 = 64-bit value from c[0x0][0x118]. NOTE(review): presumably the descriptor used by the .64 LDG/STG accesses - confirm. */
/*0020*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fe20000000a00 */
/* Open convergence region B0 with target 0x0950. The matching BSYNC B0 is at 0x0940. */
/*0030*/ BSSY B0, 0x950 ; /* 0x0000091000007945 */
/* 0x000fe40003800000 */
/* R3 = block index x. */
/*0040*/ S2R R3, SR_CTAID.X ; /* 0x0000000000037919 */
/* 0x000e680000002500 */
/* Barrier 0 synchronization before phase 1. */
/*0050*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */
/* 0x000fec0000000000 */
/* P0 = (R0 > 0xff, signed). Threads with P0 set branch to 0x0940 and skip phase 1. */
/*0060*/ ISETP.GT.AND P0, PT, R0, 0xff, PT ; /* 0x000000ff0000780c */
/* 0x001fda0003f04270 */
/*0070*/ @P0 BRA 0x940 ; /* 0x000008c000000947 */
/* 0x000fea0003800000 */
/* R5 = R0 - 0x100. R5 is the loop counter tested by the unsigned compares below. */
/*0080*/ IADD3 R5, R0, -0x100, RZ ; /* 0xffffff0000057810 */
/* 0x002fe40007ffe0ff */
/* R4 = (R3 << 8) + R0. */
/*0090*/ LEA R4, R3, R0, 0x8 ; /* 0x0000000003047211 */
/* 0x000fe400078e40ff */
/* P1 = (R5 >= 0x7ffffeff, unsigned). P1 predicates the peeled first loop step (0x00c0-0x0200). */
/*00a0*/ ISETP.GE.U32.AND P1, PT, R5, 0x7ffffeff, PT ; /* 0x7ffffeff0500780c */
/* 0x000fc60003f26070 */
/* R4 = R4 * 3, the index of the first of three consecutive elements. */
/*00b0*/ IMAD R4, R4, 0x3, RZ ; /* 0x0000000304047824 */
/* 0x000fd400078e02ff */
/* Peeled step, all under P1. R11 = 4 scales element indices to byte offsets in IMAD.WIDE. */
/* Load addresses are c[0x0][0x168] + 4 * {R4, R4 + 1, R4 + 2}. The loaded words land in R6, R9 and R11. */
/*00c0*/ @P1 MOV R11, 0x4 ; /* 0x00000004000b1802 */
/* 0x000fe40000000f00 */
/*00d0*/ @P1 IADD3 R8, R4.reuse, 0x1, RZ ; /* 0x0000000104081810 */
/* 0x040fe40007ffe0ff */
/*00e0*/ @P1 IADD3 R10, R4.reuse, 0x2, RZ ; /* 0x00000002040a1810 */
/* 0x040fe20007ffe0ff */
/*00f0*/ @P1 IMAD.WIDE R6, R4, R11, c[0x0][0x168] ; /* 0x00005a0004061625 */
/* 0x000fc800078e020b */
/*0100*/ @P1 IMAD.WIDE R8, R8, R11.reuse, c[0x0][0x168] ; /* 0x00005a0008081625 */
/* 0x080fe400078e020b */
/*0110*/ @P1 LDG.E R6, [R6.64] ; /* 0x0000000406061981 */
/* 0x000ea4000c1e1900 */
/*0120*/ @P1 IMAD.WIDE R10, R10, R11, c[0x0][0x168] ; /* 0x00005a000a0a1625 */
/* 0x000fe400078e020b */
/*0130*/ @P1 LDG.E R9, [R8.64] ; /* 0x0000000408091981 */
/* 0x000ee8000c1e1900 */
/*0140*/ @P1 LDG.E R11, [R10.64] ; /* 0x000000040a0b1981 */
/* 0x000f22000c1e1900 */
/* Under P1 the counter R5 advances by 0x100. */
/*0150*/ @P1 IADD3 R5, R5, 0x100, RZ ; /* 0x0000010005051810 */
/* 0x000fe20007ffe0ff */
/* Open convergence region B1 with target 0x05d0. The matching BSYNC B1 is at 0x05c0. */
/*0160*/ BSSY B1, 0x5d0 ; /* 0x0000046000017945 */
/* 0x000fe20003800000 */
/* NOTE(review): presumably P0 = !P1 here (LUT 0x8 over P1, PT, PT) - confirm against the PLOP3.LUT truth-table convention. */
/*0170*/ PLOP3.LUT P0, PT, P1, PT, PT, 0x8, 0x0 ; /* 0x000000000000781c */
/* 0x000fc40000f0e170 */
/* R2 = 0x7ffffeff - R5. P2 = (R2 > 0x300, signed) decides at 0x0210 whether the x4 unrolled loop is entered. */
/*0180*/ IADD3 R2, -R5, 0x7ffffeff, RZ ; /* 0x7ffffeff05027810 */
/* 0x000fe40007ffe1ff */
/* Under P1 the element index R4 advances by 0x300. */
/*0190*/ @P1 IADD3 R4, R4, 0x300, RZ ; /* 0x0000030004041810 */
/* 0x000fe40007ffe0ff */
/*01a0*/ ISETP.GT.AND P2, PT, R2, 0x300, PT ; /* 0x000003000200780c */
/* 0x000fe40003f44270 */
/* R2 is reused from here on as the shared-memory byte offset, starting at R0 << 2. */
/*01b0*/ SHF.L.U32 R2, R0, 0x2, RZ ; /* 0x0000000200027819 */
/* 0x000fe200000006ff */
/* Peeled step sum: R7 = ((0 + R6) + R9) + R11, stored to shared memory at [R2]. Then R2 advances by 0x400. */
/*01c0*/ @P1 FADD R12, RZ, R6 ; /* 0x00000006ff0c1221 */
/* 0x004fc80000000000 */
/*01d0*/ @P1 FADD R12, R12, R9 ; /* 0x000000090c0c1221 */
/* 0x008fc80000000000 */
/*01e0*/ @P1 FADD R7, R12, R11 ; /* 0x0000000b0c071221 */
/* 0x010fca0000000000 */
/*01f0*/ @P1 STS [R2], R7 ; /* 0x0000000702001388 */
/* 0x0001e40000000800 */
/*0200*/ @P1 IADD3 R2, R2, 0x400, RZ ; /* 0x0000040002021810 */
/* 0x001fe20007ffe0ff */
/* If P2 is clear, skip the x4 unrolled loop. */
/*0210*/ @!P2 BRA 0x5c0 ; /* 0x000003a00000a947 */
/* 0x000fea0003800000 */
/* x4 unrolled loop body (0x0220-0x05b0). NOTE(review): this PLOP3 (LUT 0x8 over PT, PT, PT) presumably clears P0 - confirm. */
/* Twelve loads at element indices R4 + {0,1,2}, R4 + 0x300 + {0,1,2}, R4 + 0x600 + {0,1,2} and R4 + 0x900 + {0,1,2}, */
/* each scaled by R7 = 4 and added to the base address in c[0x0][0x168]. */
/*0220*/ PLOP3.LUT P0, PT, PT, PT, PT, 0x8, 0x0 ; /* 0x000000000000781c */
/* 0x000fc40003f0e170 */
/*0230*/ MOV R7, 0x4 ; /* 0x0000000400077802 */
/* 0x000fe40000000f00 */
/*0240*/ IADD3 R8, R4.reuse, 0x1, RZ ; /* 0x0000000104087810 */
/* 0x040fe40007ffe0ff */
/*0250*/ IADD3 R12, R4.reuse, 0x300, RZ ; /* 0x00000300040c7810 */
/* 0x040fe20007ffe0ff */
/*0260*/ IMAD.WIDE R10, R4.reuse, R7.reuse, c[0x0][0x168] ; /* 0x00005a00040a7625 */
/* 0x0c0fe200078e0207 */
/*0270*/ IADD3 R22, R4.reuse, 0x600, RZ ; /* 0x0000060004167810 */
/* 0x040fe40007ffe0ff */
/*0280*/ IADD3 R24, R4, 0x900, RZ ; /* 0x0000090004187810 */
/* 0x000fe20007ffe0ff */
/*0290*/ IMAD.WIDE R8, R8, R7.reuse, c[0x0][0x168] ; /* 0x00005a0008087625 */
/* 0x080fe200078e0207 */
/*02a0*/ IADD3 R16, R4.reuse, 0x301, RZ ; /* 0x0000030104107810 */
/* 0x040fe20007ffe0ff */
/*02b0*/ LDG.E R6, [R10.64] ; /* 0x000000040a067981 */
/* 0x0000a2000c1e1900 */
/*02c0*/ IADD3 R18, R4, 0x601, RZ ; /* 0x0000060104127810 */
/* 0x000fe20007ffe0ff */
/*02d0*/ IMAD.WIDE R22, R22, R7.reuse, c[0x0][0x168] ; /* 0x00005a0016167625 */
/* 0x080fe200078e0207 */
/*02e0*/ IADD3 R14, R4, 0x2, RZ ; /* 0x00000002040e7810 */
/* 0x000fe20007ffe0ff */
/*02f0*/ LDG.E R8, [R8.64] ; /* 0x0000000408087981 */
/* 0x0002e4000c1e1900 */
/*0300*/ IMAD.WIDE R24, R24, R7.reuse, c[0x0][0x168] ; /* 0x00005a0018187625 */
/* 0x080fe200078e0207 */
/*0310*/ IADD3 R26, R4.reuse, 0x302, RZ ; /* 0x00000302041a7810 */
/* 0x040fe20007ffe0ff */
/*0320*/ LDG.E R22, [R22.64] ; /* 0x0000000416167981 */
/* 0x000f22000c1e1900 */
/*0330*/ IADD3 R10, R4, 0x901, RZ ; /* 0x00000901040a7810 */
/* 0x001fe20007ffe0ff */
/*0340*/ IMAD.WIDE R12, R12, R7.reuse, c[0x0][0x168] ; /* 0x00005a000c0c7625 */
/* 0x080fe200078e0207 */
/*0350*/ IADD3 R28, R4.reuse, 0x602, RZ ; /* 0x00000602041c7810 */
/* 0x040fe20007ffe0ff */
/*0360*/ LDG.E R24, [R24.64] ; /* 0x0000000418187981 */
/* 0x000f62000c1e1900 */
/*0370*/ IADD3 R30, R4, 0x902, RZ ; /* 0x00000902041e7810 */
/* 0x000fe20007ffe0ff */
/*0380*/ IMAD.WIDE R16, R16, R7, c[0x0][0x168] ; /* 0x00005a0010107625 */
/* 0x000fc400078e0207 */
/*0390*/ LDG.E R20, [R12.64] ; /* 0x000000040c147981 */
/* 0x000164000c1e1900 */
/*03a0*/ IMAD.WIDE R18, R18, R7.reuse, c[0x0][0x168] ; /* 0x00005a0012127625 */
/* 0x080fe400078e0207 */
/*03b0*/ LDG.E R9, [R16.64] ; /* 0x0000000410097981 */
/* 0x002364000c1e1900 */
/*03c0*/ IMAD.WIDE R10, R10, R7.reuse, c[0x0][0x168] ; /* 0x00005a000a0a7625 */
/* 0x080fe400078e0207 */
/*03d0*/ LDG.E R21, [R18.64] ; /* 0x0000000412157981 */
/* 0x000164000c1e1900 */
/*03e0*/ IMAD.WIDE R14, R14, R7, c[0x0][0x168] ; /* 0x00005a000e0e7625 */
/* 0x000fc400078e0207 */
/*03f0*/ LDG.E R11, [R10.64] ; /* 0x000000040a0b7981 */
/* 0x000f64000c1e1900 */
/*0400*/ IMAD.WIDE R16, R26, R7.reuse, c[0x0][0x168] ; /* 0x00005a001a107625 */
/* 0x082fe400078e0207 */
/*0410*/ LDG.E R14, [R14.64] ; /* 0x000000040e0e7981 */
/* 0x000f64000c1e1900 */
/*0420*/ IMAD.WIDE R12, R28, R7.reuse, c[0x0][0x168] ; /* 0x00005a001c0c7625 */
/* 0x081fe400078e0207 */
/*0430*/ LDG.E R16, [R16.64] ; /* 0x0000000410107981 */
/* 0x000f64000c1e1900 */
/*0440*/ IMAD.WIDE R18, R30, R7, c[0x0][0x168] ; /* 0x00005a001e127625 */
/* 0x000fc400078e0207 */
/*0450*/ LDG.E R12, [R12.64] ; /* 0x000000040c0c7981 */
/* 0x000f68000c1e1900 */
/*0460*/ LDG.E R18, [R18.64] ; /* 0x0000000412127981 */
/* 0x000f62000c1e1900 */
/* R5 += 0x400. P1 = (R5 >= 0x7ffffbff, unsigned) is the x4 loop exit condition. R4 += 0xc00. */
/*0470*/ IADD3 R5, R5, 0x400, RZ ; /* 0x0000040005057810 */
/* 0x000fc80007ffe0ff */
/*0480*/ ISETP.GE.U32.AND P1, PT, R5, 0x7ffffbff, PT ; /* 0x7ffffbff0500780c */
/* 0x000fe40003f26070 */
/*0490*/ IADD3 R4, R4, 0xc00, RZ ; /* 0x00000c0004047810 */
/* 0x000fe20007ffe0ff */
/* Four interleaved sums of three loaded words each. Each sum starts as 0 + first word. The results end up in R7, R9, R21 and R11. */
/*04a0*/ FADD R7, RZ, R6 ; /* 0x00000006ff077221 */
/* 0x004fc80000000000 */
/*04b0*/ FADD R7, R7, R8 ; /* 0x0000000807077221 */
/* 0x008fe20000000000 */
/*04c0*/ FADD R22, RZ, R22 ; /* 0x00000016ff167221 */
/* 0x010fe20000000000 */
/*04d0*/ FADD R24, RZ, R24 ; /* 0x00000018ff187221 */
/* 0x020fe20000000000 */
/*04e0*/ FADD R20, RZ, R20 ; /* 0x00000014ff147221 */
/* 0x000fc80000000000 */
/*04f0*/ FADD R9, R20, R9 ; /* 0x0000000914097221 */
/* 0x000fe20000000000 */
/*0500*/ FADD R21, R22, R21 ; /* 0x0000001516157221 */
/* 0x000fe20000000000 */
/*0510*/ FADD R11, R24, R11 ; /* 0x0000000b180b7221 */
/* 0x000fe20000000000 */
/*0520*/ FADD R7, R7, R14 ; /* 0x0000000e07077221 */
/* 0x000fe20000000000 */
/*0530*/ FADD R9, R9, R16 ; /* 0x0000001009097221 */
/* 0x000fe20000000000 */
/*0540*/ FADD R21, R21, R12 ; /* 0x0000000c15157221 */
/* 0x000fe20000000000 */
/*0550*/ FADD R11, R11, R18 ; /* 0x000000120b0b7221 */
/* 0x000fe40000000000 */
/* Store the four sums to shared memory at R2, R2 + 0x400, R2 + 0x800 and R2 + 0xc00. Then R2 += 0x1000. */
/*0560*/ STS [R2], R7 ; /* 0x0000000702007388 */
/* 0x000fe80000000800 */
/*0570*/ STS [R2+0x400], R9 ; /* 0x0004000902007388 */
/* 0x000fe80000000800 */
/*0580*/ STS [R2+0x800], R21 ; /* 0x0008001502007388 */
/* 0x000fe80000000800 */
/*0590*/ STS [R2+0xc00], R11 ; /* 0x000c000b02007388 */
/* 0x0001e40000000800 */
/*05a0*/ IADD3 R2, R2, 0x1000, RZ ; /* 0x0000100002027810 */
/* 0x001fe20007ffe0ff */
/* Repeat the x4 body while P1 is clear. */
/*05b0*/ @!P1 BRA 0x230 ; /* 0xfffffc7000009947 */
/* 0x000fea000383ffff */
/* Close convergence region B1 opened at 0x0160. */
/*05c0*/ BSYNC B1 ; /* 0x0000000000017941 */
/* 0x000fea0003800000 */
/* R6 = 0x7ffffeff - R5. P1 = (R6 > 0x100, signed) selects the x2 unrolled step. Otherwise branch to 0x07f0. */
/* A new convergence region B1 is opened with target 0x0800. The matching BSYNC B1 is at 0x07f0. */
/*05d0*/ IADD3 R6, -R5, 0x7ffffeff, RZ ; /* 0x7ffffeff05067810 */
/* 0x000fe20007ffe1ff */
/*05e0*/ BSSY B1, 0x800 ; /* 0x0000021000017945 */
/* 0x000fe60003800000 */
/*05f0*/ ISETP.GT.AND P1, PT, R6, 0x100, PT ; /* 0x000001000600780c */
/* 0x000fda0003f24270 */
/*0600*/ @!P1 BRA 0x7f0 ; /* 0x000001e000009947 */
/* 0x000fea0003800000 */
/* x2 unrolled step (0x0610-0x07e0), executed at most once: there is no backward branch in this range. */
/* Six loads at element indices R4 + {0,1,2} and R4 + 0x300 + {0,1,2}, scaled by R11 = 4 and added to c[0x0][0x168]. */
/*0610*/ MOV R11, 0x4 ; /* 0x00000004000b7802 */
/* 0x000fe40000000f00 */
/*0620*/ IADD3 R6, R4.reuse, 0x300, RZ ; /* 0x0000030004067810 */
/* 0x040fe40007ffe0ff */
/*0630*/ IADD3 R14, R4.reuse, 0x1, RZ ; /* 0x00000001040e7810 */
/* 0x040fe20007ffe0ff */
/*0640*/ IMAD.WIDE R12, R4.reuse, R11, c[0x0][0x168] ; /* 0x00005a00040c7625 */
/* 0x040fe200078e020b */
/*0650*/ IADD3 R8, R4, 0x301, RZ ; /* 0x0000030104087810 */
/* 0x000fc60007ffe0ff */
/*0660*/ IMAD.WIDE R6, R6, R11.reuse, c[0x0][0x168] ; /* 0x00005a0006067625 */
/* 0x080fe200078e020b */
/*0670*/ IADD3 R16, R4.reuse, 0x2, RZ ; /* 0x0000000204107810 */
/* 0x040fe20007ffe0ff */
/*0680*/ LDG.E R12, [R12.64] ; /* 0x000000040c0c7981 */
/* 0x000ea2000c1e1900 */
/*0690*/ IADD3 R10, R4, 0x302, RZ ; /* 0x00000302040a7810 */
/* 0x000fe20007ffe0ff */
/*06a0*/ IMAD.WIDE R14, R14, R11.reuse, c[0x0][0x168] ; /* 0x00005a000e0e7625 */
/* 0x080fe400078e020b */
/*06b0*/ LDG.E R6, [R6.64] ; /* 0x0000000406067981 */
/* 0x000ee4000c1e1900 */
/*06c0*/ IMAD.WIDE R8, R8, R11.reuse, c[0x0][0x168] ; /* 0x00005a0008087625 */
/* 0x080fe400078e020b */
/*06d0*/ LDG.E R15, [R14.64] ; /* 0x000000040e0f7981 */
/* 0x000f24000c1e1900 */
/*06e0*/ IMAD.WIDE R16, R16, R11, c[0x0][0x168] ; /* 0x00005a0010107625 */
/* 0x000fc400078e020b */
/*06f0*/ LDG.E R8, [R8.64] ; /* 0x0000000408087981 */
/* 0x000f64000c1e1900 */
/*0700*/ IMAD.WIDE R10, R10, R11, c[0x0][0x168] ; /* 0x00005a000a0a7625 */
/* 0x000fe400078e020b */
/*0710*/ LDG.E R17, [R16.64] ; /* 0x0000000410117981 */
/* 0x000f68000c1e1900 */
/*0720*/ LDG.E R10, [R10.64] ; /* 0x000000040a0a7981 */
/* 0x000f62000c1e1900 */
/* NOTE(review): this PLOP3 (LUT 0x8 over PT, PT, PT) presumably clears P0 - confirm. Then R5 += 0x200 and R4 += 0x600. */
/*0730*/ PLOP3.LUT P0, PT, PT, PT, PT, 0x8, 0x0 ; /* 0x000000000000781c */
/* 0x000fe40003f0e170 */
/*0740*/ IADD3 R5, R5, 0x200, RZ ; /* 0x0000020005057810 */
/* 0x000fc40007ffe0ff */
/*0750*/ IADD3 R4, R4, 0x600, RZ ; /* 0x0000060004047810 */
/* 0x000fe20007ffe0ff */
/* Two sums of three words: R7 = ((0 + R12) + R15) + R17 and R13 = ((0 + R6) + R8) + R10. */
/*0760*/ FADD R12, RZ, R12 ; /* 0x0000000cff0c7221 */
/* 0x004fe20000000000 */
/*0770*/ FADD R13, RZ, R6 ; /* 0x00000006ff0d7221 */
/* 0x008fc60000000000 */
/*0780*/ FADD R12, R12, R15 ; /* 0x0000000f0c0c7221 */
/* 0x010fe20000000000 */
/*0790*/ FADD R13, R13, R8 ; /* 0x000000080d0d7221 */
/* 0x020fc60000000000 */
/*07a0*/ FADD R7, R12, R17 ; /* 0x000000110c077221 */
/* 0x000fe20000000000 */
/*07b0*/ FADD R13, R13, R10 ; /* 0x0000000a0d0d7221 */
/* 0x000fc80000000000 */
/* Store both sums to shared memory at R2 and R2 + 0x400. Then R2 += 0x800. */
/*07c0*/ STS [R2], R7 ; /* 0x0000000702007388 */
/* 0x000fe80000000800 */
/*07d0*/ STS [R2+0x400], R13 ; /* 0x0004000d02007388 */
/* 0x0001e40000000800 */
/*07e0*/ IADD3 R2, R2, 0x800, RZ ; /* 0x0000080002027810 */
/* 0x001fc40007ffe0ff */
/* Close convergence region B1 opened at 0x05e0. */
/*07f0*/ BSYNC B1 ; /* 0x0000000000017941 */
/* 0x000fea0003800000 */
/* P0 = (R5 > 0x7ffffeff, unsigned) OR the previous P0. If P0 is clear, skip the x1 loop and go to 0x0940. */
/*0800*/ ISETP.GT.U32.OR P0, PT, R5, 0x7ffffeff, P0 ; /* 0x7ffffeff0500780c */
/* 0x000fda0000704470 */
/*0810*/ @!P0 BRA 0x940 ; /* 0x0000012000008947 */
/* 0x000fea0003800000 */
/* x1 loop body (0x0820-0x0930): three loads at element indices R4 + {0,1,2}, scaled by R11 = 4 and added to c[0x0][0x168]. */
/*0820*/ MOV R11, 0x4 ; /* 0x00000004000b7802 */
/* 0x000fe40000000f00 */
/*0830*/ IADD3 R8, R4.reuse, 0x1, RZ ; /* 0x0000000104087810 */
/* 0x040fe40007ffe0ff */
/*0840*/ IADD3 R10, R4.reuse, 0x2, RZ ; /* 0x00000002040a7810 */
/* 0x040fe20007ffe0ff */
/*0850*/ IMAD.WIDE R6, R4, R11, c[0x0][0x168] ; /* 0x00005a0004067625 */
/* 0x000fc800078e020b */
/*0860*/ IMAD.WIDE R8, R8, R11.reuse, c[0x0][0x168] ; /* 0x00005a0008087625 */
/* 0x080fe400078e020b */
/*0870*/ LDG.E R6, [R6.64] ; /* 0x0000000406067981 */
/* 0x000ea4000c1e1900 */
/*0880*/ IMAD.WIDE R10, R10, R11, c[0x0][0x168] ; /* 0x00005a000a0a7625 */
/* 0x000fe400078e020b */
/*0890*/ LDG.E R9, [R8.64] ; /* 0x0000000408097981 */
/* 0x000ee8000c1e1900 */
/*08a0*/ LDG.E R11, [R10.64] ; /* 0x000000040a0b7981 */
/* 0x000f22000c1e1900 */
/* R5 += 0x100 and R4 += 0x300. P0 = (R5 > 0x7ffffeff, unsigned) is the x1 loop-continue condition. */
/*08b0*/ IADD3 R5, R5, 0x100, RZ ; /* 0x0000010005057810 */
/* 0x000fc40007ffe0ff */
/*08c0*/ IADD3 R4, R4, 0x300, RZ ; /* 0x0000030004047810 */
/* 0x000fe40007ffe0ff */
/*08d0*/ ISETP.GT.U32.AND P0, PT, R5, 0x7ffffeff, PT ; /* 0x7ffffeff0500780c */
/* 0x000fe20003f04070 */
/* R13 = ((0 + R6) + R9) + R11, stored to shared memory at [R2]. Then R2 += 0x400. */
/*08e0*/ FADD R12, RZ, R6 ; /* 0x00000006ff0c7221 */
/* 0x004fc80000000000 */
/*08f0*/ FADD R12, R12, R9 ; /* 0x000000090c0c7221 */
/* 0x008fc80000000000 */
/*0900*/ FADD R13, R12, R11 ; /* 0x0000000b0c0d7221 */
/* 0x010fca0000000000 */
/*0910*/ STS [R2], R13 ; /* 0x0000000d02007388 */
/* 0x0001e40000000800 */
/*0920*/ IADD3 R2, R2, 0x400, RZ ; /* 0x0000040002027810 */
/* 0x001fe20007ffe0ff */
/* Repeat the x1 body while P0 is set. */
/*0930*/ @P0 BRA 0x820 ; /* 0xfffffee000000947 */
/* 0x000fea000383ffff */
/* Close convergence region B0 opened at 0x0030. Then barrier 0 synchronization between phase 1 and phase 2. */
/*0940*/ BSYNC B0 ; /* 0x0000000000007941 */
/* 0x002fea0003800000 */
/*0950*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */
/* 0x000fec0000000000 */
/* P0 = (R0 > 0, signed). Threads with P0 set branch to 0x0b00 and skip phase 2. */
/* A new convergence region B0 is opened with target 0x0b10. The matching BSYNC B0 is at 0x0b00. */
/*0960*/ ISETP.GT.AND P0, PT, R0, RZ, PT ; /* 0x000000ff0000720c */
/* 0x000fe20003f04270 */
/*0970*/ BSSY B0, 0xb10 ; /* 0x0000019000007945 */
/* 0x000fd80003800000 */
/*0980*/ @P0 BRA 0xb00 ; /* 0x0000017000000947 */
/* 0x000fea0003800000 */
/* R2:R3 = c[0x0][0x198] + 4 * R3. R3 still holds the block index x read at 0x0040. */
/* R13 = word loaded from that address. It is the running sum for the accumulation below. */
/*0990*/ MOV R2, 0x4 ; /* 0x0000000400027802 */
/* 0x000fca0000000f00 */
/*09a0*/ IMAD.WIDE R2, R3, R2, c[0x0][0x198] ; /* 0x0000660003027625 */
/* 0x000fca00078e0202 */
/*09b0*/ LDG.E R13, [R2.64] ; /* 0x00000004020d7981 */
/* 0x000162000c1e1900 */
/* Open convergence region B1 with target 0x0af0. The matching BSYNC B1 is at 0x0ae0. */
/*09c0*/ BSSY B1, 0xaf0 ; /* 0x0000012000017945 */
/* 0x000fe40003800000 */
/* Outer loop head (0x09d0): R12 = 0 is the shared-memory byte offset for the inner loop. */
/*09d0*/ MOV R12, RZ ; /* 0x000000ff000c7202 */
/* 0x000fca0000000f00 */
/* Inner loop (0x09e0-0x0aa0): two LDS.128 read eight words at [R12] and [R12 + 0x10] into R8-R11 and R4-R7. */
/* R12 += 0x20 and P0 = (R12 != 0x400), so the loop covers shared byte offsets 0x0-0x3ff. */
/*09e0*/ LDS.128 R8, [R12] ; /* 0x000000000c087984 */
/* 0x000e680000000c00 */
/*09f0*/ LDS.128 R4, [R12+0x10] ; /* 0x000010000c047984 */
/* 0x0004e40000000c00 */
/*0a00*/ IADD3 R12, R12, 0x20, RZ ; /* 0x000000200c0c7810 */
/* 0x004fc80007ffe0ff */
/*0a10*/ ISETP.NE.AND P0, PT, R12, 0x400, PT ; /* 0x000004000c00780c */
/* 0x000fe20003f05270 */
/* Serial FADD chain: the eight words are added to R13 one after another in address order. The new running sum is left in R13. */
/*0a20*/ FADD R8, R8, R13 ; /* 0x0000000d08087221 */
/* 0x022fc80000000000 */
/*0a30*/ FADD R9, R9, R8 ; /* 0x0000000809097221 */
/* 0x000fc80000000000 */
/*0a40*/ FADD R10, R10, R9 ; /* 0x000000090a0a7221 */
/* 0x000fc80000000000 */
/*0a50*/ FADD R11, R11, R10 ; /* 0x0000000a0b0b7221 */
/* 0x000fc80000000000 */
/*0a60*/ FADD R4, R11, R4 ; /* 0x000000040b047221 */
/* 0x008fc80000000000 */
/*0a70*/ FADD R5, R5, R4 ; /* 0x0000000405057221 */
/* 0x000fc80000000000 */
/*0a80*/ FADD R6, R6, R5 ; /* 0x0000000506067221 */
/* 0x000fc80000000000 */
/*0a90*/ FADD R13, R7, R6 ; /* 0x00000006070d7221 */
/* 0x000fe20000000000 */
/* Repeat the inner loop while P0 is set. */
/*0aa0*/ @P0 BRA 0x9e0 ; /* 0xffffff3000000947 */
/* 0x000fea000383ffff */
/* Outer loop test: P0 = (R0 >= -0xff, signed) is evaluated on R0 before R0 += 0x100. Branch back to 0x09d0 while P0 is clear. */
/*0ab0*/ ISETP.GE.AND P0, PT, R0.reuse, -0xff, PT ; /* 0xffffff010000780c */
/* 0x040fe40003f06270 */
/*0ac0*/ IADD3 R0, R0, 0x100, RZ ; /* 0x0000010000007810 */
/* 0x000fd60007ffe0ff */
/*0ad0*/ @!P0 BRA 0x9d0 ; /* 0xfffffef000008947 */
/* 0x000fea000383ffff */
/* Close convergence region B1 opened at 0x09c0. */
/*0ae0*/ BSYNC B1 ; /* 0x0000000000017941 */
/* 0x000fea0003800000 */
/* Store the running sum R13 back to the address in R2:R3 that it was initially loaded from. */
/*0af0*/ STG.E [R2.64], R13 ; /* 0x0000000d02007986 */
/* 0x0003e4000c101904 */
/* Close convergence region B0 opened at 0x0970. Then a final barrier 0 synchronization and EXIT. */
/*0b00*/ BSYNC B0 ; /* 0x0000000000007941 */
/* 0x000fea0003800000 */
/*0b10*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */
/* 0x000fec0000000000 */
/*0b20*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/* Branch-to-self after EXIT, followed by NOPs up to address 0x0bf0. */
/*0b30*/ BRA 0xb30; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*0b40*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0b50*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0b60*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0b70*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0b80*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0b90*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0ba0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0bb0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0bc0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0bd0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0be0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0bf0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
I would not suggest filing a bug with NVIDIA based on an incomplete test case. I can’t imagine why an “ordinary” CUDA test case can’t be built around this, and indeed it could be useful for inspection/debug. I don’t expect anyone at NVIDIA is going to spend time studying your PTX to reverse-engineer the set of “proper” allocations needed to run a presumed-correct version of the test.
You would almost certainly get asked to provide a complete test case using a NVIDIA toolchain, if you want anyone to look at it.