I am trying to analyse the code generated by a custom language/framework, which through various front-end stages produces PTX as its end product. This PTX is then compiled to a cubin using ptxas. As an experiment, I was wondering whether the register usage of the generated SASS in the cubin can be forcibly reduced.
The ptxas command (I am trying to cap the register usage at 40 per thread):
$ ptxas -lineinfo -v --maxrregcount 40 --gpu-name=sm_86 /tmp/add_kernel_10.ptx -o /tmp/add_kernel_10.cubin 2> /tmp/add_kernel_10.log
$ cat /tmp/add_kernel_10.log
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function 'add_kernel_10' for 'sm_86'
ptxas info : Function properties for add_kernel_10
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 44 registers, 484 bytes cmem[0]
It uses 44 registers irrespective of --maxrregcount 40. Is ptxas optimizing to avoid register spills (hence the 0 bytes spill stores, 0 bytes spill loads), and is that overriding the maxrregcount option?
The generated PTX does not use the .maxnreg directive, so I don't think anything explicit in the PTX code is overriding the maxrregcount compiler option, as mentioned here.
The above mentions that the compiler option applies to all __global__ functions in a file:
Register usage can also be controlled for all __global__ functions in a file using the maxrregcount compiler option.
Does my kernel satisfy that criterion?
$ cat /tmp/add_kernel_10.ptx
//
// Generated by LLVM NVPTX Back-End
//
.version 8.1
.target sm_86
.address_size 64
// .globl add_kernel_10
.visible .entry add_kernel_10(
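Going back to the .maxnreg point above: my understanding of the PTX ISA is that such a per-entry cap would sit with the other performance-tuning directives, between the parameter list and the kernel body, next to directives like .maxntid. Nothing resembling the following appears anywhere in my file (a hypothetical sketch, not my actual PTX):
.visible .entry add_kernel_10(
    // ... parameter list ...
)
.maxntid 128, 1, 1
.maxnreg 40    // hypothetical per-thread register cap -- absent from the generated PTX
{
    // ... kernel body ...
}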
I also tried the same PTX-to-cubin compilation using nvcc:
$ nvcc -arch=sm_86 -cubin /tmp/add_kernel_10.ptx --maxrregcount 40 -o /tmp/add_kernel_10.cubin 2> /tmp/add_kernel_10.log
In this case the log file is empty, but an ncu report shows that the kernel still uses 44 registers instead of the 40 I requested.
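To double-check what ncu reports, I also intend to inspect the cubin directly; if I understand the tooling correctly, cuobjdump can dump per-function resource usage from the compiled binary (output layout may differ between toolkit versions):
$ cuobjdump --dump-resource-usage /tmp/add_kernel_10.cubin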
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Feb__7_19:32:13_PST_2023
Cuda compilation tools, release 12.1, V12.1.66
Build cuda_12.1.r12.1/compiler.32415258_0
$ ptxas --version
ptxas: NVIDIA (R) Ptx optimizing assembler
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Feb__7_19:30:12_PST_2023
Cuda compilation tools, release 12.1, V12.1.66
Build cuda_12.1.r12.1/compiler.32415258_0
$ cat /tmp/add_kernel_0.ptx
//
// Generated by LLVM NVPTX Back-End
//
.version 8.1
.target sm_86
.address_size 64
// .globl add_kernel_0
.visible .entry add_kernel_0(
.param .u64 add_kernel_0_param_0,
.param .u64 add_kernel_0_param_1,
.param .u64 add_kernel_0_param_2,
.param .u64 add_kernel_0_param_3,
.param .u64 add_kernel_0_param_4,
.param .u64 add_kernel_0_param_5,
.param .u64 add_kernel_0_param_6,
.param .u64 add_kernel_0_param_7,
.param .u64 add_kernel_0_param_8,
.param .u64 add_kernel_0_param_9,
.param .u64 add_kernel_0_param_10,
.param .u64 add_kernel_0_param_11,
.param .u64 add_kernel_0_param_12,
.param .u64 add_kernel_0_param_13,
.param .u64 add_kernel_0_param_14,
.param .u64 add_kernel_0_param_15,
.param .u32 add_kernel_0_param_16
)
.maxntid 128, 1, 1
{
.reg .pred %p<33>;
.reg .b32 %r<137>;
.reg .f32 %f<233>;
.reg .b64 %rd<50>;
.loc 1 9 0
$L__func_begin0:
.loc 1 9 0
ld.param.u64 %rd33, [add_kernel_0_param_0];
ld.param.u64 %rd34, [add_kernel_0_param_1];
$L__tmp0:
.loc 1 31 24
// begin inline asm
mov.u32 %r1, %ctaid.x;
// end inline asm
.loc 1 36 24
shl.b32 %r130, %r1, 10;
ld.param.u64 %rd35, [add_kernel_0_param_2];
ld.param.u64 %rd36, [add_kernel_0_param_3];
.loc 1 37 41
mov.u32 %r131, %tid.x;
shl.b32 %r132, %r131, 2;
ld.param.u64 %rd37, [add_kernel_0_param_4];
and.b32 %r133, %r132, 508;
ld.param.u64 %rd38, [add_kernel_0_param_5];
.loc 1 37 28
or.b32 %r134, %r130, %r133;
ld.param.u64 %rd39, [add_kernel_0_param_6];
or.b32 %r135, %r134, 512;
ld.param.u64 %rd40, [add_kernel_0_param_7];
ld.param.u32 %r136, [add_kernel_0_param_16];
.loc 1 39 21
setp.lt.s32 %p1, %r134, %r136;
ld.param.u64 %rd41, [add_kernel_0_param_8];
setp.lt.s32 %p2, %r135, %r136;
ld.param.u64 %rd42, [add_kernel_0_param_9];
ld.param.u64 %rd43, [add_kernel_0_param_10];
.loc 1 42 26
mul.wide.s32 %rd44, %r134, 4;
add.s64 %rd1, %rd33, %rd44;
ld.param.u64 %rd45, [add_kernel_0_param_11];
add.s64 %rd2, %rd1, 2048;
ld.param.u64 %rd46, [add_kernel_0_param_12];
.loc 1 42 17
// begin inline asm
mov.u32 %r2, 0x0;
mov.u32 %r3, 0x0;
mov.u32 %r4, 0x0;
mov.u32 %r5, 0x0;
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
// end inline asm
ld.param.u64 %rd47, [add_kernel_0_param_13];
ld.param.u64 %rd48, [add_kernel_0_param_14];
ld.param.u64 %rd49, [add_kernel_0_param_15];
mov.b32 %f1, %r2;
mov.b32 %f2, %r3;
mov.b32 %f3, %r4;
mov.b32 %f4, %r5;
// begin inline asm
mov.u32 %r6, 0x0;
mov.u32 %r7, 0x0;
mov.u32 %r8, 0x0;
mov.u32 %r9, 0x0;
@%p2 ld.global.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd2 + 0 ];
// end inline asm
mov.b32 %f5, %r6;
mov.b32 %f6, %r7;
mov.b32 %f7, %r8;
mov.b32 %f8, %r9;
.loc 1 43 26
add.s64 %rd3, %rd34, %rd44;
add.s64 %rd4, %rd3, 2048;
.loc 1 43 17
// begin inline asm
mov.u32 %r10, 0x0;
mov.u32 %r11, 0x0;
mov.u32 %r12, 0x0;
mov.u32 %r13, 0x0;
@%p1 ld.global.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd3 + 0 ];
// end inline asm
mov.b32 %f9, %r10;
mov.b32 %f10, %r11;
mov.b32 %f11, %r12;
mov.b32 %f12, %r13;
// begin inline asm
mov.u32 %r14, 0x0;
mov.u32 %r15, 0x0;
mov.u32 %r16, 0x0;
mov.u32 %r17, 0x0;
@%p2 ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd4 + 0 ];
// end inline asm
mov.b32 %f13, %r14;
mov.b32 %f14, %r15;
mov.b32 %f15, %r16;
mov.b32 %f16, %r17;
.loc 1 44 26
add.s64 %rd5, %rd35, %rd44;
add.s64 %rd6, %rd5, 2048;
.loc 1 44 17
// begin inline asm
mov.u32 %r18, 0x0;
mov.u32 %r19, 0x0;
mov.u32 %r20, 0x0;
mov.u32 %r21, 0x0;
@%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd5 + 0 ];
// end inline asm
mov.b32 %f17, %r18;
mov.b32 %f18, %r19;
mov.b32 %f19, %r20;
mov.b32 %f20, %r21;
// begin inline asm
mov.u32 %r22, 0x0;
mov.u32 %r23, 0x0;
mov.u32 %r24, 0x0;
mov.u32 %r25, 0x0;
@%p2 ld.global.v4.b32 { %r22, %r23, %r24, %r25 }, [ %rd6 + 0 ];
// end inline asm
mov.b32 %f21, %r22;
mov.b32 %f22, %r23;
mov.b32 %f23, %r24;
mov.b32 %f24, %r25;
.loc 1 45 26
add.s64 %rd7, %rd36, %rd44;
add.s64 %rd8, %rd7, 2048;
.loc 1 45 17
// begin inline asm
mov.u32 %r26, 0x0;
mov.u32 %r27, 0x0;
mov.u32 %r28, 0x0;
mov.u32 %r29, 0x0;
@%p1 ld.global.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd7 + 0 ];
// end inline asm
mov.b32 %f25, %r26;
mov.b32 %f26, %r27;
mov.b32 %f27, %r28;
mov.b32 %f28, %r29;
// begin inline asm
mov.u32 %r30, 0x0;
mov.u32 %r31, 0x0;
mov.u32 %r32, 0x0;
mov.u32 %r33, 0x0;
@%p2 ld.global.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd8 + 0 ];
// end inline asm
mov.b32 %f29, %r30;
mov.b32 %f30, %r31;
mov.b32 %f31, %r32;
mov.b32 %f32, %r33;
.loc 1 46 26
add.s64 %rd9, %rd37, %rd44;
add.s64 %rd10, %rd9, 2048;
.loc 1 46 17
// begin inline asm
mov.u32 %r34, 0x0;
mov.u32 %r35, 0x0;
mov.u32 %r36, 0x0;
mov.u32 %r37, 0x0;
@%p1 ld.global.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd9 + 0 ];
// end inline asm
mov.b32 %f33, %r34;
mov.b32 %f34, %r35;
mov.b32 %f35, %r36;
mov.b32 %f36, %r37;
// begin inline asm
mov.u32 %r38, 0x0;
mov.u32 %r39, 0x0;
mov.u32 %r40, 0x0;
mov.u32 %r41, 0x0;
@%p2 ld.global.v4.b32 { %r38, %r39, %r40, %r41 }, [ %rd10 + 0 ];
// end inline asm
mov.b32 %f37, %r38;
mov.b32 %f38, %r39;
mov.b32 %f39, %r40;
mov.b32 %f40, %r41;
.loc 1 47 26
add.s64 %rd11, %rd38, %rd44;
add.s64 %rd12, %rd11, 2048;
.loc 1 47 17
// begin inline asm
mov.u32 %r42, 0x0;
mov.u32 %r43, 0x0;
mov.u32 %r44, 0x0;
mov.u32 %r45, 0x0;
@%p1 ld.global.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd11 + 0 ];
// end inline asm
mov.b32 %f41, %r42;
mov.b32 %f42, %r43;
mov.b32 %f43, %r44;
mov.b32 %f44, %r45;
// begin inline asm
mov.u32 %r46, 0x0;
mov.u32 %r47, 0x0;
mov.u32 %r48, 0x0;
mov.u32 %r49, 0x0;
@%p2 ld.global.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd12 + 0 ];
// end inline asm
mov.b32 %f45, %r46;
mov.b32 %f46, %r47;
mov.b32 %f47, %r48;
mov.b32 %f48, %r49;
.loc 1 48 26
add.s64 %rd13, %rd39, %rd44;
add.s64 %rd14, %rd13, 2048;
.loc 1 48 17
// begin inline asm
mov.u32 %r50, 0x0;
mov.u32 %r51, 0x0;
mov.u32 %r52, 0x0;
mov.u32 %r53, 0x0;
@%p1 ld.global.v4.b32 { %r50, %r51, %r52, %r53 }, [ %rd13 + 0 ];
// end inline asm
mov.b32 %f49, %r50;
mov.b32 %f50, %r51;
mov.b32 %f51, %r52;
mov.b32 %f52, %r53;
// begin inline asm
mov.u32 %r54, 0x0;
mov.u32 %r55, 0x0;
mov.u32 %r56, 0x0;
mov.u32 %r57, 0x0;
@%p2 ld.global.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd14 + 0 ];
// end inline asm
mov.b32 %f53, %r54;
mov.b32 %f54, %r55;
mov.b32 %f55, %r56;
mov.b32 %f56, %r57;
.loc 1 49 26
add.s64 %rd15, %rd40, %rd44;
add.s64 %rd16, %rd15, 2048;
.loc 1 49 17
// begin inline asm
mov.u32 %r58, 0x0;
mov.u32 %r59, 0x0;
mov.u32 %r60, 0x0;
mov.u32 %r61, 0x0;
@%p1 ld.global.v4.b32 { %r58, %r59, %r60, %r61 }, [ %rd15 + 0 ];
// end inline asm
mov.b32 %f57, %r58;
mov.b32 %f58, %r59;
mov.b32 %f59, %r60;
mov.b32 %f60, %r61;
// begin inline asm
mov.u32 %r62, 0x0;
mov.u32 %r63, 0x0;
mov.u32 %r64, 0x0;
mov.u32 %r65, 0x0;
@%p2 ld.global.v4.b32 { %r62, %r63, %r64, %r65 }, [ %rd16 + 0 ];
// end inline asm
mov.b32 %f61, %r62;
mov.b32 %f62, %r63;
mov.b32 %f63, %r64;
mov.b32 %f64, %r65;
.loc 1 50 26
add.s64 %rd17, %rd41, %rd44;
add.s64 %rd18, %rd17, 2048;
.loc 1 50 17
// begin inline asm
mov.u32 %r66, 0x0;
mov.u32 %r67, 0x0;
mov.u32 %r68, 0x0;
mov.u32 %r69, 0x0;
@%p1 ld.global.v4.b32 { %r66, %r67, %r68, %r69 }, [ %rd17 + 0 ];
// end inline asm
mov.b32 %f65, %r66;
mov.b32 %f66, %r67;
mov.b32 %f67, %r68;
mov.b32 %f68, %r69;
// begin inline asm
mov.u32 %r70, 0x0;
mov.u32 %r71, 0x0;
mov.u32 %r72, 0x0;
mov.u32 %r73, 0x0;
@%p2 ld.global.v4.b32 { %r70, %r71, %r72, %r73 }, [ %rd18 + 0 ];
// end inline asm
mov.b32 %f69, %r70;
mov.b32 %f70, %r71;
mov.b32 %f71, %r72;
mov.b32 %f72, %r73;
.loc 1 51 26
add.s64 %rd19, %rd42, %rd44;
add.s64 %rd20, %rd19, 2048;
.loc 1 51 17
// begin inline asm
mov.u32 %r74, 0x0;
mov.u32 %r75, 0x0;
mov.u32 %r76, 0x0;
mov.u32 %r77, 0x0;
@%p1 ld.global.v4.b32 { %r74, %r75, %r76, %r77 }, [ %rd19 + 0 ];
// end inline asm
mov.b32 %f73, %r74;
mov.b32 %f74, %r75;
mov.b32 %f75, %r76;
mov.b32 %f76, %r77;
// begin inline asm
mov.u32 %r78, 0x0;
mov.u32 %r79, 0x0;
mov.u32 %r80, 0x0;
mov.u32 %r81, 0x0;
@%p2 ld.global.v4.b32 { %r78, %r79, %r80, %r81 }, [ %rd20 + 0 ];
// end inline asm
mov.b32 %f77, %r78;
mov.b32 %f78, %r79;
mov.b32 %f79, %r80;
mov.b32 %f80, %r81;
.loc 1 52 28
add.s64 %rd21, %rd43, %rd44;
add.s64 %rd22, %rd21, 2048;
.loc 1 52 18
// begin inline asm
mov.u32 %r82, 0x0;
mov.u32 %r83, 0x0;
mov.u32 %r84, 0x0;
mov.u32 %r85, 0x0;
@%p1 ld.global.v4.b32 { %r82, %r83, %r84, %r85 }, [ %rd21 + 0 ];
// end inline asm
mov.b32 %f81, %r82;
mov.b32 %f82, %r83;
mov.b32 %f83, %r84;
mov.b32 %f84, %r85;
// begin inline asm
mov.u32 %r86, 0x0;
mov.u32 %r87, 0x0;
mov.u32 %r88, 0x0;
mov.u32 %r89, 0x0;
@%p2 ld.global.v4.b32 { %r86, %r87, %r88, %r89 }, [ %rd22 + 0 ];
// end inline asm
mov.b32 %f85, %r86;
mov.b32 %f86, %r87;
mov.b32 %f87, %r88;
mov.b32 %f88, %r89;
.loc 1 53 28
add.s64 %rd23, %rd45, %rd44;
add.s64 %rd24, %rd23, 2048;
.loc 1 53 18
// begin inline asm
mov.u32 %r90, 0x0;
mov.u32 %r91, 0x0;
mov.u32 %r92, 0x0;
mov.u32 %r93, 0x0;
@%p1 ld.global.v4.b32 { %r90, %r91, %r92, %r93 }, [ %rd23 + 0 ];
// end inline asm
mov.b32 %f89, %r90;
mov.b32 %f90, %r91;
mov.b32 %f91, %r92;
mov.b32 %f92, %r93;
// begin inline asm
mov.u32 %r94, 0x0;
mov.u32 %r95, 0x0;
mov.u32 %r96, 0x0;
mov.u32 %r97, 0x0;
@%p2 ld.global.v4.b32 { %r94, %r95, %r96, %r97 }, [ %rd24 + 0 ];
// end inline asm
mov.b32 %f93, %r94;
mov.b32 %f94, %r95;
mov.b32 %f95, %r96;
mov.b32 %f96, %r97;
.loc 1 54 28
add.s64 %rd25, %rd46, %rd44;
add.s64 %rd26, %rd25, 2048;
.loc 1 54 18
// begin inline asm
mov.u32 %r98, 0x0;
mov.u32 %r99, 0x0;
mov.u32 %r100, 0x0;
mov.u32 %r101, 0x0;
@%p1 ld.global.v4.b32 { %r98, %r99, %r100, %r101 }, [ %rd25 + 0 ];
// end inline asm
mov.b32 %f97, %r98;
mov.b32 %f98, %r99;
mov.b32 %f99, %r100;
mov.b32 %f100, %r101;
// begin inline asm
mov.u32 %r102, 0x0;
mov.u32 %r103, 0x0;
mov.u32 %r104, 0x0;
mov.u32 %r105, 0x0;
@%p2 ld.global.v4.b32 { %r102, %r103, %r104, %r105 }, [ %rd26 + 0 ];
// end inline asm
mov.b32 %f101, %r102;
mov.b32 %f102, %r103;
mov.b32 %f103, %r104;
mov.b32 %f104, %r105;
.loc 1 55 28
add.s64 %rd27, %rd47, %rd44;
add.s64 %rd28, %rd27, 2048;
.loc 1 55 18
// begin inline asm
mov.u32 %r106, 0x0;
mov.u32 %r107, 0x0;
mov.u32 %r108, 0x0;
mov.u32 %r109, 0x0;
@%p1 ld.global.v4.b32 { %r106, %r107, %r108, %r109 }, [ %rd27 + 0 ];
// end inline asm
mov.b32 %f105, %r106;
mov.b32 %f106, %r107;
mov.b32 %f107, %r108;
mov.b32 %f108, %r109;
// begin inline asm
mov.u32 %r110, 0x0;
mov.u32 %r111, 0x0;
mov.u32 %r112, 0x0;
mov.u32 %r113, 0x0;
@%p2 ld.global.v4.b32 { %r110, %r111, %r112, %r113 }, [ %rd28 + 0 ];
// end inline asm
mov.b32 %f109, %r110;
mov.b32 %f110, %r111;
mov.b32 %f111, %r112;
mov.b32 %f112, %r113;
.loc 1 56 28
add.s64 %rd29, %rd48, %rd44;
add.s64 %rd30, %rd29, 2048;
.loc 1 56 18
// begin inline asm
mov.u32 %r114, 0x0;
mov.u32 %r115, 0x0;
mov.u32 %r116, 0x0;
mov.u32 %r117, 0x0;
@%p1 ld.global.v4.b32 { %r114, %r115, %r116, %r117 }, [ %rd29 + 0 ];
// end inline asm
mov.b32 %f113, %r114;
mov.b32 %f114, %r115;
mov.b32 %f115, %r116;
mov.b32 %f116, %r117;
// begin inline asm
mov.u32 %r118, 0x0;
mov.u32 %r119, 0x0;
mov.u32 %r120, 0x0;
mov.u32 %r121, 0x0;
@%p2 ld.global.v4.b32 { %r118, %r119, %r120, %r121 }, [ %rd30 + 0 ];
// end inline asm
mov.b32 %f117, %r118;
mov.b32 %f118, %r119;
mov.b32 %f119, %r120;
mov.b32 %f120, %r121;
.loc 1 58 18
add.f32 %f121, %f1, %f9;
add.f32 %f122, %f2, %f10;
add.f32 %f123, %f3, %f11;
add.f32 %f124, %f4, %f12;
add.f32 %f125, %f5, %f13;
add.f32 %f126, %f6, %f14;
add.f32 %f127, %f7, %f15;
add.f32 %f128, %f8, %f16;
.loc 1 58 23
add.f32 %f129, %f121, %f17;
add.f32 %f130, %f122, %f18;
add.f32 %f131, %f123, %f19;
add.f32 %f132, %f124, %f20;
add.f32 %f133, %f125, %f21;
add.f32 %f134, %f126, %f22;
add.f32 %f135, %f127, %f23;
add.f32 %f136, %f128, %f24;
.loc 1 58 28
add.f32 %f137, %f129, %f25;
add.f32 %f138, %f130, %f26;
add.f32 %f139, %f131, %f27;
add.f32 %f140, %f132, %f28;
add.f32 %f141, %f133, %f29;
add.f32 %f142, %f134, %f30;
add.f32 %f143, %f135, %f31;
add.f32 %f144, %f136, %f32;
.loc 1 58 33
add.f32 %f145, %f137, %f33;
add.f32 %f146, %f138, %f34;
add.f32 %f147, %f139, %f35;
add.f32 %f148, %f140, %f36;
add.f32 %f149, %f141, %f37;
add.f32 %f150, %f142, %f38;
add.f32 %f151, %f143, %f39;
add.f32 %f152, %f144, %f40;
.loc 1 58 38
add.f32 %f153, %f145, %f41;
add.f32 %f154, %f146, %f42;
add.f32 %f155, %f147, %f43;
add.f32 %f156, %f148, %f44;
add.f32 %f157, %f149, %f45;
add.f32 %f158, %f150, %f46;
add.f32 %f159, %f151, %f47;
add.f32 %f160, %f152, %f48;
.loc 1 58 43
add.f32 %f161, %f153, %f49;
add.f32 %f162, %f154, %f50;
add.f32 %f163, %f155, %f51;
add.f32 %f164, %f156, %f52;
add.f32 %f165, %f157, %f53;
add.f32 %f166, %f158, %f54;
add.f32 %f167, %f159, %f55;
add.f32 %f168, %f160, %f56;
.loc 1 58 48
add.f32 %f169, %f161, %f57;
add.f32 %f170, %f162, %f58;
add.f32 %f171, %f163, %f59;
add.f32 %f172, %f164, %f60;
add.f32 %f173, %f165, %f61;
add.f32 %f174, %f166, %f62;
add.f32 %f175, %f167, %f63;
add.f32 %f176, %f168, %f64;
.loc 1 58 53
add.f32 %f177, %f169, %f65;
add.f32 %f178, %f170, %f66;
add.f32 %f179, %f171, %f67;
add.f32 %f180, %f172, %f68;
add.f32 %f181, %f173, %f69;
add.f32 %f182, %f174, %f70;
add.f32 %f183, %f175, %f71;
add.f32 %f184, %f176, %f72;
.loc 1 58 58
add.f32 %f185, %f177, %f73;
add.f32 %f186, %f178, %f74;
add.f32 %f187, %f179, %f75;
add.f32 %f188, %f180, %f76;
add.f32 %f189, %f181, %f77;
add.f32 %f190, %f182, %f78;
add.f32 %f191, %f183, %f79;
add.f32 %f192, %f184, %f80;
.loc 1 58 63
add.f32 %f193, %f185, %f81;
add.f32 %f194, %f186, %f82;
add.f32 %f195, %f187, %f83;
add.f32 %f196, %f188, %f84;
add.f32 %f197, %f189, %f85;
add.f32 %f198, %f190, %f86;
add.f32 %f199, %f191, %f87;
add.f32 %f200, %f192, %f88;
.loc 1 58 69
add.f32 %f201, %f193, %f89;
add.f32 %f202, %f194, %f90;
add.f32 %f203, %f195, %f91;
add.f32 %f204, %f196, %f92;
add.f32 %f205, %f197, %f93;
add.f32 %f206, %f198, %f94;
add.f32 %f207, %f199, %f95;
add.f32 %f208, %f200, %f96;
.loc 1 58 75
add.f32 %f209, %f201, %f97;
add.f32 %f210, %f202, %f98;
add.f32 %f211, %f203, %f99;
add.f32 %f212, %f204, %f100;
add.f32 %f213, %f205, %f101;
add.f32 %f214, %f206, %f102;
add.f32 %f215, %f207, %f103;
add.f32 %f216, %f208, %f104;
.loc 1 58 81
add.f32 %f217, %f209, %f105;
add.f32 %f218, %f210, %f106;
add.f32 %f219, %f211, %f107;
add.f32 %f220, %f212, %f108;
add.f32 %f221, %f213, %f109;
add.f32 %f222, %f214, %f110;
add.f32 %f223, %f215, %f111;
add.f32 %f224, %f216, %f112;
.loc 1 58 87
add.f32 %f225, %f217, %f113;
add.f32 %f226, %f218, %f114;
add.f32 %f227, %f219, %f115;
add.f32 %f228, %f220, %f116;
add.f32 %f229, %f221, %f117;
add.f32 %f230, %f222, %f118;
add.f32 %f231, %f223, %f119;
add.f32 %f232, %f224, %f120;
.loc 1 60 26
add.s64 %rd31, %rd49, %rd44;
add.s64 %rd32, %rd31, 2048;
.loc 1 60 35
mov.b32 %r122, %f225;
mov.b32 %r123, %f226;
mov.b32 %r124, %f227;
mov.b32 %r125, %f228;
// begin inline asm
@%p1 st.global.v4.b32 [ %rd31 + 0 ], { %r122, %r123, %r124, %r125 };
// end inline asm
mov.b32 %r126, %f229;
mov.b32 %r127, %f230;
mov.b32 %r128, %f231;
mov.b32 %r129, %f232;
// begin inline asm
@%p2 st.global.v4.b32 [ %rd32 + 0 ], { %r126, %r127, %r128, %r129 };
// end inline asm
.loc 1 60 4
ret;
$L__tmp1:
$L__func_end0:
}
.file 1 "/media/disk1/abhishek/triton_testing/indirection_vs_num_args_1.py"
.section .debug_abbrev
{
.b8 1
.b8 17
.b8 0
.b8 37
.b8 8
.b8 19
.b8 5
.b8 3
.b8 8
.b8 16
.b8 6
.b8 27
.b8 8
.b8 17
.b8 1
.b8 18
.b8 1
.b8 0
.b8 0
.b8 0
}
.section .debug_info
{
.b32 103
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
.b8 1
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2
.b8 0
.b8 105
.b8 110
.b8 100
.b8 105
.b8 114
.b8 101
.b8 99
.b8 116
.b8 105
.b8 111
.b8 110
.b8 95
.b8 118
.b8 115
.b8 95
.b8 110
.b8 117
.b8 109
.b8 95
.b8 97
.b8 114
.b8 103
.b8 115
.b8 95
.b8 49
.b8 46
.b8 112
.b8 121
.b8 0
.b32 .debug_line
.b8 47
.b8 109
.b8 101
.b8 100
.b8 105
.b8 97
.b8 47
.b8 100
.b8 105
.b8 115
.b8 107
.b8 49
.b8 47
.b8 97
.b8 98
.b8 104
.b8 105
.b8 115
.b8 104
.b8 101
.b8 107
.b8 47
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 116
.b8 101
.b8 115
.b8 116
.b8 105
.b8 110
.b8 103
.b8 0
.b64 $L__func_begin0
.b64 $L__func_end0
}
.section .debug_loc { }
The above is the PTX of a simpler kernel, which by default uses 40 registers per thread. I am trying to get the other versions down to 40 registers as well. Can't we just force a lower register count, perhaps at the cost of some spilling? Why does the flag have no effect at all?
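If hand-editing the generated PTX is acceptable as an experiment, my next attempt would be to insert the per-entry cap directly below the entry's .maxntid line (assuming add_kernel_10.ptx has one like the simpler kernel above) and recompile with the same ptxas invocation, then check whether the -v report starts showing spill stores/loads:
$ sed -i '/^.maxntid 128, 1, 1$/a .maxnreg 40' /tmp/add_kernel_10.ptx   # hypothetical GNU sed edit
$ ptxas -lineinfo -v --maxrregcount 40 --gpu-name=sm_86 /tmp/add_kernel_10.ptx -o /tmp/add_kernel_10.cubin 2> /tmp/add_kernel_10.log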