Here is the PTX generated by the libdevice function __nv_sin():
.version 6.5
.target sm_50
.address_size 64
// .weak _ZTS9sinkernel
.func __internal_trig_reduction_slowpathd
(
.param .b64 __internal_trig_reduction_slowpathd_param_0,
.param .b64 __internal_trig_reduction_slowpathd_param_1
)
;
// Constant table used by the slow-path trig reduction below. The name
// (i2opi = "2 over pi") and its use in the 64x64-bit multiply-accumulate loop
// suggest this is a multi-word fixed-point expansion of 2/pi, read 8 bytes at
// a time via ld.const.u64 -- NOTE(review): presumed from the symbol name and
// usage; not independently verified here.
.const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
// Kernel _ZTS9sinkernel: the argument-reduction prologue of sin(x) for the
// compile-time constant x = 3.2 (f64 bits 0d400999999999999A), as emitted by
// the libdevice __nv_sin() implementation. It computes a quadrant integer in
// a 4-byte local slot and, for large |x|, calls the slow-path reduction.
.weak .entry _ZTS9sinkernel()
{
.local .align 4 .b8 __local_depot0[4];  // 4-byte local slot holding the quadrant integer
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<4>;
.reg .b32 %r<7>;
.reg .b64 %rd<4>;
.reg .f64 %fd<5>;
mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;               // generic-space alias of the local depot
add.u64 %rd1, %SP, 0;                   // generic pointer: passed to the slow-path callee
add.u64 %rd2, %SPL, 0;                  // local-space pointer: used for st.local below
mov.f64 %fd2, 0d400999999999999A;       // x = 3.2
// Split the f64 bit pattern: %r1 = low 32 bits, %r2 = high 32 bits.
{
.reg .b32 %temp;
mov.b64 {%r1, %temp}, %fd2;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd2;
}
and.b32 %r3, %r2, 2147483647;           // clear the sign bit (mask 0x7FFFFFFF)
setp.eq.s32 %p1, %r3, 2146435072;       // high word == 0x7FF00000 (Inf exponent pattern)?
setp.eq.s32 %p2, %r1, 0;                // low word == 0?
// Select the working value: an exact +/-Inf bit pattern is remapped to 0.0,
// anything else keeps x = 3.2 (constant-folded special-case handling).
selp.f64 %fd3, 0d0000000000000000, 0d400999999999999A, %p2;
selp.f64 %fd1, %fd3, 0d400999999999999A, %p1;
mul.rn.f64 %fd4, %fd1, 0d3FE45F306DC9C883; // x * 2/pi (0x3FE45F306DC9C883 ~ 0.63661977)
cvt.rni.s32.f64 %r4, %fd4;              // round to nearest int -> quadrant estimate
st.local.u32 [%rd2], %r4;               // store quadrant into the local slot
// Re-extract the high word of the selected value for the magnitude test.
{
.reg .b32 %temp;
mov.b64 {%temp, %r5}, %fd1;
}
and.b32 %r6, %r5, 2145386496;           // isolate exponent bits (mask 0x7FE00000)
setp.lt.u32 %p3, %r6, 1105199104;       // exponent below slow-path threshold (0x41E00000)?
@%p3 bra LBB0_2;                        // small |x|: fast-path quadrant already stored
// Large |x|: call the slow-path reduction with (x, &quadrant-slot).
{ // callseq 0, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.f64 [param0+0], %fd1;
.param .b64 param1;
st.param.b64 [param1+0], %rd1;
call.uni
__internal_trig_reduction_slowpathd,
(
param0,
param1
);
} // callseq 0
LBB0_2:
ret;
}
// __internal_trig_reduction_slowpathd(x, quadrant_ptr):
// Large-argument trigonometric range reduction (libdevice-generated).
// Multiplies the mantissa of x against consecutive 64-bit words of the
// __cudart_i2opi_d table using 64x64->128-bit multiply-accumulate inline asm,
// accumulating partial products in a 40-byte local buffer; extracts the
// quadrant integer and stores it through quadrant_ptr (param_1); then
// normalizes the fractional remainder and multiplies it by a fixed-point
// constant (0xC90FDAA22168C235, presumably pi/2 -- NOTE(review): inferred
// from the bit pattern, not verified here).
// NOTE(review): the back-branch "@%p4 bra LBB1_3;" closing the LBB1_3 loop is
// reported to trigger a ptxas segfault at -O1+ (see the prose following this
// listing); do not restructure this code, it is a bug reproducer.
.func __internal_trig_reduction_slowpathd(
.param .b64 __internal_trig_reduction_slowpathd_param_0,
.param .b64 __internal_trig_reduction_slowpathd_param_1
)
{
.local .align 8 .b8 __local_depot1[40]; // scratch for the multi-word product
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<9>;
.reg .b32 %r<29>;
.reg .b64 %rd<87>;
.reg .f64 %fd<2>;
mov.u64 %SPL, __local_depot1;
ld.param.f64 %fd1, [__internal_trig_reduction_slowpathd_param_0]; // x
// %r1 = high 32 bits of x (sign + exponent + mantissa top).
{
.reg .b32 %temp;
mov.b64 {%temp, %r1}, %fd1;
}
bfe.u32 %r3, %r1, 20, 11;               // extract the 11-bit biased exponent
setp.eq.s32 %p1, %r3, 2047;             // exponent all-ones => Inf/NaN
@%p1 bra LBB1_9;                        // nothing to reduce: return
add.u64 %rd1, %SPL, 0;                  // base of the local product buffer
shr.u32 %r2, %r1, 20;                   // sign+exponent bits (low 6 used as shift later)
add.s32 %r4, %r3, -1024;                // unbias-ish exponent offset
shr.u32 %r5, %r4, 6;                    // starting 64-bit word index into the table
// Loop bounds: %r28 = first iteration counter (15 - word index),
// %r7 = limit (18, or 19 - word index for exponents >= 128 above bias).
mov.u32 %r12, 15;
sub.s32 %r28, %r12, %r5;
mov.u32 %r13, 19;
sub.s32 %r14, %r13, %r5;
setp.lt.u32 %p2, %r4, 128;
selp.b32 %r7, 18, %r14, %p2;
setp.ge.s32 %p3, %r28, %r7;             // empty iteration range?
mov.u64 %rd82, 0;                       // 64-bit carry accumulator
@%p3 bra LBB1_4;
// Build the 64-bit fixed-point mantissa: shift out sign/exponent and set the
// implicit leading 1 (OR with 0x8000000000000000).
mov.b64 %rd29, %fd1;
shl.b64 %rd30, %rd29, 11;
or.b64 %rd43, %rd30, -9223372036854775808;
// Compute the local store pointer (%rd80) and the table load pointer (%rd79 =
// &__cudart_i2opi_d + 8*(counter-15) + 120) for the first iteration.
cvt.u64.u32 %rd32, %r4;
shr.u64 %rd33, %rd32, 6;
add.s32 %r15, %r28, -15;
cvt.s64.s32 %rd34, %r15;
add.s64 %rd35, %rd33, %rd34;
shl.b64 %rd36, %rd35, 3;
add.s64 %rd80, %rd1, %rd36;
mul.wide.s32 %rd37, %r15, 8;
mov.u64 %rd38, __cudart_i2opi_d;
add.s64 %rd39, %rd37, %rd38;
add.s64 %rd79, %rd39, 120;
mov.u64 %rd82, 0;
mov.u32 %r27, %r28;                     // loop counter
// Main reduction loop: for each table word, compute
// (mantissa * table_word + carry) as a 128-bit product; store the low 64 bits
// to the local buffer, keep the high 64 bits as carry (%rd82).
LBB1_3:
ld.const.u64 %rd42, [%rd79];
// begin inline asm
{
.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi, clo, chi;
mov.b64 {alo,ahi}, %rd42;
mov.b64 {blo,bhi}, %rd43;
mov.b64 {clo,chi}, %rd82;
mad.lo.cc.u32 r0, alo, blo, clo;
madc.hi.cc.u32 r1, alo, blo, chi;
madc.hi.u32 r2, alo, bhi, 0;
mad.lo.cc.u32 r1, alo, bhi, r1;
madc.hi.cc.u32 r2, ahi, blo, r2;
madc.hi.u32 r3, ahi, bhi, 0;
mad.lo.cc.u32 r1, ahi, blo, r1;
madc.lo.cc.u32 r2, ahi, bhi, r2;
addc.u32 r3, r3, 0;
mov.b64 %rd40, {r0,r1};                 // low 64 bits of the 128-bit product
mov.b64 %rd82, {r2,r3};                 // high 64 bits -> carry for next iteration
}
// end inline asm
st.local.u64 [%rd80], %rd40;
add.s32 %r27, %r27, 1;
add.s64 %rd80, %rd80, 8;                // advance local store pointer
add.s64 %rd79, %rd79, 8;                // advance table pointer
setp.ne.s32 %p4, %r7, %r27;
mov.u32 %r28, %r7;
@%p4 bra LBB1_3;                        // <-- branch whose presence triggers the ptxas -O1 crash
LBB1_4:
ld.param.u64 %rd26, [__internal_trig_reduction_slowpathd_param_1]; // quadrant output pointer
// Store the final carry word after the loop's last low word.
cvt.s64.s32 %rd45, %r28;
cvt.u64.u32 %rd46, %r5;
add.s64 %rd47, %rd45, %rd46;
shl.b64 %rd48, %rd47, 3;
add.s64 %rd49, %rd1, %rd48;
st.local.u64 [%rd49+-120], %rd82;
and.b32 %r11, %r2, 63;                  // sub-word bit offset from the exponent
ld.local.u64 %rd83, [%rd1+16];          // lower word of the 128-bit window
ld.local.u64 %rd84, [%rd1+24];          // upper word of the 128-bit window
setp.eq.s32 %p5, %r11, 0;
@%p5 bra LBB1_6;                        // aligned: no cross-word shift needed
// Shift the 128-bit window left by %r11 bits, pulling bits in from [%rd1+8].
mov.u32 %r16, 64;
sub.s32 %r17, %r16, %r11;
shl.b64 %rd50, %rd84, %r11;
shr.u64 %rd51, %rd83, %r17;
or.b64 %rd84, %rd50, %rd51;
shl.b64 %rd52, %rd83, %r11;
ld.local.u64 %rd53, [%rd1+8];
shr.u64 %rd54, %rd53, %r17;
or.b64 %rd83, %rd54, %rd52;
LBB1_6:
// Quadrant = top 2 bits of the window, rounded via bit 61; the remaining
// bits (shifted left by 2) become the fractional part in %rd86:%rd85.
shr.u64 %rd55, %rd84, 62;
cvt.u32.u64 %r18, %rd55;
shr.u64 %rd56, %rd83, 62;
shl.b64 %rd57, %rd84, 2;
or.b64 %rd86, %rd57, %rd56;
shl.b64 %rd85, %rd83, 2;
bfe.u64 %rd58, %rd84, 61, 1;            // rounding bit
cvt.u32.u64 %r19, %rd58;
add.s32 %r20, %r19, %r18;
setp.gt.s32 %p6, %r1, -1;               // x >= 0 (sign bit clear)?
neg.s32 %r21, %r20;
selp.b32 %r22, %r20, %r21, %p6;         // negate the quadrant for negative x
st.u32 [%rd26], %r22;                   // write quadrant through param_1
setp.gt.s64 %p7, %rd86, -1;             // fraction non-negative?
@%p7 bra LBB1_8;
// Negative fraction: compute 0 - fraction as a 128-bit subtraction.
mov.u64 %rd61, 0;
// begin inline asm
{
.reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
mov.b64 {a0,a1}, %rd61;
mov.b64 {a2,a3}, %rd61;
mov.b64 {b0,b1}, %rd85;
mov.b64 {b2,b3}, %rd86;
sub.cc.u32 r0, a0, b0;
subc.cc.u32 r1, a1, b1;
subc.cc.u32 r2, a2, b2;
subc.u32 r3, a3, b3;
mov.b64 %rd85, {r0,r1};
mov.b64 %rd86, {r2,r3};
}
// end inline asm
LBB1_8:
// Normalize: left-shift so the leading 1 reaches the top bit (clz-based),
// folding in bits from the low word.
clz.b64 %r23, %rd86;
and.b32 %r24, %r23, 63;
shl.b64 %rd70, %rd86, %r24;
shr.u64 %rd71, %rd85, 1;
not.b32 %r25, %r23;
and.b32 %r26, %r25, 63;
shr.u64 %rd72, %rd71, %r26;
or.b64 %rd67, %rd70, %rd72;
mov.u64 %rd68, -3958705157555305931;    // 0xC90FDAA22168C235 (fixed-point pi/2 -- presumed)
// Normalized fraction * constant, 64x64 -> 128 bits (%rd66:%rd65).
// begin inline asm
{
.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;
mov.b64 {alo,ahi}, %rd67;
mov.b64 {blo,bhi}, %rd68;
mul.lo.u32 r0, alo, blo;
mul.hi.u32 r1, alo, blo;
mad.lo.cc.u32 r1, alo, bhi, r1;
madc.hi.u32 r2, alo, bhi, 0;
mad.lo.cc.u32 r1, ahi, blo, r1;
madc.hi.cc.u32 r2, ahi, blo, r2;
madc.hi.u32 r3, ahi, bhi, 0;
mad.lo.cc.u32 r2, ahi, bhi, r2;
addc.u32 r3, r3, 0;
mov.b64 %rd65, {r0,r1};
mov.b64 %rd66, {r2,r3};
}
// end inline asm
// %p8 is computed but unused below -- dead in this stripped-down reproducer;
// in full libdevice code it presumably guards a further normalization step.
setp.lt.s64 %p8, %rd66, 1;
LBB1_9:
ret;
}
Invoking:
/usr/local/cuda-11.2/bin/ptxas -arch=sm_50 my_ptx.ptx -O0
completes without error. However, raising the optimisation level above -O0, e.g.:
/usr/local/cuda-11.2/bin/ptxas -arch=sm_50 my_ptx.ptx -O1
causes a segmentation fault.
The segmentation fault does not happen if the loop back-branch "@%p4 bra LBB1_3;"
at the end of the LBB1_3 block is commented out.
The same crash also occurs with the PTX generated for the libdevice functions
__nv_cos(), __nv_tan(), and __nv_sincos().