Libdevice functions causing PTXAS segfault

Here is the PTX generated when calling the libdevice function __nv_sin():

.version 6.5
.target sm_50
.address_size 64

	// .weak	_ZTS9sinkernel
// Forward declaration (prototype only, terminated by ';') of the helper that
// is defined later in this listing. It takes two 64-bit parameters and is
// called from the _ZTS9sinkernel entry point.
.func __internal_trig_reduction_slowpathd
(
	.param .b64 __internal_trig_reduction_slowpathd_param_0,
	.param .b64 __internal_trig_reduction_slowpathd_param_1
)
;
// 144-byte, 8-byte-aligned table in constant space (18 64-bit words).
// __internal_trig_reduction_slowpathd reads it one 64-bit word at a time with
// ld.const.u64 inside loop LBB1_3.
// NOTE(review): the name suggests these are the bits of 2/pi ("i2opi");
// that is inferred from the identifier only - confirm against libdevice.
.const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};

// Kernel entry point (weak linkage, no parameters).
// It works on the hard-coded double constant 0d400999999999999A (3.2 in
// IEEE-754 binary64), multiplies it by 0d3FE45F306DC9C883 (approximately
// 2/pi), stores the rounded product in a 4-byte local slot, and calls
// __internal_trig_reduction_slowpathd only when the exponent test below fails.
// Nothing is written to global memory and no value is returned.
.weak .entry _ZTS9sinkernel()
{
	// 4-byte local-memory scratch slot; it receives the 32-bit value stored below.
	.local .align 4 .b8 	__local_depot0[4];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<4>;
	.reg .b32 	%r<7>;
	.reg .b64 	%rd<4>;
	.reg .f64 	%fd<5>;

	// %SPL = local-space address of the scratch slot, %SP = the same slot as a
	// generic address. %rd1 (generic) is what gets passed to the helper,
	// %rd2 (local) is used for the direct st.local below.
	mov.u64 	%SPL, __local_depot0;
	cvta.local.u64 	%SP, %SPL;
	add.u64 	%rd1, %SP, 0;
	add.u64 	%rd2, %SPL, 0;
	// Input value: 3.2 as a binary64 bit pattern.
	mov.f64 	%fd2, 0d400999999999999A;
	// Unpack the double: %r1 = low 32 bits, %r2 = high 32 bits.
	{
	.reg .b32 %temp; 
	mov.b64 	{%r1, %temp}, %fd2;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r2}, %fd2;
	}
	// %p1: high word with the sign bit cleared equals 0x7FF00000 (2146435072).
	// %p2: low word is zero.
	// When both hold, %fd1 becomes 0.0; otherwise %fd1 is the original constant.
	and.b32  	%r3, %r2, 2147483647;
	setp.eq.s32 	%p1, %r3, 2146435072;
	setp.eq.s32 	%p2, %r1, 0;
	selp.f64 	%fd3, 0d0000000000000000, 0d400999999999999A, %p2;
	selp.f64 	%fd1, %fd3, 0d400999999999999A, %p1;
	// %r4 = round-to-nearest-integer(%fd1 * ~2/pi), written to the local slot.
	mul.rn.f64 	%fd4, %fd1, 0d3FE45F306DC9C883;
	cvt.rni.s32.f64 	%r4, %fd4;
	st.local.u32 	[%rd2], %r4;
	// %r5 = high 32 bits of %fd1.
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r5}, %fd1;
	}
	// Mask with 0x7FE00000 (2145386496) and compare, unsigned, against
	// 0x41E00000 (1105199104); when below, the helper call is skipped.
	and.b32  	%r6, %r5, 2145386496;
	setp.lt.u32 	%p3, %r6, 1105199104;
	@%p3 bra 	LBB0_2;
	// Otherwise call the helper with param0 = %fd1 (the double) and
	// param1 = %rd1 (generic address of the local slot).
	{ // callseq 0, 0
	.reg .b32 temp_param_reg;
	.param .b64 param0;
	st.param.f64 	[param0+0], %fd1;
	.param .b64 param1;
	st.param.b64 	[param1+0], %rd1;
	call.uni 
	__internal_trig_reduction_slowpathd, 
	(
	param0, 
	param1
	);
	} // callseq 0
LBB0_2:
	ret;

}
// Definition of the helper called from _ZTS9sinkernel.
//   param_0: a binary64 value (loaded with ld.param.f64 into %fd1).
//   param_1: a 64-bit generic address; one 32-bit value is stored through it
//            (st.u32 [%rd26]).
// Outline of what the visible code does:
//   1. Return immediately if the exponent field of param_0 is all ones (2047).
//   2. Loop LBB1_3: multiply the left-aligned mantissa by successive 64-bit
//      words of __cudart_i2opi_d, keeping a 64-bit carry, and store the low
//      halves into the local depot.
//   3. Combine two depot words into a 128-bit value, shift it, derive a small
//      signed integer from the top bits and store it through param_1.
//   4. Conditionally negate the 128-bit value, normalise it with clz, and
//      multiply by a 64-bit constant.
// In this listing the final predicate %p8 is computed but never used, and the
// function returns no value.
.func __internal_trig_reduction_slowpathd(
	.param .b64 __internal_trig_reduction_slowpathd_param_0,
	.param .b64 __internal_trig_reduction_slowpathd_param_1
)
{
	// 40 bytes of local scratch, addressed through %rd1 below.
	.local .align 8 .b8 	__local_depot1[40];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<9>;
	.reg .b32 	%r<29>;
	.reg .b64 	%rd<87>;
	.reg .f64 	%fd<2>;

	mov.u64 	%SPL, __local_depot1;
	ld.param.f64 	%fd1, [__internal_trig_reduction_slowpathd_param_0];
	// %r1 = high 32 bits of the input double.
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r1}, %fd1;
	}
	// %r3 = 11-bit field starting at bit 20 of the high word (the biased
	// exponent). A value of 2047 (all ones) returns without doing any work.
	bfe.u32 	%r3, %r1, 20, 11;
	setp.eq.s32 	%p1, %r3, 2047;
	@%p1 bra 	LBB1_9;
	// %rd1 = local-space base address of the scratch depot.
	add.u64 	%rd1, %SPL, 0;
	// %r2 = high word >> 20 (sign bit plus exponent field).
	shr.u32 	%r2, %r1, 20;
	// %r4 = exponent field - 1024, %r5 = %r4 / 64.
	add.s32 	%r4, %r3, -1024;
	shr.u32 	%r5, %r4, 6;
	// Loop bounds: start index %r28 = 15 - %r5,
	// end index %r7 = (%r4 < 128) ? 18 : 19 - %r5.
	mov.u32 	%r12, 15;
	sub.s32 	%r28, %r12, %r5;
	mov.u32 	%r13, 19;
	sub.s32 	%r14, %r13, %r5;
	setp.lt.u32 	%p2, %r4, 128;
	selp.b32 	%r7, 18, %r14, %p2;
	setp.ge.s32 	%p3, %r28, %r7;
	// %rd82 is the running 64-bit carry of the multi-word multiply; start at 0.
	mov.u64 	%rd82, 0;
	// Skip the loop entirely when start >= end.
	@%p3 bra 	LBB1_4;
	// %rd43 = raw bits of the input shifted left by 11 with bit 63 forced on.
	mov.b64 	%rd29, %fd1;
	shl.b64 	%rd30, %rd29, 11;
	or.b64  	%rd43, %rd30, -9223372036854775808;
	// %rd80 = store cursor: depot base + 8 * ((%r4 >> 6) + (%r28 - 15)).
	cvt.u64.u32 	%rd32, %r4;
	shr.u64 	%rd33, %rd32, 6;
	add.s32 	%r15, %r28, -15;
	cvt.s64.s32 	%rd34, %r15;
	add.s64 	%rd35, %rd33, %rd34;
	shl.b64 	%rd36, %rd35, 3;
	add.s64 	%rd80, %rd1, %rd36;
	// %rd79 = load cursor: __cudart_i2opi_d + 8 * (%r28 - 15) + 120.
	mul.wide.s32 	%rd37, %r15, 8;
	mov.u64 	%rd38, __cudart_i2opi_d;
	add.s64 	%rd39, %rd37, %rd38;
	add.s64 	%rd79, %rd39, 120;
	mov.u64 	%rd82, 0;
	// %r27 = loop counter, running from %r28 up to %r7.
	mov.u32 	%r27, %r28;
LBB1_3:
	// Next 64-bit table word.
	ld.const.u64 	%rd42, [%rd79];
	// 64x64-bit multiply-add built from 32-bit mad/madc with carry chaining:
	//   {r3,r2,r1,r0} = %rd42 * %rd43 + %rd82
	// Low 64 bits go to %rd40; high 64 bits become the new carry %rd82.
	// The r0..r3 names here are block-local and distinct from %r0..%r3.
	// begin inline asm
	{
	.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi, clo, chi;
	mov.b64         {alo,ahi}, %rd42;    
	mov.b64         {blo,bhi}, %rd43;    
	mov.b64         {clo,chi}, %rd82;    
	mad.lo.cc.u32   r0, alo, blo, clo;
	madc.hi.cc.u32  r1, alo, blo, chi;
	madc.hi.u32     r2, alo, bhi,   0;
	mad.lo.cc.u32   r1, alo, bhi,  r1;
	madc.hi.cc.u32  r2, ahi, blo,  r2;
	madc.hi.u32     r3, ahi, bhi,   0;
	mad.lo.cc.u32   r1, ahi, blo,  r1;
	madc.lo.cc.u32  r2, ahi, bhi,  r2;
	addc.u32        r3,  r3,   0;     
	mov.b64         %rd40, {r0,r1};      
	mov.b64         %rd82, {r2,r3};      
	}
	// end inline asm
	// Store the low half, then advance the counter and both cursors by one word.
	st.local.u64 	[%rd80], %rd40;
	add.s32 	%r27, %r27, 1;
	add.s64 	%rd80, %rd80, 8;
	add.s64 	%rd79, %rd79, 8;
	setp.ne.s32 	%p4, %r7, %r27;
	// %r28 is set to the end index so LBB1_4 sees it after the loop exits.
	mov.u32 	%r28, %r7;
	// Loop back-edge. The report accompanying this listing states that
	// ptxas -O1 no longer segfaults when this branch is commented out.
	@%p4 bra 	LBB1_3;
LBB1_4:
	// %rd26 = param_1, the generic address that receives the 32-bit result below.
	ld.param.u64 	%rd26, [__internal_trig_reduction_slowpathd_param_1];
	// Store the final carry at depot base + 8 * (%r28 + %r5) - 120.
	cvt.s64.s32 	%rd45, %r28;
	cvt.u64.u32 	%rd46, %r5;
	add.s64 	%rd47, %rd45, %rd46;
	shl.b64 	%rd48, %rd47, 3;
	add.s64 	%rd49, %rd1, %rd48;
	st.local.u64 	[%rd49+-120], %rd82;
	// %r11 = low 6 bits of (high word >> 20): a bit-shift amount in 0..63.
	and.b32  	%r11, %r2, 63;
	// %rd84:%rd83 = 128-bit value from depot words at byte offsets 24 and 16.
	ld.local.u64 	%rd83, [%rd1+16];
	ld.local.u64 	%rd84, [%rd1+24];
	setp.eq.s32 	%p5, %r11, 0;
	// A zero shift amount needs no adjustment.
	@%p5 bra 	LBB1_6;
	// Shift the 128-bit value left by %r11 bits, filling the vacated low bits
	// from the depot word at byte offset 8.
	mov.u32 	%r16, 64;
	sub.s32 	%r17, %r16, %r11;
	shl.b64 	%rd50, %rd84, %r11;
	shr.u64 	%rd51, %rd83, %r17;
	or.b64  	%rd84, %rd50, %rd51;
	shl.b64 	%rd52, %rd83, %r11;
	ld.local.u64 	%rd53, [%rd1+8];
	shr.u64 	%rd54, %rd53, %r17;
	or.b64  	%rd83, %rd54, %rd52;
LBB1_6:
	// %r18 = top two bits of %rd84.
	shr.u64 	%rd55, %rd84, 62;
	cvt.u32.u64 	%r18, %rd55;
	// %rd86:%rd85 = (%rd84:%rd83) << 2.
	shr.u64 	%rd56, %rd83, 62;
	shl.b64 	%rd57, %rd84, 2;
	or.b64  	%rd86, %rd57, %rd56;
	shl.b64 	%rd85, %rd83, 2;
	// %r19 = bit 61 of %rd84; %r20 = %r18 + %r19.
	bfe.u64 	%rd58, %rd84, 61, 1;
	cvt.u32.u64 	%r19, %rd58;
	add.s32 	%r20, %r19, %r18;
	// Negate %r20 when the input's high word is negative (sign bit set), then
	// store the 32-bit result through param_1.
	setp.gt.s32 	%p6, %r1, -1;
	neg.s32 	%r21, %r20;
	selp.b32 	%r22, %r20, %r21, %p6;
	st.u32 	[%rd26], %r22;
	// If the upper half is non-negative as a signed value, skip the negation.
	setp.gt.s64 	%p7, %rd86, -1;
	@%p7 bra 	LBB1_8;
	mov.u64 	%rd61, 0;
	// 128-bit negation via a borrow chain: %rd86:%rd85 = 0 - (%rd86:%rd85).
	// begin inline asm
	{
	.reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
	mov.b64         {a0,a1}, %rd61;
	mov.b64         {a2,a3}, %rd61;
	mov.b64         {b0,b1}, %rd85;
	mov.b64         {b2,b3}, %rd86;
	sub.cc.u32      r0, a0, b0; 
	subc.cc.u32     r1, a1, b1; 
	subc.cc.u32     r2, a2, b2; 
	subc.u32        r3, a3, b3; 
	mov.b64         %rd85, {r0,r1};
	mov.b64         %rd86, {r2,r3};
	}
	// end inline asm
LBB1_8:
	// Normalise: shift %rd86 left by its leading-zero count (masked to 0..63)
	// and fill the vacated low bits from %rd85; the result is %rd67.
	clz.b64 	%r23, %rd86;
	and.b32  	%r24, %r23, 63;
	shl.b64 	%rd70, %rd86, %r24;
	shr.u64 	%rd71, %rd85, 1;
	not.b32 	%r25, %r23;
	and.b32  	%r26, %r25, 63;
	shr.u64 	%rd72, %rd71, %r26;
	or.b64  	%rd67, %rd70, %rd72;
	// 64-bit multiplier constant.
	// NOTE(review): its meaning is not stated anywhere in this listing.
	mov.u64 	%rd68, -3958705157555305931;
	// Full 64x64 -> 128-bit unsigned product of %rd67 and %rd68:
	// %rd65 = low 64 bits, %rd66 = high 64 bits.
	// begin inline asm
	{
	.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;
	mov.b64         {alo,ahi}, %rd67;   
	mov.b64         {blo,bhi}, %rd68;   
	mul.lo.u32      r0, alo, blo;    
	mul.hi.u32      r1, alo, blo;    
	mad.lo.cc.u32   r1, alo, bhi, r1;
	madc.hi.u32     r2, alo, bhi,  0;
	mad.lo.cc.u32   r1, ahi, blo, r1;
	madc.hi.cc.u32  r2, ahi, blo, r2;
	madc.hi.u32     r3, ahi, bhi,  0;
	mad.lo.cc.u32   r2, ahi, bhi, r2;
	addc.u32        r3, r3,  0;      
	mov.b64         %rd65, {r0,r1};     
	mov.b64         %rd66, {r2,r3};     
	}
	// end inline asm
	// %p8 is computed here but not consumed by any later instruction.
	setp.lt.s64 	%p8, %rd66, 1;
LBB1_9:
	ret;

}

Invoking:

/usr/local/cuda-11.2/bin/ptxas -arch=sm_50 my_ptx.ptx -O0

This completes without error. However, raising the optimisation level above -O0 causes a segmentation fault:

/usr/local/cuda-11.2/bin/ptxas -arch=sm_50 my_ptx.ptx -O1
Segmentation fault

The segmentation fault does not occur if the loop's back-branch, `@%p4 bra LBB1_3;` at the end of the LBB1_3 block, is commented out.

This also occurs for libdevice functions __nv_cos(), __nv_tan(), and __nv_sincos().

I recommend:

  1. testing against the latest cuda version (currently: 11.5)
  2. if the issue still occurs there, filing a bug

Thanks for response. Bug replicated in cuda-11.5. Bug report filed https://developer.nvidia.com/nvidia_bug/3419212

Sorry for the poorly formatted text in the bug report; I don’t seem to be able to edit my original post.

Thanks for filing the bug, I will link it to the one I filed, and clean it up if needed.

Thanks Robert!