Libdevice functions segfaulting at runtime

Apologies if this is in the wrong category.

The libdevice functions __nv_sin(), __nv_cos(), __nv_tan(), and __nv_sincos() are causing segfaults at runtime.

This only occurs when all of the following conditions are met (a minimal sketch of the triggering pattern follows the list):

  • Using doubles.
  • At least one input is defined on the kernel stack (i.e., not read from global memory).
  • Using sm_60 or lower.
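
For reference, the pattern that triggers the crash looks roughly like the sketch below. It is written as plain CUDA C++ purely for illustration (the mangled kernel name _ZTS9sinkernel in the PTX suggests the original code was built with a SYCL/DPC++ toolchain); the kernel name, the launch configuration, and the constant 3.9 (which corresponds to the 0d400F333333333333 literal in the PTX) are assumptions, not the reporter's actual source.

// Hypothetical minimal reproducer sketch (plain CUDA stand-in for the original
// SYCL kernel). The essential ingredients are a double operand that lives on
// the kernel stack and a build targeting sm_60 or lower.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void sinkernel(double *out, size_t idx)
{
    double x = 3.9;        // stack-local double, never read from global memory
    out[idx] = sin(x);     // double-precision sin, lowered via libdevice __nv_sin()
}

int main()
{
    double *d_out = nullptr;
    cudaMalloc(&d_out, sizeof(double));
    sinkernel<<<1, 1>>>(d_out, 0);   // build with e.g. -arch=sm_60 to hit the failing path
    cudaDeviceSynchronize();

    double h_out = 0.0;
    cudaMemcpy(&h_out, d_out, sizeof(double), cudaMemcpyDeviceToHost);
    printf("sin(3.9) = %f\n", h_out);
    cudaFree(d_out);
    return 0;
}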

The libdevice function __nv_sin() generates the following PTX:

//
// Generated by LLVM NVPTX Back-End
//

.version 6.5
.target sm_60
.address_size 64

	// .weak	_ZTS9sinkernel
.func  (.param .b64 func_retval0) __internal_trig_reduction_slowpathd
(
	.param .b64 __internal_trig_reduction_slowpathd_param_0,
	.param .b64 __internal_trig_reduction_slowpathd_param_1
)
;
.const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
.const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63};

.weak .entry _ZTS9sinkernel(
	.param .u64 _ZTS9sinkernel_param_0,
	.param .align 8 .b8 _ZTS9sinkernel_param_1[8],
	.param .align 8 .b8 _ZTS9sinkernel_param_2[8],
	.param .align 8 .b8 _ZTS9sinkernel_param_3[8]
)
{
	.local .align 4 .b8 	__local_depot0[4];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<6>;
	.reg .b32 	%r<13>;
	.reg .f64 	%fd<33>;
	.reg .b64 	%rd<11>;

	mov.u64 	%SPL, __local_depot0;
	cvta.local.u64 	%SP, %SPL;
	ld.param.u64 	%rd3, [_ZTS9sinkernel_param_0];
	add.u64 	%rd4, %SP, 0;
	add.u64 	%rd1, %SPL, 0;
	ld.param.u64 	%rd2, [_ZTS9sinkernel_param_3];
	mov.f64 	%fd5, 0d400F333333333333;
	{
	.reg .b32 %temp; 
	mov.b64 	{%r4, %temp}, %fd5;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r5}, %fd5;
	}
	and.b32  	%r6, %r5, 2147483647;
	setp.eq.s32 	%p1, %r6, 2146435072;
	setp.eq.s32 	%p2, %r4, 0;
	selp.f64 	%fd6, 0d0000000000000000, 0d400F333333333333, %p2;
	selp.f64 	%fd1, %fd6, 0d400F333333333333, %p1;
	mul.rn.f64 	%fd7, %fd1, 0d3FE45F306DC9C883;
	cvt.rni.s32.f64 	%r12, %fd7;
	st.local.u32 	[%rd1], %r12;
	cvt.rn.f64.s32 	%fd8, %r12;
	fma.rn.f64 	%fd9, %fd8, 0dBFF921FB54442D18, %fd1;
	fma.rn.f64 	%fd10, %fd8, 0dBC91A62633145C00, %fd9;
	fma.rn.f64 	%fd32, %fd8, 0dB97B839A252049C0, %fd10;
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r7}, %fd1;
	}
	and.b32  	%r8, %r7, 2145386496;
	setp.lt.u32 	%p3, %r8, 1105199104;
	@%p3 bra 	LBB0_2;
	{ // callseq 0, 0
	.reg .b32 temp_param_reg;
	.param .b64 param0;
	st.param.f64 	[param0+0], %fd1;
	.param .b64 param1;
	st.param.b64 	[param1+0], %rd4;
	.param .b64 retval0;
	call.uni (retval0), 
	__internal_trig_reduction_slowpathd, 
	(
	param0, 
	param1
	);
	ld.param.f64 	%fd32, [retval0+0];
	} // callseq 0
	ld.local.u32 	%r12, [%rd1];
LBB0_2:
	shl.b64 	%rd6, %rd2, 3;
	add.s64 	%rd7, %rd3, %rd6;
	and.b32  	%r9, %r12, 1;
	shl.b32 	%r10, %r9, 3;
	mul.wide.u32 	%rd8, %r10, 8;
	mov.u64 	%rd9, __cudart_sin_cos_coeffs;
	add.s64 	%rd10, %rd9, %rd8;
	mul.rn.f64 	%fd12, %fd32, %fd32;
	setp.eq.s32 	%p4, %r9, 0;
	selp.f64 	%fd13, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p4;
	ld.const.f64 	%fd14, [%rd10+8];
	fma.rn.f64 	%fd15, %fd13, %fd12, %fd14;
	ld.const.f64 	%fd16, [%rd10+16];
	fma.rn.f64 	%fd17, %fd15, %fd12, %fd16;
	ld.const.f64 	%fd18, [%rd10+24];
	fma.rn.f64 	%fd19, %fd17, %fd12, %fd18;
	ld.const.f64 	%fd20, [%rd10+32];
	fma.rn.f64 	%fd21, %fd19, %fd12, %fd20;
	ld.const.f64 	%fd22, [%rd10+40];
	fma.rn.f64 	%fd23, %fd21, %fd12, %fd22;
	ld.const.f64 	%fd24, [%rd10+48];
	fma.rn.f64 	%fd25, %fd23, %fd12, %fd24;
	fma.rn.f64 	%fd26, %fd25, %fd32, %fd32;
	fma.rn.f64 	%fd27, %fd25, %fd12, 0d3FF0000000000000;
	selp.f64 	%fd28, %fd26, %fd27, %p4;
	and.b32  	%r11, %r12, 2;
	setp.eq.s32 	%p5, %r11, 0;
	mov.f64 	%fd29, 0d0000000000000000;
	sub.rn.f64 	%fd30, %fd29, %fd28;
	selp.f64 	%fd31, %fd28, %fd30, %p5;
	st.global.f64 	[%rd7], %fd31;
	ret;

}
.func  (.param .b64 func_retval0) __internal_trig_reduction_slowpathd(
	.param .b64 __internal_trig_reduction_slowpathd_param_0,
	.param .b64 __internal_trig_reduction_slowpathd_param_1
)
{
	.local .align 8 .b8 	__local_depot1[40];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<9>;
	.reg .b32 	%r<33>;
	.reg .f64 	%fd<5>;
	.reg .b64 	%rd<104>;

	mov.u64 	%SPL, __local_depot1;
	ld.param.f64 	%fd4, [__internal_trig_reduction_slowpathd_param_0];
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r14}, %fd4;
	}
	bfe.u32 	%r3, %r14, 20, 11;
	setp.eq.s32 	%p1, %r3, 2047;
	@%p1 bra 	LBB1_11;
	add.u64 	%rd1, %SPL, 0;
	shr.u32 	%r2, %r14, 20;
	add.s32 	%r4, %r3, -1024;
	shr.u32 	%r5, %r4, 6;
	mov.u32 	%r15, 15;
	sub.s32 	%r31, %r15, %r5;
	mov.u32 	%r16, 19;
	sub.s32 	%r17, %r16, %r5;
	setp.lt.u32 	%p2, %r4, 128;
	selp.b32 	%r7, 18, %r17, %p2;
	setp.ge.s32 	%p3, %r31, %r7;
	mov.u64 	%rd97, 0;
	@%p3 bra 	LBB1_4;
	mov.b64 	%rd34, %fd4;
	shl.b64 	%rd35, %rd34, 11;
	or.b64  	%rd48, %rd35, -9223372036854775808;
	cvt.u64.u32 	%rd37, %r4;
	shr.u64 	%rd38, %rd37, 6;
	add.s32 	%r18, %r31, -15;
	cvt.s64.s32 	%rd39, %r18;
	add.s64 	%rd40, %rd38, %rd39;
	shl.b64 	%rd41, %rd40, 3;
	add.s64 	%rd95, %rd1, %rd41;
	mul.wide.s32 	%rd42, %r18, 8;
	mov.u64 	%rd43, __cudart_i2opi_d;
	add.s64 	%rd44, %rd42, %rd43;
	add.s64 	%rd94, %rd44, 120;
	mov.u64 	%rd97, 0;
	mov.u32 	%r30, %r31;
LBB1_3:
	ld.const.u64 	%rd47, [%rd94];
	// begin inline asm
	{
	.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi, clo, chi;
	mov.b64         {alo,ahi}, %rd47;    
	mov.b64         {blo,bhi}, %rd48;    
	mov.b64         {clo,chi}, %rd97;    
	mad.lo.cc.u32   r0, alo, blo, clo;
	madc.hi.cc.u32  r1, alo, blo, chi;
	madc.hi.u32     r2, alo, bhi,   0;
	mad.lo.cc.u32   r1, alo, bhi,  r1;
	madc.hi.cc.u32  r2, ahi, blo,  r2;
	madc.hi.u32     r3, ahi, bhi,   0;
	mad.lo.cc.u32   r1, ahi, blo,  r1;
	madc.lo.cc.u32  r2, ahi, bhi,  r2;
	addc.u32        r3,  r3,   0;     
	mov.b64         %rd45, {r0,r1};      
	mov.b64         %rd97, {r2,r3};      
	}
	// end inline asm
	st.local.u64 	[%rd95], %rd45;
	add.s32 	%r30, %r30, 1;
	add.s64 	%rd95, %rd95, 8;
	add.s64 	%rd94, %rd94, 8;
	setp.ne.s32 	%p4, %r7, %r30;
	mov.u32 	%r31, %r7;
	@%p4 bra 	LBB1_3;
LBB1_4:
	ld.param.u64 	%rd31, [__internal_trig_reduction_slowpathd_param_1];
	and.b32  	%r32, %r14, -2147483648;
	cvt.s64.s32 	%rd50, %r31;
	cvt.u64.u32 	%rd51, %r5;
	add.s64 	%rd52, %rd50, %rd51;
	shl.b64 	%rd53, %rd52, 3;
	add.s64 	%rd54, %rd1, %rd53;
	st.local.u64 	[%rd54+-120], %rd97;
	and.b32  	%r11, %r2, 63;
	ld.local.u64 	%rd98, [%rd1+16];
	ld.local.u64 	%rd99, [%rd1+24];
	setp.eq.s32 	%p5, %r11, 0;
	@%p5 bra 	LBB1_6;
	mov.u32 	%r19, 64;
	sub.s32 	%r20, %r19, %r11;
	shl.b64 	%rd55, %rd99, %r11;
	shr.u64 	%rd56, %rd98, %r20;
	or.b64  	%rd99, %rd55, %rd56;
	shl.b64 	%rd57, %rd98, %r11;
	ld.local.u64 	%rd58, [%rd1+8];
	shr.u64 	%rd59, %rd58, %r20;
	or.b64  	%rd98, %rd59, %rd57;
LBB1_6:
	shr.u64 	%rd60, %rd99, 62;
	cvt.u32.u64 	%r21, %rd60;
	shr.u64 	%rd61, %rd98, 62;
	shl.b64 	%rd62, %rd99, 2;
	or.b64  	%rd101, %rd62, %rd61;
	shl.b64 	%rd100, %rd98, 2;
	bfe.u64 	%rd63, %rd99, 61, 1;
	cvt.u32.u64 	%r22, %rd63;
	add.s32 	%r23, %r22, %r21;
	setp.eq.s32 	%p6, %r32, 0;
	neg.s32 	%r24, %r23;
	selp.b32 	%r25, %r23, %r24, %p6;
	st.u32 	[%rd31], %r25;
	setp.gt.s64 	%p7, %rd101, -1;
	@%p7 bra 	LBB1_8;
	xor.b32  	%r32, %r32, -2147483648;
	mov.u64 	%rd66, 0;
	// begin inline asm
	{
	.reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
	mov.b64         {a0,a1}, %rd66;
	mov.b64         {a2,a3}, %rd66;
	mov.b64         {b0,b1}, %rd100;
	mov.b64         {b2,b3}, %rd101;
	sub.cc.u32      r0, a0, b0; 
	subc.cc.u32     r1, a1, b1; 
	subc.cc.u32     r2, a2, b2; 
	subc.u32        r3, a3, b3; 
	mov.b64         %rd100, {r0,r1};
	mov.b64         %rd101, {r2,r3};
	}
	// end inline asm
LBB1_8:
	clz.b64 	%r26, %rd101;
	cvt.u64.u32 	%rd102, %r26;
	and.b32  	%r27, %r26, 63;
	shl.b64 	%rd75, %rd101, %r27;
	shr.u64 	%rd76, %rd100, 1;
	not.b32 	%r28, %r26;
	and.b32  	%r29, %r28, 63;
	shr.u64 	%rd77, %rd76, %r29;
	or.b64  	%rd72, %rd75, %rd77;
	mov.u64 	%rd73, -3958705157555305931;
	// begin inline asm
	{
	.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;
	mov.b64         {alo,ahi}, %rd72;   
	mov.b64         {blo,bhi}, %rd73;   
	mul.lo.u32      r0, alo, blo;    
	mul.hi.u32      r1, alo, blo;    
	mad.lo.cc.u32   r1, alo, bhi, r1;
	madc.hi.u32     r2, alo, bhi,  0;
	mad.lo.cc.u32   r1, ahi, blo, r1;
	madc.hi.cc.u32  r2, ahi, blo, r2;
	madc.hi.u32     r3, ahi, bhi,  0;
	mad.lo.cc.u32   r2, ahi, bhi, r2;
	addc.u32        r3, r3,  0;      
	mov.b64         %rd80, {r0,r1};     
	mov.b64         %rd103, {r2,r3};     
	}
	// end inline asm
	setp.lt.s64 	%p8, %rd103, 1;
	@%p8 bra 	LBB1_10;
	add.s64 	%rd102, %rd102, 1;
	// begin inline asm
	{
	.reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
	mov.b64         {a0,a1}, %rd80;
	mov.b64         {a2,a3}, %rd103;
	mov.b64         {b0,b1}, %rd80;
	mov.b64         {b2,b3}, %rd103;
	add.cc.u32      r0, a0, b0; 
	addc.cc.u32     r1, a1, b1; 
	addc.cc.u32     r2, a2, b2; 
	addc.u32        r3, a3, b3; 
	mov.b64         %rd78, {r0,r1};
	mov.b64         %rd103, {r2,r3};
	}
	// end inline asm
LBB1_10:
	cvt.u64.u32 	%rd84, %r32;
	shl.b64 	%rd85, %rd84, 32;
	shl.b64 	%rd86, %rd102, 52;
	add.s64 	%rd87, %rd103, 1;
	shr.u64 	%rd88, %rd87, 10;
	add.s64 	%rd89, %rd88, 1;
	shr.u64 	%rd90, %rd89, 1;
	sub.s64 	%rd91, %rd90, %rd86;
	add.s64 	%rd92, %rd91, 4602678819172646912;
	or.b64  	%rd93, %rd92, %rd85;
	mov.b64 	%fd4, %rd93;
LBB1_11:
	st.param.f64 	[func_retval0+0], %fd4;
	ret;

}

This PTX segfaults at runtime. However, it does not segfault if the target is sm_70; the PTX generated for sm_70 is identical except for the .target directive.

The SASS generated from this PTX (compiled with ptxas and disassembled with nvdisasm) differs considerably between the two compute capabilities.

NVIDIA doesn’t develop, support, or maintain Clang, SYCL, or DPC++.

PTX code by itself cannot cause a segfault; a segfault is the result of host-code behavior.
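
As a hypothetical illustration of that point (not taken from the reporter's code): in the sketch below the device code is perfectly valid, yet the program segfaults because the host dereferences a device pointer, which is easy to misattribute to the PTX or to libdevice.

// Hypothetical host-side bug (assumed example, not the reporter's code):
// the device code is fine; the crash comes from the host dereferencing a
// device pointer returned by cudaMalloc.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void writeOne(double *out) { out[0] = 1.0; }

int main()
{
    double *d_out = nullptr;
    cudaMalloc(&d_out, sizeof(double));
    writeOne<<<1, 1>>>(d_out);
    cudaDeviceSynchronize();
    printf("%f\n", d_out[0]);   // host dereference of a device pointer: segfault
    cudaFree(d_out);
    return 0;
}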

If you believe this behavior manifests with an NVIDIA-provided toolchain, I suggest providing a complete example.