Libdevice functions causing PTXAS segfault

user14604 · October 28, 2021, 12:42pm

Here is the PTX generated by the libdevice function __nv_sin().

// Module header: PTX ISA version 6.5, minimum target sm_50, 64-bit addresses.
.version 6.5
.target sm_50
.address_size 64

	// .weak	_ZTS9sinkernel
// Forward declaration (prototype only, terminated by ';') of the slow-path
// helper, so the kernel below can call it before its body is defined.
// It takes two 64-bit parameters and returns nothing. The caller in this
// module passes an f64 value as param_0 and a generic pointer as param_1.
.func __internal_trig_reduction_slowpathd
(
	.param .b64 __internal_trig_reduction_slowpathd_param_0,
	.param .b64 __internal_trig_reduction_slowpathd_param_1
)
;
// 144-byte table (18 x 64-bit words), 8-byte aligned, in the .const state space.
// The slow-path helper reads it one 64-bit word at a time with ld.const.u64.
// NOTE(review): the name and contents suggest a multi-word binary expansion of
// 2/pi stored least-significant word first -- confirm against libdevice.
.const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};

// Kernel entry point (weak linkage, no parameters).
// Takes a hard-coded double constant, computes a rounded quotient from it,
// stores that quotient in a 4-byte local slot, and calls
// __internal_trig_reduction_slowpathd only when the exponent test below fails.
// Nothing is written to global memory and nothing is read back after the call.
.weak .entry _ZTS9sinkernel()
{
	// 4-byte local scratch slot; receives the s32 quotient stored below and is
	// the object whose generic address is handed to the slow path as param1.
	.local .align 4 .b8 	__local_depot0[4];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<4>;
	.reg .b32 	%r<7>;
	.reg .b64 	%rd<4>;
	.reg .f64 	%fd<5>;

	// %SPL = local-space address of the scratch slot; %SP = the same location
	// converted to a generic address. %rd1 is the generic pointer, %rd2 the local one.
	mov.u64 	%SPL, __local_depot0;
	cvta.local.u64 	%SP, %SPL;
	add.u64 	%rd1, %SP, 0;
	add.u64 	%rd2, %SPL, 0;
	// Input value x: the double with bit pattern 0x400999999999999A (about 3.2).
	mov.f64 	%fd2, 0d400999999999999A;
	// Split x into its 32-bit halves: %r1 = low word, %r2 = high word.
	{
	.reg .b32 %temp; 
	mov.b64 	{%r1, %temp}, %fd2;
	}
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r2}, %fd2;
	}
	// %r3 = high word with the sign bit cleared (mask 0x7FFFFFFF).
	and.b32  	%r3, %r2, 2147483647;
	// %p1: high word (sign cleared) == 0x7FF00000, i.e. exponent all ones and
	// upper mantissa bits zero. %p2: low word == 0.
	// Both hold together only for +/- infinity.
	setp.eq.s32 	%p1, %r3, 2146435072;
	setp.eq.s32 	%p2, %r1, 0;
	// %fd1 = (p1 && p2) ? 0.0 : x  -- every other case selects x unchanged.
	selp.f64 	%fd3, 0d0000000000000000, 0d400999999999999A, %p2;
	selp.f64 	%fd1, %fd3, 0d400999999999999A, %p1;
	// Multiply by the double 0x3FE45F306DC9C883 (about 0.63662, i.e. 2/pi) and
	// round to the nearest 32-bit integer; the result is stored in the local slot.
	mul.rn.f64 	%fd4, %fd1, 0d3FE45F306DC9C883;
	cvt.rni.s32.f64 	%r4, %fd4;
	st.local.u32 	[%rd2], %r4;
	// %r5 = high word of %fd1.
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r5}, %fd1;
	}
	// Keep exponent bits only (mask 0x7FE00000) and compare, unsigned, with
	// 0x41E00000. When the masked value is smaller, the slow path is skipped.
	and.b32  	%r6, %r5, 2145386496;
	setp.lt.u32 	%p3, %r6, 1105199104;
	@%p3 bra 	LBB0_2;
	// Slow path: call __internal_trig_reduction_slowpathd(%fd1, %rd1).
	// param0 carries the f64 value, param1 the generic pointer to the local slot.
	// temp_param_reg is declared but not used in this call sequence.
	{ // callseq 0, 0
	.reg .b32 temp_param_reg;
	.param .b64 param0;
	st.param.f64 	[param0+0], %fd1;
	.param .b64 param1;
	st.param.b64 	[param1+0], %rd1;
	call.uni 
	__internal_trig_reduction_slowpathd, 
	(
	param0, 
	param1
	);
	} // callseq 0
LBB0_2:
	// Both paths return here; no result is consumed after the call.
	ret;

}
// __internal_trig_reduction_slowpathd(param_0, param_1)
//   param_0: 64-bit value loaded as an f64 (the argument being reduced).
//   param_1: 64-bit pointer; a u32 is stored through it with a generic-space
//            store (st.u32) once the signed quotient has been computed.
// Outline of the visible code:
//   1. Return immediately if the exponent field of param_0 is 2047.
//   2. Multiply the left-justified mantissa by a window of 64-bit words from
//      __cudart_i2opi_d, writing the partial products to a 40-byte local buffer.
//   3. Shift the upper words by (exponent & 63), extract a small integer from
//      the top bits, negate it for negative inputs, and store it via param_1.
//   4. Negate the remaining 128-bit value if its top bit is set, normalise it
//      with clz, and multiply by a 64-bit constant.
// The function has no return value. The final product and predicate %p8 are
// computed but not consumed before the ret in this listing.
.func __internal_trig_reduction_slowpathd(
	.param .b64 __internal_trig_reduction_slowpathd_param_0,
	.param .b64 __internal_trig_reduction_slowpathd_param_1
)
{
	// 40-byte local buffer = five 64-bit words of partial-product storage.
	.local .align 8 .b8 	__local_depot1[40];
	// %SP is declared but only %SPL is used in this function.
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<9>;
	.reg .b32 	%r<29>;
	.reg .b64 	%rd<87>;
	.reg .f64 	%fd<2>;

	mov.u64 	%SPL, __local_depot1;
	ld.param.f64 	%fd1, [__internal_trig_reduction_slowpathd_param_0];
	// %r1 = high 32 bits of the argument (sign, exponent, upper mantissa).
	{
	.reg .b32 %temp; 
	mov.b64 	{%temp, %r1}, %fd1;
	}
	// %r3 = 11-bit exponent field (bits 20..30 of the high word).
	bfe.u32 	%r3, %r1, 20, 11;
	// Exponent all ones (infinity or NaN): nothing to do, return.
	setp.eq.s32 	%p1, %r3, 2047;
	@%p1 bra 	LBB1_9;
	// %rd1 = local-space base address of the buffer.
	add.u64 	%rd1, %SPL, 0;
	// %r2 = sign and exponent bits; its low 6 bits are used later as a shift count.
	shr.u32 	%r2, %r1, 20;
	// %r4 = exponent field - 1024; %r5 = %r4 / 64 (unsigned shift).
	add.s32 	%r4, %r3, -1024;
	shr.u32 	%r5, %r4, 6;
	// %r28 = 15 - %r5 : first table index of the window.
	mov.u32 	%r12, 15;
	sub.s32 	%r28, %r12, %r5;
	// %r7 = end index of the window: 18 when %r4 < 128 (unsigned), else 19 - %r5.
	mov.u32 	%r13, 19;
	sub.s32 	%r14, %r13, %r5;
	setp.lt.u32 	%p2, %r4, 128;
	selp.b32 	%r7, 18, %r14, %p2;
	// Skip the multiply loop entirely when the window is empty (start >= end).
	setp.ge.s32 	%p3, %r28, %r7;
	// %rd82 = running carry (high word of the previous partial product).
	mov.u64 	%rd82, 0;
	@%p3 bra 	LBB1_4;
	// %rd43 = raw bits of the argument shifted left by 11 with bit 63 forced on,
	// i.e. the mantissa left-justified with its implicit leading one.
	mov.b64 	%rd29, %fd1;
	shl.b64 	%rd30, %rd29, 11;
	or.b64  	%rd43, %rd30, -9223372036854775808;
	// %rd80 = store pointer = buffer base + 8 * ((%r4 >> 6) + (%r28 - 15)).
	cvt.u64.u32 	%rd32, %r4;
	shr.u64 	%rd33, %rd32, 6;
	add.s32 	%r15, %r28, -15;
	cvt.s64.s32 	%rd34, %r15;
	add.s64 	%rd35, %rd33, %rd34;
	shl.b64 	%rd36, %rd35, 3;
	add.s64 	%rd80, %rd1, %rd36;
	// %rd79 = load pointer = __cudart_i2opi_d + 8 * (%r28 - 15) + 120.
	mul.wide.s32 	%rd37, %r15, 8;
	mov.u64 	%rd38, __cudart_i2opi_d;
	add.s64 	%rd39, %rd37, %rd38;
	add.s64 	%rd79, %rd39, 120;
	mov.u64 	%rd82, 0;
	// %r27 = loop counter, running from %r28 up to (but not including) %r7.
	mov.u32 	%r27, %r28;
LBB1_3:
	// Loop body: one 64-bit table word per iteration.
	ld.const.u64 	%rd42, [%rd79];
	// 64 x 64 + 64 -> 128-bit multiply-accumulate built from 32-bit
	// mad/madc with carry chaining:
	//   {%rd82 : %rd40} = %rd42 * %rd43 + %rd82
	// %rd40 receives the low 64 bits, %rd82 the high 64 bits (next carry).
	// r0..r3 here are block-local registers, unrelated to %r0..%r3.
	// begin inline asm
	{
	.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi, clo, chi;
	mov.b64         {alo,ahi}, %rd42;    
	mov.b64         {blo,bhi}, %rd43;    
	mov.b64         {clo,chi}, %rd82;    
	mad.lo.cc.u32   r0, alo, blo, clo;
	madc.hi.cc.u32  r1, alo, blo, chi;
	madc.hi.u32     r2, alo, bhi,   0;
	mad.lo.cc.u32   r1, alo, bhi,  r1;
	madc.hi.cc.u32  r2, ahi, blo,  r2;
	madc.hi.u32     r3, ahi, bhi,   0;
	mad.lo.cc.u32   r1, ahi, blo,  r1;
	madc.lo.cc.u32  r2, ahi, bhi,  r2;
	addc.u32        r3,  r3,   0;     
	mov.b64         %rd40, {r0,r1};      
	mov.b64         %rd82, {r2,r3};      
	}
	// end inline asm
	// Store the low word, then advance the counter and both pointers by one word.
	st.local.u64 	[%rd80], %rd40;
	add.s32 	%r27, %r27, 1;
	add.s64 	%rd80, %rd80, 8;
	add.s64 	%rd79, %rd79, 8;
	setp.ne.s32 	%p4, %r7, %r27;
	// %r28 is overwritten with the end index so the code after the loop sees
	// %r28 == %r7 whenever the loop has run.
	mov.u32 	%r28, %r7;
	// Loop back-edge. This is the branch the original report says must be
	// commented out to avoid the ptxas crash at -O1 and above.
	@%p4 bra 	LBB1_3;
LBB1_4:
	ld.param.u64 	%rd26, [__internal_trig_reduction_slowpathd_param_1];
	// Store the final carry word at buffer index (%r28 + %r5 - 15);
	// the -120 byte displacement is the -15 * 8 rebasing of that index.
	cvt.s64.s32 	%rd45, %r28;
	cvt.u64.u32 	%rd46, %r5;
	add.s64 	%rd47, %rd45, %rd46;
	shl.b64 	%rd48, %rd47, 3;
	add.s64 	%rd49, %rd1, %rd48;
	st.local.u64 	[%rd49+-120], %rd82;
	// %r11 = low 6 bits of (high word >> 20): the bit-level shift amount.
	and.b32  	%r11, %r2, 63;
	// %rd83 / %rd84 = buffer words at byte offsets 16 and 24.
	ld.local.u64 	%rd83, [%rd1+16];
	ld.local.u64 	%rd84, [%rd1+24];
	// A shift of zero needs no work (and would make the 64 - n right shift 64).
	setp.eq.s32 	%p5, %r11, 0;
	@%p5 bra 	LBB1_6;
	// Funnel-shift the three words at offsets 24, 16, 8 left by %r11 bits:
	//   %rd84 = (%rd84 << n) | (%rd83 >> (64 - n))
	//   %rd83 = (%rd83 << n) | (word@8 >> (64 - n))
	mov.u32 	%r16, 64;
	sub.s32 	%r17, %r16, %r11;
	shl.b64 	%rd50, %rd84, %r11;
	shr.u64 	%rd51, %rd83, %r17;
	or.b64  	%rd84, %rd50, %rd51;
	shl.b64 	%rd52, %rd83, %r11;
	ld.local.u64 	%rd53, [%rd1+8];
	shr.u64 	%rd54, %rd53, %r17;
	or.b64  	%rd83, %rd54, %rd52;
LBB1_6:
	// %r18 = top two bits of %rd84.
	shr.u64 	%rd55, %rd84, 62;
	cvt.u32.u64 	%r18, %rd55;
	// {%rd86 : %rd85} = {%rd84 : %rd83} shifted left by 2 (top two bits dropped).
	shr.u64 	%rd56, %rd83, 62;
	shl.b64 	%rd57, %rd84, 2;
	or.b64  	%rd86, %rd57, %rd56;
	shl.b64 	%rd85, %rd83, 2;
	// %r19 = bit 61 of %rd84; adding it to %r18 rounds the 2-bit value up.
	bfe.u64 	%rd58, %rd84, 61, 1;
	cvt.u32.u64 	%r19, %rd58;
	add.s32 	%r20, %r19, %r18;
	// %p6 is true when the sign bit of the argument is clear; the value is
	// negated for negative arguments and stored as u32 through param_1.
	setp.gt.s32 	%p6, %r1, -1;
	neg.s32 	%r21, %r20;
	selp.b32 	%r22, %r20, %r21, %p6;
	st.u32 	[%rd26], %r22;
	// If the top bit of %rd86 is clear, skip the negation below.
	setp.gt.s64 	%p7, %rd86, -1;
	@%p7 bra 	LBB1_8;
	mov.u64 	%rd61, 0;
	// 128-bit negate via a borrow-chained subtract from zero:
	//   {%rd86 : %rd85} = 0 - {%rd86 : %rd85}
	// begin inline asm
	{
	.reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
	mov.b64         {a0,a1}, %rd61;
	mov.b64         {a2,a3}, %rd61;
	mov.b64         {b0,b1}, %rd85;
	mov.b64         {b2,b3}, %rd86;
	sub.cc.u32      r0, a0, b0; 
	subc.cc.u32     r1, a1, b1; 
	subc.cc.u32     r2, a2, b2; 
	subc.u32        r3, a3, b3; 
	mov.b64         %rd85, {r0,r1};
	mov.b64         %rd86, {r2,r3};
	}
	// end inline asm
LBB1_8:
	// Normalise: shift {%rd86 : %rd85} left by clz(%rd86), keeping the top 64
	// bits in %rd67. The low word is pre-shifted right by 1 and then by
	// (~clz & 63) = 63 - clz, which together equal a right shift by (64 - clz)
	// without ever using a shift count of 64.
	clz.b64 	%r23, %rd86;
	and.b32  	%r24, %r23, 63;
	shl.b64 	%rd70, %rd86, %r24;
	shr.u64 	%rd71, %rd85, 1;
	not.b32 	%r25, %r23;
	and.b32  	%r26, %r25, 63;
	shr.u64 	%rd72, %rd71, %r26;
	or.b64  	%rd67, %rd70, %rd72;
	// Multiplier constant, bit pattern 0xC90FDAA22168C235.
	// NOTE(review): presumably the leading 64 bits of pi -- confirm.
	mov.u64 	%rd68, -3958705157555305931;
	// Full 64 x 64 -> 128-bit unsigned multiply from 32-bit pieces:
	//   {%rd66 : %rd65} = %rd67 * %rd68
	// begin inline asm
	{
	.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;
	mov.b64         {alo,ahi}, %rd67;   
	mov.b64         {blo,bhi}, %rd68;   
	mul.lo.u32      r0, alo, blo;    
	mul.hi.u32      r1, alo, blo;    
	mad.lo.cc.u32   r1, alo, bhi, r1;
	madc.hi.u32     r2, alo, bhi,  0;
	mad.lo.cc.u32   r1, ahi, blo, r1;
	madc.hi.cc.u32  r2, ahi, blo, r2;
	madc.hi.u32     r3, ahi, bhi,  0;
	mad.lo.cc.u32   r2, ahi, bhi, r2;
	addc.u32        r3, r3,  0;      
	mov.b64         %rd65, {r0,r1};     
	mov.b64         %rd66, {r2,r3};     
	}
	// end inline asm
	// %p8 = (high word of the product < 1, signed). Neither %p8 nor the product
	// is used before the return in this listing.
	setp.lt.s64 	%p8, %rd66, 1;
LBB1_9:
	ret;

}

Invoking:

/usr/local/cuda-11.2/bin/ptxas -arch=sm_50 my_ptx.ptx -O0

This completes without error. However, raising the optimisation level above -O0 causes a segmentation fault:

/usr/local/cuda-11.2/bin/ptxas -arch=sm_50 my_ptx.ptx -O1
Segmentation fault

The segmentation fault does not occur if the loop's back-edge branch, @%p4 bra LBB1_3;, at the end of block LBB1_3 is commented out.

This also occurs for libdevice functions __nv_cos(), __nv_tan(), and __nv_sincos().

Robert_Crovella · October 28, 2021, 1:44pm

I recommend:

- testing against the latest CUDA version (currently: 11.5)
- if the issue still occurs there, filing a bug

user14604 · October 28, 2021, 3:11pm

Thanks for the response. The bug also reproduces with CUDA 11.5. Bug report filed: https://developer.nvidia.com/nvidia_bug/3419212

Sorry for poorly formatted text in bug report, but I don’t seem to be able to edit my original post.

Robert_Crovella · October 28, 2021, 3:16pm

Thanks for filing the bug, I will link it to the one I filed, and clean it up if needed.

user14604 · October 28, 2021, 3:21pm

Thanks Robert!

Topic		Replies	Views
Libdevice functions segfaulting at runtime CUDA Programming and Performance	1	387	October 25, 2021
nvcc Segfault CUDA Programming and Performance	6	11415	October 14, 2010
Cannot run any CUDA kernels CUDA runtime doesn't recognize NVIDIA GPU CUDA Programming and Performance	26	12387	August 24, 2010
CUDA 9.2 (9.2.148) Update1, nvcc compiler bug CUDA Programming and Performance	10	1126	August 18, 2018
Bug in nvcc, incorrect sign-extend for textures Incorrect sign-extend when the results of one textur CUDA Programming and Performance	3	2836	May 5, 2009
CUDA 5.5 produces different PTX code from CUDA 5.0 with degraded performance CUDA Programming and Performance	3	1438	August 15, 2013
Nv-nsight-cu-cli segfault Nsight Compute cuda	6	1763	May 5, 2020
Segmentation fault in pthread_mutex_lock () Legacy PGI Compilers	6	10527	June 29, 2022
CUDA 8.0.26 Inline PTX "addc" Bug CUDA Programming and Performance	18	1490	December 26, 2017
Inline functions not inlined in CUDA 6.5? CUDA Programming and Performance	7	5943	November 29, 2014

Libdevice functions causing PTXAS segfault

Related topics