__int128 causes optix complaining OPTIX_ERROR_INTERNAL_COMPILER_ERROR

Hi, the Optix community,

I encountered a bug when trying to use __int128 in Optix. A compile error is thrown when calling optixModuleCreateFromPTX.

    struct LaunchParams {
        uint32_t width;
        uint32_t height;
        uint32_t depth;

        float3 *vertices;
        uint3 *indices;

        OptixTraversableHandle traversable;
        __int128 a, b, c;
    };
    extern "C" __global__ void __anyhit__radiance() {
        double f = -optixLaunchParams.a * optixLaunchParams.b / optixLaunchParams.c; // This makes optix failed to compile ptx code
        printf("%lf\n", f);
    }
.visible .entry __anyhit__radiance()
{
	.local .align 8 .b8 	__local_depot1[8];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<22>;
	.reg .b32 	%r<417>;
	.reg .f64 	%fd<2>;
	.reg .b64 	%rd<47>;
	.loc	1 38 0


	mov.u64 	%SPL, __local_depot1;
	cvta.local.u64 	%SP, %SPL;
	add.u64 	%rd13, %SP, 0;
	add.u64 	%rd1, %SPL, 0;
	.loc	1 54 18
	ld.const.v2.u64 	{%rd14, %rd15}, [optixLaunchParams+48];
	mov.u64 	%rd5, 0;
	sub.cc.s64 	%rd18, %rd5, %rd14;
	subc.cc.s64 	%rd19, %rd5, %rd15;
	ld.const.v2.u64 	{%rd20, %rd21}, [optixLaunchParams+64];
	mul.lo.s64 	%rd24, %rd18, %rd21;
	mul.hi.u64 	%rd25, %rd18, %rd20;
	add.s64 	%rd26, %rd25, %rd24;
	mul.lo.s64 	%rd27, %rd19, %rd20;
	add.s64 	%rd28, %rd26, %rd27;
	mul.lo.s64 	%rd29, %rd18, %rd20;
	ld.const.v2.u64 	{%rd30, %rd31}, [optixLaunchParams+80];
	cvt.u32.u64 	%r82, %rd29;
	shr.u64 	%rd34, %rd29, 32;
	cvt.u32.u64 	%r85, %rd34;
	mov.u32 	%r194, 0;
	cvt.u32.u64 	%r88, %rd28;
	shr.u64 	%rd35, %rd28, 32;
	cvt.u32.u64 	%r91, %rd35;
	cvt.u32.u64 	%r94, %rd30;
	mov.b64 	{%r195, %r97}, %rd30;
	mov.b64 	{%r100, %r103}, %rd31;
	shr.u64 	%rd2, %rd28, 63;
	// begin inline asm
	sub.cc.u32 %r80,%r194,%r82;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r83,%r194,%r85;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r86,%r194,%r88;
	// end inline asm
	// begin inline asm
	subc.u32 %r89,%r194,%r91;
	// end inline asm
	setp.eq.s64 	%p1, %rd2, 0;
	selp.b32 	%r125, %r82, %r80, %p1;
	selp.b32 	%r129, %r85, %r83, %p1;
	selp.b32 	%r133, %r88, %r86, %p1;
	selp.b32 	%r137, %r91, %r89, %p1;
	shr.u64 	%rd4, %rd31, 63;
	// begin inline asm
	sub.cc.u32 %r92,%r194,%r94;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r95,%r194,%r97;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r98,%r194,%r100;
	// end inline asm
	// begin inline asm
	subc.u32 %r101,%r194,%r103;
	// end inline asm
	setp.eq.s64 	%p2, %rd4, 0;
	selp.b32 	%r196, %r94, %r92, %p2;
	selp.b32 	%r197, %r97, %r95, %p2;
	selp.b32 	%r198, %r100, %r98, %p2;
	selp.b32 	%r199, %r103, %r101, %p2;
	or.b32  	%r200, %r199, %r198;
	setp.eq.s32 	%p3, %r200, 0;
	selp.b32 	%r201, %r197, %r199, %p3;
	selp.b32 	%r202, %r196, %r198, %p3;
	selp.b32 	%r203, 0, %r197, %p3;
	selp.b32 	%r204, 0, %r196, %p3;
	selp.b32 	%r205, 64, 0, %p3;
	setp.eq.s32 	%p4, %r201, 0;
	selp.b32 	%r106, %r202, %r201, %p4;
	selp.b32 	%r110, %r203, %r202, %p4;
	selp.b32 	%r114, %r204, %r203, %p4;
	selp.b32 	%r118, 0, %r204, %p4;
	selp.b32 	%r206, 32, 0, %p4;
	or.b32  	%r207, %r206, %r205;
	clz.b32 	%r208, %r106;
	add.s32 	%r139, %r207, %r208;
	// begin inline asm
	shf.l.wrap.b32 %r104,%r110,%r106,%r139;
	// end inline asm
	// begin inline asm
	shf.l.wrap.b32 %r108,%r114,%r110,%r139;
	// end inline asm
	// begin inline asm
	shf.l.wrap.b32 %r112,%r118,%r114,%r139;
	// end inline asm
	// begin inline asm
	shf.l.wrap.b32 %r116,%r194,%r118,%r139;
	// end inline asm
	// begin inline asm
	shf.l.wrap.b32 %r392,%r194,%r125,%r139;
	// end inline asm
	// begin inline asm
	shf.l.wrap.b32 %r391,%r125,%r129,%r139;
	// end inline asm
	// begin inline asm
	shf.l.wrap.b32 %r390,%r129,%r133,%r139;
	// end inline asm
	// begin inline asm
	shf.l.wrap.b32 %r389,%r133,%r137,%r139;
	// end inline asm
	// begin inline asm
	shf.l.wrap.b32 %r388,%r137,%r194,%r139;
	// end inline asm
	// begin inline asm
	mov.b64 %rd6,{%r108,%r104};
	// end inline asm
	// begin inline asm
	{
	   .reg .f32 ftmp;
	   .reg .u32 qtmp;
	   prmt.b32       ftmp,%r104,0x3F,0x4321;
	   rcp.approx.f32 ftmp,ftmp;
	   mov.b32        qtmp,ftmp;
	   mad.lo.u32     %r142,qtmp,512,0xFFFFFE00;
	}
	// end inline asm
	mul.hi.u32 	%r209, %r142, %r108;
	cvt.u64.u32 	%rd36, %r209;
	// begin inline asm
	mad.wide.u32 %rd7,%r142,%r104,%rd6;
	// end inline asm
	add.s64 	%rd37, %rd7, %rd36;
	neg.s64 	%rd9, %rd37;
	// begin inline asm
	mov.b64 {%r146,%r147},%rd9;
	// end inline asm
	setp.gt.s32 	%p5, %r147, -1;
	selp.b32 	%r174, %r142, 0, %p5;
	shr.u32 	%r210, %r147, 31;
	xor.b32  	%r177, %r210, 1;
	mov.u32 	%r151, -2147483648;
	// begin inline asm
	mad.hi.cc.u32 %r148,%r142,%r146,%r151;
	// end inline asm
	// begin inline asm
	addc.u32 %r152,%r194,%r194;
	// end inline asm
	// begin inline asm
	mad.lo.cc.u32 %r155,%r142,%r147,%r148;
	// end inline asm
	// begin inline asm
	madc.hi.u32 %r159,%r142,%r147,%r152;
	// end inline asm
	// begin inline asm
	add.cc.u32 %r163,%r155,%r146;
	// end inline asm
	// begin inline asm
	addc.cc.u32 %r166,%r159,%r147;
	// end inline asm
	// begin inline asm
	addc.u32 %r169,%r194,%r194;
	// end inline asm
	// begin inline asm
	add.cc.u32 %r172,%r166,%r174;
	// end inline asm
	// begin inline asm
	addc.u32 %r175,%r169,%r177;
	// end inline asm
	setp.eq.s32 	%p6, %r175, 2;
	selp.b32 	%r178, -1, %r172, %p6;
	mul.hi.u32 	%r211, %r178, %r108;
	cvt.u64.u32 	%rd11, %r211;
	// begin inline asm
	mad.wide.u32 %rd10,%r178,%r104,%rd11;
	// end inline asm
	// begin inline asm
	mov.b64 {%r180,%r181},%rd10;
	// end inline asm
	// begin inline asm
	add.cc.u32 %r182,%r180,%r108;
	// end inline asm
	// begin inline asm
	addc.cc.u32 %r185,%r181,%r104;
	// end inline asm
	// begin inline asm
	addc.u32 %r188,%r194,%r194;
	// end inline asm
	setp.eq.s32 	%p7, %r188, 0;
	setp.ne.s32 	%p8, %r178, -1;
	and.pred  	%p9, %p8, %p7;
	selp.u32 	%r212, 1, 0, %p9;
	add.s32 	%r11, %r178, %r212;
	mov.u32 	%r386, %r194;
	mov.u32 	%r400, %r194;
	mov.u32 	%r410, %r194;
	mov.u32 	%r411, %r194;

$L__BB1_1:
	.pragma "nounroll";
	mov.u32 	%r412, %r411;
	mov.u32 	%r411, %r410;
	mov.u32 	%r410, %r400;
	// begin inline asm
	sub.cc.u32 %r213,%r390,%r108;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r216,%r389,%r104;
	// end inline asm
	mov.u32 	%r221, -2;
	// begin inline asm
	subc.u32 %r219,%r388,%r221;
	// end inline asm
	// begin inline asm
	mad.hi.u32 %r222,%r388,%r11,%r219;
	// end inline asm
	setp.lt.u32 	%p10, %r222, %r388;
	selp.b32 	%r400, -1, %r222, %p10;
	// begin inline asm
	mad.lo.cc.u32 %r226,%r400,%r108,%r194;
	// end inline asm
	// begin inline asm
	madc.hi.u32 %r230,%r400,%r108,%r194;
	// end inline asm
	// begin inline asm
	mad.lo.cc.u32 %r234,%r400,%r104,%r230;
	// end inline asm
	// begin inline asm
	madc.hi.u32 %r238,%r400,%r104,%r194;
	// end inline asm
	// begin inline asm
	sub.cc.u32 %r401,%r390,%r226;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r402,%r389,%r234;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r403,%r388,%r238;
	// end inline asm
	// begin inline asm
	addc.u32 %r399,%r194,%r194;
	// end inline asm
	setp.ne.s32 	%p11, %r399, 0;
	@%p11 bra 	$L__BB1_3;

	// begin inline asm
	add.cc.u32 %r401,%r401,%r108;
	// end inline asm
	// begin inline asm
	addc.cc.u32 %r402,%r402,%r104;
	// end inline asm
	mov.u32 	%r265, 0;
	// begin inline asm
	addc.cc.u32 %r403,%r403,%r265;
	// end inline asm
	// begin inline asm
	addc.u32 %r399,%r265,%r265;
	// end inline asm
	add.s32 	%r400, %r400, -1;

$L__BB1_3:
	setp.ne.s32 	%p12, %r399, 0;
	@%p12 bra 	$L__BB1_5;

	// begin inline asm
	add.cc.u32 %r401,%r401,%r108;
	// end inline asm
	// begin inline asm
	addc.cc.u32 %r402,%r402,%r104;
	// end inline asm
	mov.u32 	%r274, 0;
	// begin inline asm
	addc.u32 %r403,%r403,%r274;
	// end inline asm
	add.s32 	%r400, %r400, -1;

$L__BB1_5:
	mov.u32 	%r308, 0;
	// begin inline asm
	mad.lo.cc.u32 %r275,%r400,%r116,%r308;
	// end inline asm
	// begin inline asm
	madc.hi.u32 %r279,%r400,%r116,%r308;
	// end inline asm
	// begin inline asm
	mad.lo.cc.u32 %r283,%r400,%r112,%r279;
	// end inline asm
	// begin inline asm
	madc.hi.u32 %r287,%r400,%r112,%r308;
	// end inline asm
	// begin inline asm
	sub.cc.u32 %r404,%r392,%r275;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r390,%r391,%r283;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r389,%r401,%r287;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r388,%r402,%r308;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r303,%r403,%r308;
	// end inline asm
	// begin inline asm
	subc.u32 %r306,%r308,%r308;
	// end inline asm
	setp.ne.s32 	%p13, %r306, -1;
	@%p13 bra 	$L__BB1_7;

	add.s32 	%r400, %r400, -1;
	// begin inline asm
	add.cc.u32 %r404,%r404,%r116;
	// end inline asm
	// begin inline asm
	addc.cc.u32 %r390,%r390,%r112;
	// end inline asm
	// begin inline asm
	addc.cc.u32 %r389,%r389,%r108;
	// end inline asm
	// begin inline asm
	addc.u32 %r388,%r388,%r104;
	// end inline asm

$L__BB1_7:
	add.s32 	%r386, %r386, 32;
	setp.le.u32 	%p14, %r386, %r139;
	mov.u32 	%r391, %r404;
	mov.u32 	%r392, %r308;
	@%p14 bra 	$L__BB1_1;

	xor.b64  	%rd38, %rd2, %rd4;
	setp.eq.s64 	%p15, %rd38, 0;
	@%p15 bra 	$L__BB1_10;

	mov.u32 	%r332, 0;
	// begin inline asm
	sub.cc.u32 %r400,%r332,%r400;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r410,%r332,%r410;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r411,%r332,%r411;
	// end inline asm
	// begin inline asm
	subc.u32 %r412,%r332,%r412;
	// end inline asm

$L__BB1_10:
	setp.eq.s32 	%p16, %r139, 128;
	selp.b32 	%r416, -1, %r412, %p16;
	selp.b32 	%r415, -1, %r411, %p16;
	selp.b32 	%r414, -1, %r410, %p16;
	selp.b32 	%r413, -1, %r400, %p16;
	and.b32  	%r71, %r416, -2147483648;
	setp.eq.s32 	%p17, %r71, 0;
	@%p17 bra 	$L__BB1_12;

	mov.u32 	%r344, 0;
	// begin inline asm
	sub.cc.u32 %r413,%r344,%r413;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r414,%r344,%r414;
	// end inline asm
	// begin inline asm
	subc.cc.u32 %r415,%r344,%r415;
	// end inline asm
	// begin inline asm
	subc.u32 %r416,%r344,%r416;
	// end inline asm

$L__BB1_12:
	or.b32  	%r372, %r416, %r415;
	setp.eq.s32 	%p18, %r372, 0;
	mov.u32 	%r359, 0;
	selp.b32 	%r373, %r414, %r416, %p18;
	selp.b32 	%r374, %r413, %r415, %p18;
	selp.b32 	%r375, 0, %r414, %p18;
	selp.b32 	%r376, 0, %r413, %p18;
	selp.b32 	%r377, 64, 0, %p18;
	setp.eq.s32 	%p19, %r373, 0;
	selp.b32 	%r348, %r374, %r373, %p19;
	selp.b32 	%r352, %r375, %r374, %p19;
	selp.b32 	%r356, %r376, %r375, %p19;
	selp.b32 	%r360, 0, %r376, %p19;
	selp.b32 	%r378, 32, 0, %p19;
	or.b32  	%r379, %r378, %r377;
	clz.b32 	%r380, %r348;
	add.s32 	%r361, %r379, %r380;
	// begin inline asm
	shf.l.wrap.b32 %r346,%r352,%r348,%r361;
	// end inline asm
	// begin inline asm
	shf.l.wrap.b32 %r350,%r356,%r352,%r361;
	// end inline asm
	// begin inline asm
	shf.l.wrap.b32 %r354,%r360,%r356,%r361;
	// end inline asm
	// begin inline asm
	shf.l.wrap.b32 %r358,%r359,%r360,%r361;
	// end inline asm
	shl.b32 	%r381, %r361, 20;
	or.b32  	%r382, %r358, %r354;
	setp.ne.s32 	%p20, %r382, 0;
	selp.u32 	%r383, 1, 0, %p20;
	or.b32  	%r362, %r350, %r383;
	// begin inline asm
	mov.b64 %rd39,{%r362,%r346};
	// end inline asm
	cvt.rn.f64.u64 	%fd1, %rd39;
	mov.b64 	%rd40, %fd1;
	// begin inline asm
	mov.b64 {%r364,%r365},%rd40;
	// end inline asm
	add.s32 	%r384, %r365, 67108864;
	sub.s32 	%r367, %r384, %r381;
	// begin inline asm
	mov.b64 %rd41,{%r364,%r367};
	// end inline asm
	setp.eq.s32 	%p21, %r361, 128;
	selp.b64 	%rd42, 0, %rd41, %p21;
	// begin inline asm
	mov.b64 {%r368,%r369},%rd42;
	// end inline asm
	or.b32  	%r371, %r369, %r71;
	// begin inline asm
	mov.b64 %rd43,{%r368,%r371};
	// end inline asm
	.loc	1 55 9
	st.local.u64 	[%rd1], %rd43;
	mov.u64 	%rd44, $str;
	cvta.global.u64 	%rd45, %rd44;
	{ // callseq 0, 0
	.reg .b32 temp_param_reg;
	.param .b64 param0;
	st.param.b64 	[param0+0], %rd45;
	.param .b64 param1;
	st.param.b64 	[param1+0], %rd13;
	.param .b32 retval0;
	call.uni (retval0), 
	vprintf, 
	(
	param0, 
	param1
	);
	ld.param.b32 	%r385, [retval0+0];
	} // callseq 0
	.loc	1 56 5
	ret;

}

I make a minimal code that produces this bug: GitHub - pwrliang/optix7course at optixbug

Environment:
CUDA 11.8
Optix 7.6
GCC 9.4.0
Driver Version: 520.61.05
OS: Ubuntu 20.04.5

Thanks or the report.

I filed an internal bug report for reproduction and analysis of the issue.

Hi Droettger, do you have any conclusions about this issue? Since I’m doing high precision with Optix, the __int128 supporting really matters to me. Thanks.

The necessary code changes to make this work exist internally.
It’s just a matter of merging all related changes in two different driver modules to a common driver release branch.
I cannot say which display driver version will get the fix at this time.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.