I have been developing and testing rendering software for public users, which has run on the GPU (with CUDA 8.0) and on the CPU (Windows x64/x86, Mac OS X 10.9) without any problems. Recently I upgraded CUDA from 8.0 to 9.2(.148) and encountered a strange rendering glitch.
By tracing the glitch to its source, I found that the cubin nvcc generates for the following code does not work properly.
#define CLOSURE_IS_BSDF(type) ((type) <= CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSSRDF(type) ((type) >= CLOSURE_BSSRDF_CUBIC_ID && (type) <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (CLOSURE_IS_BSDF(type)||CLOSURE_IS_BSSRDF(type))
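/* For reference when reading the PTX below: the comparisons against 38 and 44
 * imply CLOSURE_BSDF_TRANSPARENT_ID == 38, CLOSURE_BSSRDF_CUBIC_ID == 39 and
 * CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID == 44, i.e.
 *   CLOSURE_IS_BSDF(type)            <=>  type <= 38
 *   CLOSURE_IS_BSSRDF(type)          <=>  39 <= type <= 44
 *   CLOSURE_IS_BSDF_OR_BSSRDF(type)  <=>  type <= 44 */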
__device__ __inline__ const ShaderClosure *shader_bssrdf_pick(ShaderData *sd, float3 *throughput, float *randu)
{
    int sampled = 0;
    if(sd->num_closure > 1)
    {
        /* Pick a BSDF or BSSRDF based on sample weights. */
        float sum_bssrdf = 0.0f;
        float sum = 0.0f;
        float sample_weight;
        for(int i = 0; i < sd->num_closure; i++)
        {
            const ShaderClosure *sc = &sd->closure[i];
            if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
            {
                sample_weight = sc->sample_weight;
                sum += sample_weight;
                if(CLOSURE_IS_BSSRDF(sc->type))
                    sum_bssrdf += sample_weight;
            }
        }
        float r = (*randu)*sum;
        float partial_sum = 0.0f;
        for(int i = 0; i < sd->num_closure; i++)
        {
            const ShaderClosure *sc = &sd->closure[i];
            if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
            {
                sample_weight = sc->sample_weight;
                float next_sum = partial_sum + sample_weight;
                if(r < next_sum)
                {
                    if(CLOSURE_IS_BSDF(sc->type))
                    {
                        *throughput *= sum / (sum - sum_bssrdf);
                        return NULL;
                    }
                    else // if (CLOSURE_IS_BSSRDF(sc->type))
                    {
                        *throughput *= sum / sum_bssrdf;
                        sampled = i;
                        /* Rescale to reuse for direction sample, to better
                         * preserve stratification. */
                        *randu = (r - partial_sum) / sample_weight;
                        break; // !!!! buggy !!!!
                        // After the 'break' out of this for-loop, the function is
                        // supposed to return &sd->closure[sampled] (== sc),
                        // but it turns out to return NULL all the time.
                        // However, if the 'break' is replaced with 'return sc',
                        // the function works correctly.
                    }
                }
                partial_sum = next_sum;
            }
        }
    }
    const ShaderClosure *sc = &sd->closure[sampled];
    return CLOSURE_IS_BSSRDF(sc->type) ? sc : NULL;
}
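For clarity, the working variant mentioned in the comment changes only the tail of that BSSRDF branch; everything else is identical:

    else // if (CLOSURE_IS_BSSRDF(sc->type))
    {
        *throughput *= sum / sum_bssrdf;
        sampled = i;
        /* Rescale to reuse for direction sample, to better
         * preserve stratification. */
        *randu = (r - partial_sum) / sample_weight;
        return sc; // instead of 'break' -- this compiles to a correct cubin
    }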
The PTX that nvcc generates for the above code is as follows:
.func (.param .b64 func_retval0) _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf(
.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_0,
.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_1,
.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_2
)
{
.reg .pred %p<29>;
.reg .f32 %f<108>;
.reg .b32 %r<40>;
.reg .b64 %rd<34>;
ld.param.u64 %rd15, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_0];
ld.param.u64 %rd16, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_1];
ld.param.u64 %rd17, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_2];
cvta.to.local.u64 %rd1, %rd17;
add.s64 %rd2, %rd15, 328;
ld.u32 %r1, [%rd15+328];
mov.u32 %r39, 0;
setp.lt.s32 %p1, %r1, 2;
@%p1 bra BB57_34;
mov.f32 %f38, 0f00000000;
setp.lt.s32 %p2, %r1, 1;
mov.f32 %f94, %f38;
mov.f32 %f95, %f38;
@%p2 bra BB57_26;
and.b32 %r2, %r1, 3;
setp.eq.s32 %p3, %r2, 0;
mov.f32 %f94, 0f00000000;
mov.u32 %r36, 0;
mov.f32 %f95, %f94;
@%p3 bra BB57_15;
setp.eq.s32 %p4, %r2, 1;
mov.f32 %f94, 0f00000000;
mov.u32 %r35, 0;
mov.f32 %f95, %f94;
@%p4 bra BB57_12;
setp.eq.s32 %p5, %r2, 2;
mov.f32 %f94, 0f00000000;
mov.u32 %r34, 0;
@%p5 bra BB57_5;
bra.uni BB57_6;
BB57_5:
mov.f32 %f95, %f94;
bra.uni BB57_9;
BB57_6:
ld.u32 %r3, [%rd2+84];
mov.u32 %r34, 1;
setp.gt.s32 %p6, %r3, 44;
@%p6 bra BB57_7;
ld.f32 %f52, [%rd2+88];
add.ftz.f32 %f95, %f52, 0f00000000;
setp.gt.s32 %p7, %r3, 38;
selp.f32 %f94, %f95, 0f00000000, %p7;
bra.uni BB57_9;
BB57_7:
mov.f32 %f95, %f94;
BB57_9:
cvt.u64.u32 %rd3, %r34;
mul.wide.u32 %rd18, %r34, 96;
add.s64 %rd19, %rd15, %rd18;
add.s64 %rd4, %rd19, 412;
ld.u32 %r5, [%rd19+412];
setp.gt.s32 %p8, %r5, 44;
@%p8 bra BB57_11;
ld.f32 %f53, [%rd4+4];
add.ftz.f32 %f95, %f95, %f53;
add.ftz.f32 %f54, %f94, %f53;
setp.gt.s32 %p9, %r5, 38;
selp.f32 %f94, %f54, %f94, %p9;
BB57_11:
cvt.u32.u64 %r28, %rd3;
add.s32 %r35, %r28, 1;
BB57_12:
cvt.s64.s32 %rd5, %r35;
mul.wide.s32 %rd20, %r35, 96;
add.s64 %rd21, %rd15, %rd20;
add.s64 %rd6, %rd21, 412;
ld.u32 %r8, [%rd21+412];
setp.gt.s32 %p10, %r8, 44;
@%p10 bra BB57_14;
ld.f32 %f55, [%rd6+4];
add.ftz.f32 %f95, %f95, %f55;
add.ftz.f32 %f56, %f94, %f55;
setp.gt.s32 %p11, %r8, 38;
selp.f32 %f94, %f56, %f94, %p11;
BB57_14:
cvt.u32.u64 %r29, %rd5;
add.s32 %r36, %r29, 1;
BB57_15:
setp.lt.u32 %p12, %r1, 4;
@%p12 bra BB57_26;
mul.lo.s32 %r30, %r36, 96;
cvt.s64.s32 %rd22, %r30;
add.s64 %rd32, %rd15, %rd22;
BB57_17:
add.s64 %rd9, %rd32, 412;
ld.u32 %r12, [%rd32+412];
setp.gt.s32 %p13, %r12, 44;
@%p13 bra BB57_19;
ld.f32 %f57, [%rd9+4];
add.ftz.f32 %f95, %f95, %f57;
add.ftz.f32 %f58, %f94, %f57;
setp.gt.s32 %p14, %r12, 38;
selp.f32 %f94, %f58, %f94, %p14;
BB57_19:
ld.u32 %r13, [%rd9+96];
setp.gt.s32 %p15, %r13, 44;
@%p15 bra BB57_21;
ld.f32 %f59, [%rd9+100];
add.ftz.f32 %f95, %f95, %f59;
add.ftz.f32 %f60, %f94, %f59;
setp.gt.s32 %p16, %r13, 38;
selp.f32 %f94, %f60, %f94, %p16;
BB57_21:
ld.u32 %r14, [%rd9+192];
setp.gt.s32 %p17, %r14, 44;
@%p17 bra BB57_23;
ld.f32 %f61, [%rd9+196];
add.ftz.f32 %f95, %f95, %f61;
add.ftz.f32 %f62, %f94, %f61;
setp.gt.s32 %p18, %r14, 38;
selp.f32 %f94, %f62, %f94, %p18;
BB57_23:
ld.u32 %r15, [%rd9+288];
setp.gt.s32 %p19, %r15, 44;
@%p19 bra BB57_25;
ld.f32 %f63, [%rd9+292];
add.ftz.f32 %f95, %f95, %f63;
add.ftz.f32 %f64, %f94, %f63;
setp.gt.s32 %p20, %r15, 38;
selp.f32 %f94, %f64, %f94, %p20;
BB57_25:
add.s64 %rd32, %rd32, 384;
add.s32 %r36, %r36, 4;
setp.lt.s32 %p21, %r36, %r1;
@%p21 bra BB57_17;
BB57_26:
ld.local.f32 %f66, [%rd1];
mul.ftz.f32 %f37, %f95, %f66;
mov.u32 %r32, 0;
mov.u32 %r39, %r32;
@%p2 bra BB57_34;
BB57_27:
mul.wide.s32 %rd23, %r39, 96;
add.s64 %rd24, %rd15, %rd23;
add.s64 %rd11, %rd24, 412;
ld.u32 %r18, [%rd24+412];
setp.gt.s32 %p23, %r18, 44;
@%p23 bra BB57_28;
ld.f32 %f39, [%rd11+4];
add.ftz.f32 %f107, %f38, %f39;
setp.lt.ftz.f32 %p24, %f37, %f107;
@%p24 bra BB57_32;
bra.uni BB57_30;
BB57_28:
mov.f32 %f107, %f38;
BB57_30:
add.s32 %r39, %r39, 1;
setp.lt.s32 %p25, %r39, %r1;
mov.f32 %f38, %f107;
@%p25 bra BB57_27;
mov.u32 %r39, %r32;
bra.uni BB57_34;
BB57_32:
setp.lt.s32 %p26, %r18, 39;
@%p26 bra BB57_36;
bra.uni BB57_33;
BB57_36:
sub.ftz.f32 %f76, %f95, %f94;
div.approx.ftz.f32 %f77, %f95, %f76;
ld.f32 %f78, [%rd16];
mul.ftz.f32 %f79, %f77, %f78;
ld.f32 %f80, [%rd16+4];
mul.ftz.f32 %f81, %f77, %f80;
ld.f32 %f82, [%rd16+8];
mul.ftz.f32 %f83, %f77, %f82;
st.f32 [%rd16], %f79;
st.f32 [%rd16+4], %f81;
st.f32 [%rd16+8], %f83;
mov.u64 %rd33, 0;
bra.uni BB57_37;
BB57_33:
div.approx.ftz.f32 %f67, %f95, %f94;
ld.f32 %f68, [%rd16];
mul.ftz.f32 %f69, %f67, %f68;
ld.f32 %f70, [%rd16+4];
mul.ftz.f32 %f71, %f67, %f70;
ld.f32 %f72, [%rd16+8];
mul.ftz.f32 %f73, %f67, %f72;
st.f32 [%rd16], %f69;
st.f32 [%rd16+4], %f71;
st.f32 [%rd16+8], %f73;
sub.ftz.f32 %f74, %f37, %f38;
div.approx.ftz.f32 %f75, %f74, %f39;
st.local.f32 [%rd1], %f75;
BB57_34:
mul.wide.s32 %rd26, %r39, 96;
add.s64 %rd27, %rd15, %rd26;
ld.u32 %r21, [%rd27+412];
mov.u64 %rd33, 0;
setp.lt.s32 %p27, %r21, 39;
@%p27 bra BB57_37;
setp.lt.s32 %p28, %r21, 45;
add.s64 %rd30, %rd27, 400;
selp.b64 %rd33, %rd30, 0, %p28;
BB57_37:
st.param.b64 [func_retval0+0], %rd33;
ret;
}
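To make the dump easier to follow, this is how I read the second half of the buggy PTX (labels as in the listing above):

- BB57_27 through BB57_30: the second for-loop; the index i lives in %r39 and partial_sum in %f38.
- BB57_32: the 'r < next_sum' hit; it branches on the closure type (BSDF if the type is < 39).
- BB57_36: the BSDF branch: *throughput *= sum / (sum - sum_bssrdf), return value set to 0 (NULL).
- BB57_33: the BSSRDF ('break') branch: *throughput *= sum / sum_bssrdf and *randu = (r - partial_sum) / sample_weight, which then falls through into
- BB57_34: the common tail, which re-loads closure[%r39].type and returns &sd->closure[%r39] only if it is a BSSRDF.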
The correctly functioning code, where 'break' is replaced with 'return sc', is translated as follows:
.func (.param .b64 func_retval0) _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf(
.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_0,
.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_1,
.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_2
)
{
.reg .pred %p<29>;
.reg .f32 %f<108>;
.reg .b32 %r<36>;
.reg .b64 %rd<34>;
ld.param.u64 %rd17, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_0];
ld.param.u64 %rd18, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_1];
ld.param.u64 %rd19, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_2];
cvta.to.local.u64 %rd1, %rd19;
add.s64 %rd2, %rd17, 328;
ld.u32 %r1, [%rd17+328];
setp.lt.s32 %p1, %r1, 2;
@%p1 bra BB57_31;
mov.f32 %f38, 0f00000000;
setp.lt.s32 %p2, %r1, 1;
mov.f32 %f94, %f38;
mov.f32 %f95, %f38;
@%p2 bra BB57_26;
and.b32 %r2, %r1, 3;
setp.eq.s32 %p3, %r2, 0;
mov.f32 %f94, 0f00000000;
mov.u32 %r33, 0;
mov.f32 %f95, %f94;
@%p3 bra BB57_15;
setp.eq.s32 %p4, %r2, 1;
mov.f32 %f94, 0f00000000;
mov.u32 %r32, 0;
mov.f32 %f95, %f94;
@%p4 bra BB57_12;
setp.eq.s32 %p5, %r2, 2;
mov.f32 %f94, 0f00000000;
mov.u32 %r31, 0;
@%p5 bra BB57_5;
bra.uni BB57_6;
BB57_5:
mov.f32 %f95, %f94;
bra.uni BB57_9;
BB57_6:
ld.u32 %r3, [%rd2+84];
mov.u32 %r31, 1;
setp.gt.s32 %p6, %r3, 44;
@%p6 bra BB57_7;
ld.f32 %f52, [%rd2+88];
add.ftz.f32 %f95, %f52, 0f00000000;
setp.gt.s32 %p7, %r3, 38;
selp.f32 %f94, %f95, 0f00000000, %p7;
bra.uni BB57_9;
BB57_7:
mov.f32 %f95, %f94;
BB57_9:
cvt.u64.u32 %rd3, %r31;
mul.wide.u32 %rd20, %r31, 96;
add.s64 %rd21, %rd17, %rd20;
add.s64 %rd4, %rd21, 412;
ld.u32 %r5, [%rd21+412];
setp.gt.s32 %p8, %r5, 44;
@%p8 bra BB57_11;
ld.f32 %f53, [%rd4+4];
add.ftz.f32 %f95, %f95, %f53;
add.ftz.f32 %f54, %f94, %f53;
setp.gt.s32 %p9, %r5, 38;
selp.f32 %f94, %f54, %f94, %p9;
BB57_11:
cvt.u32.u64 %r26, %rd3;
add.s32 %r32, %r26, 1;
BB57_12:
cvt.s64.s32 %rd5, %r32;
mul.wide.s32 %rd22, %r32, 96;
add.s64 %rd23, %rd17, %rd22;
add.s64 %rd6, %rd23, 412;
ld.u32 %r8, [%rd23+412];
setp.gt.s32 %p10, %r8, 44;
@%p10 bra BB57_14;
ld.f32 %f55, [%rd6+4];
add.ftz.f32 %f95, %f95, %f55;
add.ftz.f32 %f56, %f94, %f55;
setp.gt.s32 %p11, %r8, 38;
selp.f32 %f94, %f56, %f94, %p11;
BB57_14:
cvt.u32.u64 %r27, %rd5;
add.s32 %r33, %r27, 1;
BB57_15:
setp.lt.u32 %p12, %r1, 4;
@%p12 bra BB57_26;
mul.lo.s32 %r28, %r33, 96;
cvt.s64.s32 %rd24, %r28;
add.s64 %rd32, %rd17, %rd24;
BB57_17:
add.s64 %rd9, %rd32, 412;
ld.u32 %r12, [%rd32+412];
setp.gt.s32 %p13, %r12, 44;
@%p13 bra BB57_19;
ld.f32 %f57, [%rd9+4];
add.ftz.f32 %f95, %f95, %f57;
add.ftz.f32 %f58, %f94, %f57;
setp.gt.s32 %p14, %r12, 38;
selp.f32 %f94, %f58, %f94, %p14;
BB57_19:
ld.u32 %r13, [%rd9+96];
setp.gt.s32 %p15, %r13, 44;
@%p15 bra BB57_21;
ld.f32 %f59, [%rd9+100];
add.ftz.f32 %f95, %f95, %f59;
add.ftz.f32 %f60, %f94, %f59;
setp.gt.s32 %p16, %r13, 38;
selp.f32 %f94, %f60, %f94, %p16;
BB57_21:
ld.u32 %r14, [%rd9+192];
setp.gt.s32 %p17, %r14, 44;
@%p17 bra BB57_23;
ld.f32 %f61, [%rd9+196];
add.ftz.f32 %f95, %f95, %f61;
add.ftz.f32 %f62, %f94, %f61;
setp.gt.s32 %p18, %r14, 38;
selp.f32 %f94, %f62, %f94, %p18;
BB57_23:
ld.u32 %r15, [%rd9+288];
setp.gt.s32 %p19, %r15, 44;
@%p19 bra BB57_25;
ld.f32 %f63, [%rd9+292];
add.ftz.f32 %f95, %f95, %f63;
add.ftz.f32 %f64, %f94, %f63;
setp.gt.s32 %p20, %r15, 38;
selp.f32 %f94, %f64, %f94, %p20;
BB57_25:
add.s64 %rd32, %rd32, 384;
add.s32 %r33, %r33, 4;
setp.lt.s32 %p21, %r33, %r1;
@%p21 bra BB57_17;
BB57_26:
ld.local.f32 %f66, [%rd1];
mul.ftz.f32 %f37, %f95, %f66;
mov.u32 %r35, 0;
@%p2 bra BB57_31;
BB57_27:
cvt.s64.s32 %rd11, %r35;
mul.wide.s32 %rd25, %r35, 96;
add.s64 %rd26, %rd17, %rd25;
add.s64 %rd12, %rd26, 412;
ld.u32 %r18, [%rd26+412];
setp.gt.s32 %p23, %r18, 44;
@%p23 bra BB57_28;
ld.f32 %f39, [%rd12+4];
add.ftz.f32 %f107, %f38, %f39;
setp.lt.ftz.f32 %p24, %f37, %f107;
@%p24 bra BB57_33;
bra.uni BB57_30;
BB57_28:
mov.f32 %f107, %f38;
BB57_30:
cvt.u32.u64 %r30, %rd11;
add.s32 %r35, %r30, 1;
setp.lt.s32 %p25, %r35, %r1;
mov.f32 %f38, %f107;
@%p25 bra BB57_27;
BB57_31:
ld.u32 %r20, [%rd2+84];
mov.u64 %rd33, 0;
setp.lt.s32 %p26, %r20, 39;
@%p26 bra BB57_36;
add.s64 %rd28, %rd17, 400;
setp.lt.s32 %p27, %r20, 45;
selp.b64 %rd33, %rd28, 0, %p27;
BB57_36:
st.param.b64 [func_retval0+0], %rd33;
ret;
BB57_33:
add.s64 %rd33, %rd26, 400;
setp.lt.s32 %p28, %r18, 39;
@%p28 bra BB57_35;
bra.uni BB57_34;
BB57_35:
sub.ftz.f32 %f76, %f95, %f94;
div.approx.ftz.f32 %f77, %f95, %f76;
ld.f32 %f78, [%rd18];
mul.ftz.f32 %f79, %f77, %f78;
ld.f32 %f80, [%rd18+4];
mul.ftz.f32 %f81, %f77, %f80;
ld.f32 %f82, [%rd18+8];
mul.ftz.f32 %f83, %f77, %f82;
st.f32 [%rd18], %f79;
st.f32 [%rd18+4], %f81;
st.f32 [%rd18+8], %f83;
mov.u64 %rd33, 0;
bra.uni BB57_36;
BB57_34:
div.approx.ftz.f32 %f67, %f95, %f94;
ld.f32 %f68, [%rd18];
mul.ftz.f32 %f69, %f67, %f68;
ld.f32 %f70, [%rd18+4];
mul.ftz.f32 %f71, %f67, %f70;
ld.f32 %f72, [%rd18+8];
mul.ftz.f32 %f73, %f67, %f72;
st.f32 [%rd18], %f69;
st.f32 [%rd18+4], %f71;
st.f32 [%rd18+8], %f73;
sub.ftz.f32 %f74, %f37, %f38;
div.approx.ftz.f32 %f75, %f74, %f39;
st.local.f32 [%rd1], %f75;
bra.uni BB57_36;
}
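The interesting difference is at the 'break'/'return sc' point: in the working version, the sampled-closure pointer is computed right at the hit (BB57_33: add.s64 %rd33, %rd26, 400) and the BSSRDF path (BB57_34 there) jumps straight to the return with it, while in the buggy version the BSSRDF path falls through into the common tail (its BB57_34), which re-derives the pointer from %r39 and re-checks the type. As far as I can tell, the two listings are still semantically equivalent at the PTX level, so I suspect the miscompilation happens when ptxas turns the buggy control-flow shape into a cubin.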
I tested this with a GeForce GTX 980 Ti (CUDA compute capability 5.2) and a GTX 750 Ti (5.0) and got the same result on both.
The compiler options I used are as follows:
nvcc -keep -D NVCC -m 64 --use_fast_math -arch=sm_50 (or -arch=sm_52) --cubin --ptxas-options="-v" (…source and output info)
What makes this frustrating is that upgrading from CUDA 8.0 to 9.2 boosted rendering performance by more than 2.5x on scenes that show no glitches.
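In case it helps anyone reproduce this, below is a minimal host-side harness sketch showing how I check the return value. The ShaderData/ShaderClosure definitions and the float3 operator are simplified stand-ins (the real Cycles structs and math helpers are much larger), and the closure type IDs are the values implied by the PTX comparisons, so this may not trigger the exact same code generation:

#include <cstdio>
#include <cuda_runtime.h>

/* Simplified stand-ins -- the real Cycles structs have many more fields. */
struct ShaderClosure { int type; float sample_weight; };
struct ShaderData    { int num_closure; ShaderClosure closure[4]; };

/* Cycles defines its own float3 operators; a minimal stand-in: */
__device__ __inline__ void operator*=(float3 &a, float f)
{
    a.x *= f; a.y *= f; a.z *= f;
}

/* Type IDs implied by the PTX comparisons against 38 and 44 (assumption): */
enum {
    CLOSURE_BSDF_TRANSPARENT_ID              = 38,
    CLOSURE_BSSRDF_CUBIC_ID                  = 39,
    CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID = 44,
};

/* ... the three CLOSURE_IS_* macros and shader_bssrdf_pick() from above go here ... */

__global__ void test_pick(ShaderData *sd, float3 *throughput, float *randu, int *got_null)
{
    *got_null = (shader_bssrdf_pick(sd, throughput, randu) == NULL);
}

int main()
{
    ShaderData h_sd = {};
    h_sd.num_closure = 2;
    h_sd.closure[0].type = 38;  h_sd.closure[0].sample_weight = 0.5f; /* a BSDF   */
    h_sd.closure[1].type = 39;  h_sd.closure[1].sample_weight = 0.5f; /* a BSSRDF */
    float3 h_tp = make_float3(1.0f, 1.0f, 1.0f);
    float h_randu = 0.9f; /* r = 0.9*sum lands on the BSSRDF, so non-NULL is expected */

    ShaderData *d_sd; float3 *d_tp; float *d_ru; int *d_null;
    cudaMalloc(&d_sd, sizeof(h_sd));
    cudaMalloc(&d_tp, sizeof(h_tp));
    cudaMalloc(&d_ru, sizeof(h_randu));
    cudaMalloc(&d_null, sizeof(int));
    cudaMemcpy(d_sd, &h_sd, sizeof(h_sd), cudaMemcpyHostToDevice);
    cudaMemcpy(d_tp, &h_tp, sizeof(h_tp), cudaMemcpyHostToDevice);
    cudaMemcpy(d_ru, &h_randu, sizeof(h_randu), cudaMemcpyHostToDevice);

    test_pick<<<1, 1>>>(d_sd, d_tp, d_ru, d_null);

    int got_null = -1;
    cudaMemcpy(&got_null, d_null, sizeof(int), cudaMemcpyDeviceToHost);
    printf("shader_bssrdf_pick returned %s\n",
           got_null ? "NULL (the buggy behaviour)" : "the BSSRDF closure (expected)");
    return 0;
}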