CUDA 9.2 (9.2.148) Update1, nvcc compiler bug

I have been developing and testing rendering software for public users, which has run on the GPU (with CUDA 8.0) and on the CPU (on Windows x64/x32, Mac OSX 10.9) without any problems. Recently, I upgraded the CUDA version from 8.0 to 9.2(.148) and encountered a strange rendering glitch.

By tracing the source of the glitch, I found that the cubin nvcc generates for the following code does not work properly.

#define CLOSURE_IS_BSDF(type) ((type) <= CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSSRDF(type) ((type) >= CLOSURE_BSSRDF_CUBIC_ID && (type) <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (CLOSURE_IS_BSDF(type)||CLOSURE_IS_BSSRDF(type))


__device__ __inline__ const ShaderClosure *shader_bssrdf_pick(ShaderData *sd, float3 *throughput, float *randu)
{
    int sampled = 0;

    if(sd->num_closure > 1)
    {
        /* Pick a BSDF or BSSRDF based on sample weights. */
        float sum_bssrdf = 0.0f;
        float sum = 0.f;
        float sample_weight;

        for(int i = 0; i < sd->num_closure; i++)
        {
            const ShaderClosure *sc = &sd->closure[i];
            
            if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) 
            {
                sample_weight = sc->sample_weight;
                sum += sample_weight;
                if (CLOSURE_IS_BSSRDF(sc->type))
                    sum_bssrdf += sample_weight;
            }
        } 

        float r = (*randu)*sum;
        float partial_sum = 0.0f;

        for(int i = 0; i < sd->num_closure; i++)
        {
            const ShaderClosure *sc = &sd->closure[i];

            if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) 
            {
                sample_weight = sc->sample_weight;
                float next_sum = partial_sum + sample_weight;

                if(r < next_sum) 
                {
                    if(CLOSURE_IS_BSDF(sc->type)) 
                    {
                        *throughput *= sum / (sum - sum_bssrdf);
                        return NULL;
                    }
                    else // if (CLOSURE_IS_BSSRDF(sc->type))
                    {
                        *throughput *= sum / sum_bssrdf;
                        sampled = i;

                        /* Rescale to reuse for direction sample, to better
                         * preserve stratification. */
                        *randu = (r - partial_sum) / sample_weight;
                        break; // !!!! buggy !!!!

                        // After the 'break' out of this for-loop, the function is
                        // supposed to return &sd->closure[sampled] /* == sc */,
                        // but it is found to return NULL all the time.

                        // However, if 'break' is replaced with 'return sc', the function works correctly.
                    }
                }

                partial_sum = next_sum;
            }
        }
    } 

    const ShaderClosure *sc = &sd->closure[sampled];
    return CLOSURE_IS_BSSRDF(sc->type)? sc: NULL;
}
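
For clarity, the control-flow pattern that appears to miscompile can be distilled to a sketch like the one below; the names and types are hypothetical placeholders, not the Cycles code. The essential shape is a ‘break’ in the else-branch inside a loop, followed by a load through the index saved in ‘sampled’.

/* Hypothetical distilled sketch of the same control flow, with placeholder
 * names and types: a 'break' inside the else-branch of a loop, followed by
 * indexing with the counter saved in 'sampled'. */
__device__ __noinline__ const float *pick_with_break(const float *w, const int *type, int n, float r)
{
    int sampled = 0;
    float partial_sum = 0.0f;

    for(int i = 0; i < n; i++)
    {
        float next_sum = partial_sum + w[i];

        if(r < next_sum)
        {
            if(type[i] == 0)
            {
                return NULL;    /* analogous to the CLOSURE_IS_BSDF path */
            }
            else
            {
                sampled = i;
                break;          /* analogous to the reportedly buggy 'break' */
            }
        }

        partial_sum = next_sum;
    }

    /* After the 'break' this should return &w[sampled]; in the failing cubin
     * the original function above returns NULL instead. */
    return (type[sampled] != 0) ? &w[sampled] : NULL;
}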

The PTX that nvcc generates for the shader_bssrdf_pick function above is as follows:

.func  (.param .b64 func_retval0) _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf(
	.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_0,
	.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_1,
	.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_2
)
{
	.reg .pred 	%p<29>;
	.reg .f32 	%f<108>;
	.reg .b32 	%r<40>;
	.reg .b64 	%rd<34>;


	ld.param.u64 	%rd15, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_0];
	ld.param.u64 	%rd16, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_1];
	ld.param.u64 	%rd17, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_2];
	cvta.to.local.u64 	%rd1, %rd17;
	add.s64 	%rd2, %rd15, 328;
	ld.u32 	%r1, [%rd15+328];
	mov.u32 	%r39, 0;
	setp.lt.s32	%p1, %r1, 2;
	@%p1 bra 	BB57_34;

	mov.f32 	%f38, 0f00000000;
	setp.lt.s32	%p2, %r1, 1;
	mov.f32 	%f94, %f38;
	mov.f32 	%f95, %f38;
	@%p2 bra 	BB57_26;

	and.b32  	%r2, %r1, 3;
	setp.eq.s32	%p3, %r2, 0;
	mov.f32 	%f94, 0f00000000;
	mov.u32 	%r36, 0;
	mov.f32 	%f95, %f94;
	@%p3 bra 	BB57_15;

	setp.eq.s32	%p4, %r2, 1;
	mov.f32 	%f94, 0f00000000;
	mov.u32 	%r35, 0;
	mov.f32 	%f95, %f94;
	@%p4 bra 	BB57_12;

	setp.eq.s32	%p5, %r2, 2;
	mov.f32 	%f94, 0f00000000;
	mov.u32 	%r34, 0;
	@%p5 bra 	BB57_5;
	bra.uni 	BB57_6;

BB57_5:
	mov.f32 	%f95, %f94;
	bra.uni 	BB57_9;

BB57_6:
	ld.u32 	%r3, [%rd2+84];
	mov.u32 	%r34, 1;
	setp.gt.s32	%p6, %r3, 44;
	@%p6 bra 	BB57_7;

	ld.f32 	%f52, [%rd2+88];
	add.ftz.f32 	%f95, %f52, 0f00000000;
	setp.gt.s32	%p7, %r3, 38;
	selp.f32	%f94, %f95, 0f00000000, %p7;
	bra.uni 	BB57_9;

BB57_7:
	mov.f32 	%f95, %f94;

BB57_9:
	cvt.u64.u32	%rd3, %r34;
	mul.wide.u32 	%rd18, %r34, 96;
	add.s64 	%rd19, %rd15, %rd18;
	add.s64 	%rd4, %rd19, 412;
	ld.u32 	%r5, [%rd19+412];
	setp.gt.s32	%p8, %r5, 44;
	@%p8 bra 	BB57_11;

	ld.f32 	%f53, [%rd4+4];
	add.ftz.f32 	%f95, %f95, %f53;
	add.ftz.f32 	%f54, %f94, %f53;
	setp.gt.s32	%p9, %r5, 38;
	selp.f32	%f94, %f54, %f94, %p9;

BB57_11:
	cvt.u32.u64	%r28, %rd3;
	add.s32 	%r35, %r28, 1;

BB57_12:
	cvt.s64.s32	%rd5, %r35;
	mul.wide.s32 	%rd20, %r35, 96;
	add.s64 	%rd21, %rd15, %rd20;
	add.s64 	%rd6, %rd21, 412;
	ld.u32 	%r8, [%rd21+412];
	setp.gt.s32	%p10, %r8, 44;
	@%p10 bra 	BB57_14;

	ld.f32 	%f55, [%rd6+4];
	add.ftz.f32 	%f95, %f95, %f55;
	add.ftz.f32 	%f56, %f94, %f55;
	setp.gt.s32	%p11, %r8, 38;
	selp.f32	%f94, %f56, %f94, %p11;

BB57_14:
	cvt.u32.u64	%r29, %rd5;
	add.s32 	%r36, %r29, 1;

BB57_15:
	setp.lt.u32	%p12, %r1, 4;
	@%p12 bra 	BB57_26;

	mul.lo.s32 	%r30, %r36, 96;
	cvt.s64.s32	%rd22, %r30;
	add.s64 	%rd32, %rd15, %rd22;

BB57_17:
	add.s64 	%rd9, %rd32, 412;
	ld.u32 	%r12, [%rd32+412];
	setp.gt.s32	%p13, %r12, 44;
	@%p13 bra 	BB57_19;

	ld.f32 	%f57, [%rd9+4];
	add.ftz.f32 	%f95, %f95, %f57;
	add.ftz.f32 	%f58, %f94, %f57;
	setp.gt.s32	%p14, %r12, 38;
	selp.f32	%f94, %f58, %f94, %p14;

BB57_19:
	ld.u32 	%r13, [%rd9+96];
	setp.gt.s32	%p15, %r13, 44;
	@%p15 bra 	BB57_21;

	ld.f32 	%f59, [%rd9+100];
	add.ftz.f32 	%f95, %f95, %f59;
	add.ftz.f32 	%f60, %f94, %f59;
	setp.gt.s32	%p16, %r13, 38;
	selp.f32	%f94, %f60, %f94, %p16;

BB57_21:
	ld.u32 	%r14, [%rd9+192];
	setp.gt.s32	%p17, %r14, 44;
	@%p17 bra 	BB57_23;

	ld.f32 	%f61, [%rd9+196];
	add.ftz.f32 	%f95, %f95, %f61;
	add.ftz.f32 	%f62, %f94, %f61;
	setp.gt.s32	%p18, %r14, 38;
	selp.f32	%f94, %f62, %f94, %p18;

BB57_23:
	ld.u32 	%r15, [%rd9+288];
	setp.gt.s32	%p19, %r15, 44;
	@%p19 bra 	BB57_25;

	ld.f32 	%f63, [%rd9+292];
	add.ftz.f32 	%f95, %f95, %f63;
	add.ftz.f32 	%f64, %f94, %f63;
	setp.gt.s32	%p20, %r15, 38;
	selp.f32	%f94, %f64, %f94, %p20;

BB57_25:
	add.s64 	%rd32, %rd32, 384;
	add.s32 	%r36, %r36, 4;
	setp.lt.s32	%p21, %r36, %r1;
	@%p21 bra 	BB57_17;

BB57_26:
	ld.local.f32 	%f66, [%rd1];
	mul.ftz.f32 	%f37, %f95, %f66;
	mov.u32 	%r32, 0;
	mov.u32 	%r39, %r32;
	@%p2 bra 	BB57_34;

BB57_27:
	mul.wide.s32 	%rd23, %r39, 96;
	add.s64 	%rd24, %rd15, %rd23;
	add.s64 	%rd11, %rd24, 412;
	ld.u32 	%r18, [%rd24+412];
	setp.gt.s32	%p23, %r18, 44;
	@%p23 bra 	BB57_28;

	ld.f32 	%f39, [%rd11+4];
	add.ftz.f32 	%f107, %f38, %f39;
	setp.lt.ftz.f32	%p24, %f37, %f107;
	@%p24 bra 	BB57_32;
	bra.uni 	BB57_30;

BB57_28:
	mov.f32 	%f107, %f38;

BB57_30:
	add.s32 	%r39, %r39, 1;
	setp.lt.s32	%p25, %r39, %r1;
	mov.f32 	%f38, %f107;
	@%p25 bra 	BB57_27;

	mov.u32 	%r39, %r32;
	bra.uni 	BB57_34;

BB57_32:
	setp.lt.s32	%p26, %r18, 39;
	@%p26 bra 	BB57_36;
	bra.uni 	BB57_33;

BB57_36:
	sub.ftz.f32 	%f76, %f95, %f94;
	div.approx.ftz.f32 	%f77, %f95, %f76;
	ld.f32 	%f78, [%rd16];
	mul.ftz.f32 	%f79, %f77, %f78;
	ld.f32 	%f80, [%rd16+4];
	mul.ftz.f32 	%f81, %f77, %f80;
	ld.f32 	%f82, [%rd16+8];
	mul.ftz.f32 	%f83, %f77, %f82;
	st.f32 	[%rd16], %f79;
	st.f32 	[%rd16+4], %f81;
	st.f32 	[%rd16+8], %f83;
	mov.u64 	%rd33, 0;
	bra.uni 	BB57_37;

BB57_33:
	div.approx.ftz.f32 	%f67, %f95, %f94;
	ld.f32 	%f68, [%rd16];
	mul.ftz.f32 	%f69, %f67, %f68;
	ld.f32 	%f70, [%rd16+4];
	mul.ftz.f32 	%f71, %f67, %f70;
	ld.f32 	%f72, [%rd16+8];
	mul.ftz.f32 	%f73, %f67, %f72;
	st.f32 	[%rd16], %f69;
	st.f32 	[%rd16+4], %f71;
	st.f32 	[%rd16+8], %f73;
	sub.ftz.f32 	%f74, %f37, %f38;
	div.approx.ftz.f32 	%f75, %f74, %f39;
	st.local.f32 	[%rd1], %f75;

BB57_34:
	mul.wide.s32 	%rd26, %r39, 96;
	add.s64 	%rd27, %rd15, %rd26;
	ld.u32 	%r21, [%rd27+412];
	mov.u64 	%rd33, 0;
	setp.lt.s32	%p27, %r21, 39;
	@%p27 bra 	BB57_37;

	setp.lt.s32	%p28, %r21, 45;
	add.s64 	%rd30, %rd27, 400;
	selp.b64	%rd33, %rd30, 0, %p28;

BB57_37:
	st.param.b64	[func_retval0+0], %rd33;
	ret;
}

The correctly functioning code, where ‘break’ is replaced with ‘return sc’, is translated as follows:

.func  (.param .b64 func_retval0) _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf(
	.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_0,
	.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_1,
	.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_2
)
{
	.reg .pred 	%p<29>;
	.reg .f32 	%f<108>;
	.reg .b32 	%r<36>;
	.reg .b64 	%rd<34>;


	ld.param.u64 	%rd17, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_0];
	ld.param.u64 	%rd18, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_1];
	ld.param.u64 	%rd19, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_2];
	cvta.to.local.u64 	%rd1, %rd19;
	add.s64 	%rd2, %rd17, 328;
	ld.u32 	%r1, [%rd17+328];
	setp.lt.s32	%p1, %r1, 2;
	@%p1 bra 	BB57_31;

	mov.f32 	%f38, 0f00000000;
	setp.lt.s32	%p2, %r1, 1;
	mov.f32 	%f94, %f38;
	mov.f32 	%f95, %f38;
	@%p2 bra 	BB57_26;

	and.b32  	%r2, %r1, 3;
	setp.eq.s32	%p3, %r2, 0;
	mov.f32 	%f94, 0f00000000;
	mov.u32 	%r33, 0;
	mov.f32 	%f95, %f94;
	@%p3 bra 	BB57_15;

	setp.eq.s32	%p4, %r2, 1;
	mov.f32 	%f94, 0f00000000;
	mov.u32 	%r32, 0;
	mov.f32 	%f95, %f94;
	@%p4 bra 	BB57_12;

	setp.eq.s32	%p5, %r2, 2;
	mov.f32 	%f94, 0f00000000;
	mov.u32 	%r31, 0;
	@%p5 bra 	BB57_5;
	bra.uni 	BB57_6;

BB57_5:
	mov.f32 	%f95, %f94;
	bra.uni 	BB57_9;

BB57_6:
	ld.u32 	%r3, [%rd2+84];
	mov.u32 	%r31, 1;
	setp.gt.s32	%p6, %r3, 44;
	@%p6 bra 	BB57_7;

	ld.f32 	%f52, [%rd2+88];
	add.ftz.f32 	%f95, %f52, 0f00000000;
	setp.gt.s32	%p7, %r3, 38;
	selp.f32	%f94, %f95, 0f00000000, %p7;
	bra.uni 	BB57_9;

BB57_7:
	mov.f32 	%f95, %f94;

BB57_9:
	cvt.u64.u32	%rd3, %r31;
	mul.wide.u32 	%rd20, %r31, 96;
	add.s64 	%rd21, %rd17, %rd20;
	add.s64 	%rd4, %rd21, 412;
	ld.u32 	%r5, [%rd21+412];
	setp.gt.s32	%p8, %r5, 44;
	@%p8 bra 	BB57_11;

	ld.f32 	%f53, [%rd4+4];
	add.ftz.f32 	%f95, %f95, %f53;
	add.ftz.f32 	%f54, %f94, %f53;
	setp.gt.s32	%p9, %r5, 38;
	selp.f32	%f94, %f54, %f94, %p9;

BB57_11:
	cvt.u32.u64	%r26, %rd3;
	add.s32 	%r32, %r26, 1;

BB57_12:
	cvt.s64.s32	%rd5, %r32;
	mul.wide.s32 	%rd22, %r32, 96;
	add.s64 	%rd23, %rd17, %rd22;
	add.s64 	%rd6, %rd23, 412;
	ld.u32 	%r8, [%rd23+412];
	setp.gt.s32	%p10, %r8, 44;
	@%p10 bra 	BB57_14;

	ld.f32 	%f55, [%rd6+4];
	add.ftz.f32 	%f95, %f95, %f55;
	add.ftz.f32 	%f56, %f94, %f55;
	setp.gt.s32	%p11, %r8, 38;
	selp.f32	%f94, %f56, %f94, %p11;

BB57_14:
	cvt.u32.u64	%r27, %rd5;
	add.s32 	%r33, %r27, 1;

BB57_15:
	setp.lt.u32	%p12, %r1, 4;
	@%p12 bra 	BB57_26;

	mul.lo.s32 	%r28, %r33, 96;
	cvt.s64.s32	%rd24, %r28;
	add.s64 	%rd32, %rd17, %rd24;

BB57_17:
	add.s64 	%rd9, %rd32, 412;
	ld.u32 	%r12, [%rd32+412];
	setp.gt.s32	%p13, %r12, 44;
	@%p13 bra 	BB57_19;

	ld.f32 	%f57, [%rd9+4];
	add.ftz.f32 	%f95, %f95, %f57;
	add.ftz.f32 	%f58, %f94, %f57;
	setp.gt.s32	%p14, %r12, 38;
	selp.f32	%f94, %f58, %f94, %p14;

BB57_19:
	ld.u32 	%r13, [%rd9+96];
	setp.gt.s32	%p15, %r13, 44;
	@%p15 bra 	BB57_21;

	ld.f32 	%f59, [%rd9+100];
	add.ftz.f32 	%f95, %f95, %f59;
	add.ftz.f32 	%f60, %f94, %f59;
	setp.gt.s32	%p16, %r13, 38;
	selp.f32	%f94, %f60, %f94, %p16;

BB57_21:
	ld.u32 	%r14, [%rd9+192];
	setp.gt.s32	%p17, %r14, 44;
	@%p17 bra 	BB57_23;

	ld.f32 	%f61, [%rd9+196];
	add.ftz.f32 	%f95, %f95, %f61;
	add.ftz.f32 	%f62, %f94, %f61;
	setp.gt.s32	%p18, %r14, 38;
	selp.f32	%f94, %f62, %f94, %p18;

BB57_23:
	ld.u32 	%r15, [%rd9+288];
	setp.gt.s32	%p19, %r15, 44;
	@%p19 bra 	BB57_25;

	ld.f32 	%f63, [%rd9+292];
	add.ftz.f32 	%f95, %f95, %f63;
	add.ftz.f32 	%f64, %f94, %f63;
	setp.gt.s32	%p20, %r15, 38;
	selp.f32	%f94, %f64, %f94, %p20;

BB57_25:
	add.s64 	%rd32, %rd32, 384;
	add.s32 	%r33, %r33, 4;
	setp.lt.s32	%p21, %r33, %r1;
	@%p21 bra 	BB57_17;

BB57_26:
	ld.local.f32 	%f66, [%rd1];
	mul.ftz.f32 	%f37, %f95, %f66;
	mov.u32 	%r35, 0;
	@%p2 bra 	BB57_31;

BB57_27:
	cvt.s64.s32	%rd11, %r35;
	mul.wide.s32 	%rd25, %r35, 96;
	add.s64 	%rd26, %rd17, %rd25;
	add.s64 	%rd12, %rd26, 412;
	ld.u32 	%r18, [%rd26+412];
	setp.gt.s32	%p23, %r18, 44;
	@%p23 bra 	BB57_28;

	ld.f32 	%f39, [%rd12+4];
	add.ftz.f32 	%f107, %f38, %f39;
	setp.lt.ftz.f32	%p24, %f37, %f107;
	@%p24 bra 	BB57_33;
	bra.uni 	BB57_30;

BB57_28:
	mov.f32 	%f107, %f38;

BB57_30:
	cvt.u32.u64	%r30, %rd11;
	add.s32 	%r35, %r30, 1;
	setp.lt.s32	%p25, %r35, %r1;
	mov.f32 	%f38, %f107;
	@%p25 bra 	BB57_27;

BB57_31:
	ld.u32 	%r20, [%rd2+84];
	mov.u64 	%rd33, 0;
	setp.lt.s32	%p26, %r20, 39;
	@%p26 bra 	BB57_36;

	add.s64 	%rd28, %rd17, 400;
	setp.lt.s32	%p27, %r20, 45;
	selp.b64	%rd33, %rd28, 0, %p27;

BB57_36:
	st.param.b64	[func_retval0+0], %rd33;
	ret;

BB57_33:
	add.s64 	%rd33, %rd26, 400;
	setp.lt.s32	%p28, %r18, 39;
	@%p28 bra 	BB57_35;
	bra.uni 	BB57_34;

BB57_35:
	sub.ftz.f32 	%f76, %f95, %f94;
	div.approx.ftz.f32 	%f77, %f95, %f76;
	ld.f32 	%f78, [%rd18];
	mul.ftz.f32 	%f79, %f77, %f78;
	ld.f32 	%f80, [%rd18+4];
	mul.ftz.f32 	%f81, %f77, %f80;
	ld.f32 	%f82, [%rd18+8];
	mul.ftz.f32 	%f83, %f77, %f82;
	st.f32 	[%rd18], %f79;
	st.f32 	[%rd18+4], %f81;
	st.f32 	[%rd18+8], %f83;
	mov.u64 	%rd33, 0;
	bra.uni 	BB57_36;

BB57_34:
	div.approx.ftz.f32 	%f67, %f95, %f94;
	ld.f32 	%f68, [%rd18];
	mul.ftz.f32 	%f69, %f67, %f68;
	ld.f32 	%f70, [%rd18+4];
	mul.ftz.f32 	%f71, %f67, %f70;
	ld.f32 	%f72, [%rd18+8];
	mul.ftz.f32 	%f73, %f67, %f72;
	st.f32 	[%rd18], %f69;
	st.f32 	[%rd18+4], %f71;
	st.f32 	[%rd18+8], %f73;
	sub.ftz.f32 	%f74, %f37, %f38;
	div.approx.ftz.f32 	%f75, %f74, %f39;
	st.local.f32 	[%rd1], %f75;
	bra.uni 	BB57_36;
}

I tested this with GeForce GTX 980 Ti (CUDA compute capability 5.2) and GTX 750 Ti (5.0) and got the same result.

The compiler options I used are as follows:

nvcc -keep -D NVCC -m 64 --use_fast_math -arch=sm_50 (or sm_52) --cubin --ptxas-options="-v" (…source and output info)

The situation is frustrating because upgrading from CUDA 8.0 to 9.2 boosted rendering performance by more than 2.5x on scenes that show no glitches.

My suggestion would be to file a bug at developer.nvidia.com.

Provide an exact definition and description of how to obtain both versions of the PTX.

Thanks.

Please keep this post here, because I referred to it in my bug report to developer.nvidia.com.

Unfortunately, I was not able to file a bug at developer.nvidia.com.

When I sent a message via https://developer.nvidia.com/contact, a ‘Forbidden …’ error message was shown, and no copy of my message was sent to me even though I checked that option.

When I tried to ‘report a bug’ from my account, I encountered ‘An AJAX HTTP error occurred. HTTP Result Code: 403 … 403 - Forbidden’.

Anyway, all constants and structures used in the code can be found here:
https://git.blender.org/gitweb/gitweb.cgi/blender.git/tree/refs/heads/master:/intern/cycles/kernel

I thought that they were not necessary because they would only affect the constants and offsets in the resulting PTX code.
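
That said, stand-in declarations that are merely consistent with what is visible in the PTX above (closure type <= 44 for ‘BSDF or BSSRDF’, 39..44 for ‘BSSRDF’, a 96-byte closure stride, num_closure read at byte offset 328) could look roughly like the sketch below. These are hypothetical placeholders so the snippet compiles in isolation, not the Cycles definitions from the repository above.

/* Hypothetical stand-in declarations, only so that shader_bssrdf_pick can be
 * compiled in isolation; these are NOT the Blender/Cycles definitions.
 * The enum values mirror the comparisons in the PTX above:
 *   CLOSURE_IS_BSDF_OR_BSSRDF  ->  type <= 44
 *   CLOSURE_IS_BSSRDF          ->  39 <= type <= 44 */
enum ClosureType {
    CLOSURE_BSDF_TRANSPARENT_ID = 38,               /* assumed last BSDF id */
    CLOSURE_BSSRDF_CUBIC_ID = 39,                   /* assumed first BSSRDF id */
    CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID = 44   /* assumed last BSSRDF id */
};

typedef struct ShaderClosure {
    int type;
    float sample_weight;
    /* remaining members omitted; the real struct has a 96-byte stride */
} ShaderClosure;

typedef struct ShaderData {
    /* leading members omitted; in the real layout num_closure sits at offset 328 */
    int num_closure;
    ShaderClosure closure[64];
} ShaderData;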

File a simple bug report that has almost nothing in it. That should get you past the “Forbidden” error.

You can then update it later, or provide the system-provided bug number and I will add a link to this forum entry.

Thanks.

However, no matter how simple I make the bug report, it remains blocked by the ‘Forbidden’ error.

I’ve just succeeded in filing an ‘empty’ bug report.

Bug ID: 2336125
Area: Other CUDA Tools

Would you add a link to this forum entry to the report?

I’ve updated the report.

If you want an expedited response, I suggest developing a complete test case. It should be a complete, standalone application; it should not be your whole code.

You should identify the expected and actual output. Show the complete command necessary to compile.

Done.

The complete test case is here:

I inserted test code into an existing CUDA sample.

  1. Unzip the file and copy the generated folder CUDABreakTest into ‘C:/ProgramData/NVIDIA Corporation/CUDA Samples/v9.2/0_Simple’.
  2. Open ‘simpleAssert_vs2013.sln’ with Visual Studio 2013.
  3. Change ‘Solution configuration’ and ‘Platform’ to ‘Release’ and ‘x64’, respectively.
  4. Go to ‘Properties->Configuration Properties->CUDA C/C++->Device’
    and update ‘Code Generation’ to reflect your NVIDIA card.
  5. Build and run with Visual Studio 2013 & CUDA SDK 9.2.148.

Expected output:
Result of A : 1 // the ‘err’ function returns non-null pointer.
Result of B : 1 // the ‘ok’ function returns non-null pointer.

Actual output:
Result of A : 0 // the ‘err’ function returns NULL.
Result of B : 1 // the ‘ok’ function returns non-null pointer.
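
(The attached sample itself is not reproduced here. The kernel-side check is presumably along the lines of the sketch below, where shader_bssrdf_pick_err and shader_bssrdf_pick_ok stand for the ‘break’ and ‘return sc’ variants of the function from the first post, and the inputs are assumed to be prepared so that the BSSRDF branch is taken.)

/* Sketch only: 'shader_bssrdf_pick_err' (the 'break' variant) and
 * 'shader_bssrdf_pick_ok' (the 'return sc' variant) are assumed to be copies
 * of the function from the first post, and sd/throughput/randu are assumed
 * to be set up so that the BSSRDF branch is selected. */
__global__ void break_test_kernel(ShaderData *sd, float3 *throughput, float *randu, int *result)
{
    float3 tp_a = *throughput, tp_b = *throughput;
    float  ru_a = *randu,      ru_b = *randu;

    result[0] = (shader_bssrdf_pick_err(sd, &tp_a, &ru_a) != NULL);  /* Result of A */
    result[1] = (shader_bssrdf_pick_ok (sd, &tp_b, &ru_b) != NULL);  /* Result of B */
}

The host side copies result[] back and prints the two flags as ‘Result of A’ and ‘Result of B’; with the buggy cubin the first comes back 0 instead of 1.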

I tried to add the complete test case above to my bug report as a comment, but got the usual ‘An AJAX HTTP error occurred. HTTP Result Code: 403 … 403 - Forbidden’ error.

Would you append the test case to my bug report instead, please?

I’ve updated it, including the change of platform from Linux to Win7.