I have been developing and testing rendering software for public users, which has run on the GPU (with CUDA 8.0) and on the CPU (Windows x64/x86, Mac OS X 10.9) without any problems. Recently I upgraded CUDA from 8.0 to 9.2(.148) and encountered a strange rendering glitch.
By tracing the glitch to its source, I found that the cubin nvcc generates for the following code does not work properly.
#define CLOSURE_IS_BSDF(type) ((type) <= CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSSRDF(type) ((type) >= CLOSURE_BSSRDF_CUBIC_ID && (type) <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (CLOSURE_IS_BSDF(type)||CLOSURE_IS_BSSRDF(type))
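/* For reference when reading the PTX below: the comparisons against 38 and 44
 * imply CLOSURE_BSDF_TRANSPARENT_ID == 38, CLOSURE_BSSRDF_CUBIC_ID == 39 and
 * CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID == 44, i.e.
 *   CLOSURE_IS_BSDF(type)            <=>  type <= 38
 *   CLOSURE_IS_BSSRDF(type)          <=>  39 <= type <= 44
 *   CLOSURE_IS_BSDF_OR_BSSRDF(type)  <=>  type <= 44 */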
__device__ __inline__ const ShaderClosure *shader_bssrdf_pick(ShaderData *sd, float3 *throughput, float *randu)
{
    int sampled = 0;
    if(sd->num_closure > 1)
    {
        /* Pick a BSDF or BSSRDF based on sample weights. */
        float sum_bssrdf = 0.0f;
        float sum = 0.0f;
        float sample_weight;
        for(int i = 0; i < sd->num_closure; i++)
        {
            const ShaderClosure *sc = &sd->closure[i];
            if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
            {
                sample_weight = sc->sample_weight;
                sum += sample_weight;
                if(CLOSURE_IS_BSSRDF(sc->type))
                    sum_bssrdf += sample_weight;
            }
        }
        float r = (*randu)*sum;
        float partial_sum = 0.0f;
        for(int i = 0; i < sd->num_closure; i++)
        {
            const ShaderClosure *sc = &sd->closure[i];
            if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
            {
                sample_weight = sc->sample_weight;
                float next_sum = partial_sum + sample_weight;
                if(r < next_sum)
                {
                    if(CLOSURE_IS_BSDF(sc->type))
                    {
                        *throughput *= sum / (sum - sum_bssrdf);
                        return NULL;
                    }
                    else // if (CLOSURE_IS_BSSRDF(sc->type))
                    {
                        *throughput *= sum / sum_bssrdf;
                        sampled = i;
                        /* Rescale to reuse for direction sample, to better
                         * preserve stratification. */
                        *randu = (r - partial_sum) / sample_weight;
                        break; // !!!! buggy !!!!
                        // After the 'break' out of this for-loop, the function is
                        // supposed to return &sd->closure[sampled] (== sc),
                        // but it turns out to return NULL all the time.
                        // However, if the 'break' is replaced with 'return sc',
                        // the function works correctly.
                    }
                }
                partial_sum = next_sum;
            }
        }
    }
    const ShaderClosure *sc = &sd->closure[sampled];
    return CLOSURE_IS_BSSRDF(sc->type) ? sc : NULL;
}
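For clarity, the working variant mentioned in the comment changes only the tail of that BSSRDF branch; everything else is identical:

    else // if (CLOSURE_IS_BSSRDF(sc->type))
    {
        *throughput *= sum / sum_bssrdf;
        sampled = i;
        /* Rescale to reuse for direction sample, to better
         * preserve stratification. */
        *randu = (r - partial_sum) / sample_weight;
        return sc; // instead of 'break' -- this compiles to a correct cubin
    }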
The PTX that nvcc generates for the above code is as follows:
.func (.param .b64 func_retval0) _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf(
.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_0,
.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_1,
.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_2
)
{
.reg .pred %p<29>;
.reg .f32 %f<108>;
.reg .b32 %r<40>;
.reg .b64 %rd<34>;
ld.param.u64 %rd15, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_0];
ld.param.u64 %rd16, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_1];
ld.param.u64 %rd17, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_2];
cvta.to.local.u64 %rd1, %rd17;
add.s64 %rd2, %rd15, 328;
ld.u32 %r1, [%rd15+328];
mov.u32 %r39, 0;
setp.lt.s32 %p1, %r1, 2;
@%p1 bra BB57_34;
mov.f32 %f38, 0f00000000;
setp.lt.s32 %p2, %r1, 1;
mov.f32 %f94, %f38;
mov.f32 %f95, %f38;
@%p2 bra BB57_26;
and.b32 %r2, %r1, 3;
setp.eq.s32 %p3, %r2, 0;
mov.f32 %f94, 0f00000000;
mov.u32 %r36, 0;
mov.f32 %f95, %f94;
@%p3 bra BB57_15;
setp.eq.s32 %p4, %r2, 1;
mov.f32 %f94, 0f00000000;
mov.u32 %r35, 0;
mov.f32 %f95, %f94;
@%p4 bra BB57_12;
setp.eq.s32 %p5, %r2, 2;
mov.f32 %f94, 0f00000000;
mov.u32 %r34, 0;
@%p5 bra BB57_5;
bra.uni BB57_6;
BB57_5:
mov.f32 %f95, %f94;
bra.uni BB57_9;
BB57_6:
ld.u32 %r3, [%rd2+84];
mov.u32 %r34, 1;
setp.gt.s32 %p6, %r3, 44;
@%p6 bra BB57_7;
ld.f32 %f52, [%rd2+88];
add.ftz.f32 %f95, %f52, 0f00000000;
setp.gt.s32 %p7, %r3, 38;
selp.f32 %f94, %f95, 0f00000000, %p7;
bra.uni BB57_9;
BB57_7:
mov.f32 %f95, %f94;
BB57_9:
cvt.u64.u32 %rd3, %r34;
mul.wide.u32 %rd18, %r34, 96;
add.s64 %rd19, %rd15, %rd18;
add.s64 %rd4, %rd19, 412;
ld.u32 %r5, [%rd19+412];
setp.gt.s32 %p8, %r5, 44;
@%p8 bra BB57_11;
ld.f32 %f53, [%rd4+4];
add.ftz.f32 %f95, %f95, %f53;
add.ftz.f32 %f54, %f94, %f53;
setp.gt.s32 %p9, %r5, 38;
selp.f32 %f94, %f54, %f94, %p9;
BB57_11:
cvt.u32.u64 %r28, %rd3;
add.s32 %r35, %r28, 1;
BB57_12:
cvt.s64.s32 %rd5, %r35;
mul.wide.s32 %rd20, %r35, 96;
add.s64 %rd21, %rd15, %rd20;
add.s64 %rd6, %rd21, 412;
ld.u32 %r8, [%rd21+412];
setp.gt.s32 %p10, %r8, 44;
@%p10 bra BB57_14;
ld.f32 %f55, [%rd6+4];
add.ftz.f32 %f95, %f95, %f55;
add.ftz.f32 %f56, %f94, %f55;
setp.gt.s32 %p11, %r8, 38;
selp.f32 %f94, %f56, %f94, %p11;
BB57_14:
cvt.u32.u64 %r29, %rd5;
add.s32 %r36, %r29, 1;
BB57_15:
setp.lt.u32 %p12, %r1, 4;
@%p12 bra BB57_26;
mul.lo.s32 %r30, %r36, 96;
cvt.s64.s32 %rd22, %r30;
add.s64 %rd32, %rd15, %rd22;
BB57_17:
add.s64 %rd9, %rd32, 412;
ld.u32 %r12, [%rd32+412];
setp.gt.s32 %p13, %r12, 44;
@%p13 bra BB57_19;
ld.f32 %f57, [%rd9+4];
add.ftz.f32 %f95, %f95, %f57;
add.ftz.f32 %f58, %f94, %f57;
setp.gt.s32 %p14, %r12, 38;
selp.f32 %f94, %f58, %f94, %p14;
BB57_19:
ld.u32 %r13, [%rd9+96];
setp.gt.s32 %p15, %r13, 44;
@%p15 bra BB57_21;
ld.f32 %f59, [%rd9+100];
add.ftz.f32 %f95, %f95, %f59;
add.ftz.f32 %f60, %f94, %f59;
setp.gt.s32 %p16, %r13, 38;
selp.f32 %f94, %f60, %f94, %p16;
BB57_21:
ld.u32 %r14, [%rd9+192];
setp.gt.s32 %p17, %r14, 44;
@%p17 bra BB57_23;
ld.f32 %f61, [%rd9+196];
add.ftz.f32 %f95, %f95, %f61;
add.ftz.f32 %f62, %f94, %f61;
setp.gt.s32 %p18, %r14, 38;
selp.f32 %f94, %f62, %f94, %p18;
BB57_23:
ld.u32 %r15, [%rd9+288];
setp.gt.s32 %p19, %r15, 44;
@%p19 bra BB57_25;
ld.f32 %f63, [%rd9+292];
add.ftz.f32 %f95, %f95, %f63;
add.ftz.f32 %f64, %f94, %f63;
setp.gt.s32 %p20, %r15, 38;
selp.f32 %f94, %f64, %f94, %p20;
BB57_25:
add.s64 %rd32, %rd32, 384;
add.s32 %r36, %r36, 4;
setp.lt.s32 %p21, %r36, %r1;
@%p21 bra BB57_17;
BB57_26:
ld.local.f32 %f66, [%rd1];
mul.ftz.f32 %f37, %f95, %f66;
mov.u32 %r32, 0;
mov.u32 %r39, %r32;
@%p2 bra BB57_34;
BB57_27:
mul.wide.s32 %rd23, %r39, 96;
add.s64 %rd24, %rd15, %rd23;
add.s64 %rd11, %rd24, 412;
ld.u32 %r18, [%rd24+412];
setp.gt.s32 %p23, %r18, 44;
@%p23 bra BB57_28;
ld.f32 %f39, [%rd11+4];
add.ftz.f32 %f107, %f38, %f39;
setp.lt.ftz.f32 %p24, %f37, %f107;
@%p24 bra BB57_32;
bra.uni BB57_30;
BB57_28:
mov.f32 %f107, %f38;
BB57_30:
add.s32 %r39, %r39, 1;
setp.lt.s32 %p25, %r39, %r1;
mov.f32 %f38, %f107;
@%p25 bra BB57_27;
mov.u32 %r39, %r32;
bra.uni BB57_34;
BB57_32:
setp.lt.s32 %p26, %r18, 39;
@%p26 bra BB57_36;
bra.uni BB57_33;
BB57_36:
sub.ftz.f32 %f76, %f95, %f94;
div.approx.ftz.f32 %f77, %f95, %f76;
ld.f32 %f78, [%rd16];
mul.ftz.f32 %f79, %f77, %f78;
ld.f32 %f80, [%rd16+4];
mul.ftz.f32 %f81, %f77, %f80;
ld.f32 %f82, [%rd16+8];
mul.ftz.f32 %f83, %f77, %f82;
st.f32 [%rd16], %f79;
st.f32 [%rd16+4], %f81;
st.f32 [%rd16+8], %f83;
mov.u64 %rd33, 0;
bra.uni BB57_37;
BB57_33:
div.approx.ftz.f32 %f67, %f95, %f94;
ld.f32 %f68, [%rd16];
mul.ftz.f32 %f69, %f67, %f68;
ld.f32 %f70, [%rd16+4];
mul.ftz.f32 %f71, %f67, %f70;
ld.f32 %f72, [%rd16+8];
mul.ftz.f32 %f73, %f67, %f72;
st.f32 [%rd16], %f69;
st.f32 [%rd16+4], %f71;
st.f32 [%rd16+8], %f73;
sub.ftz.f32 %f74, %f37, %f38;
div.approx.ftz.f32 %f75, %f74, %f39;
st.local.f32 [%rd1], %f75;
BB57_34:
mul.wide.s32 %rd26, %r39, 96;
add.s64 %rd27, %rd15, %rd26;
ld.u32 %r21, [%rd27+412];
mov.u64 %rd33, 0;
setp.lt.s32 %p27, %r21, 39;
@%p27 bra BB57_37;
setp.lt.s32 %p28, %r21, 45;
add.s64 %rd30, %rd27, 400;
selp.b64 %rd33, %rd30, 0, %p28;
BB57_37:
st.param.b64 [func_retval0+0], %rd33;
ret;
}
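To make the dump easier to follow, this is how I read the second half of the buggy PTX (labels as in the listing above):

- BB57_27 through BB57_30: the second for-loop; the index i lives in %r39 and partial_sum in %f38.
- BB57_32: the 'r < next_sum' hit; it branches on the closure type (BSDF if the type is < 39).
- BB57_36: the BSDF branch: *throughput *= sum / (sum - sum_bssrdf), return value set to 0 (NULL).
- BB57_33: the BSSRDF ('break') branch: *throughput *= sum / sum_bssrdf and *randu = (r - partial_sum) / sample_weight, which then falls through into
- BB57_34: the common tail, which re-loads closure[%r39].type and returns &sd->closure[%r39] only if it is a BSSRDF.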
The correctly functioning code, where 'break' is replaced with 'return sc', is translated as follows:
.func (.param .b64 func_retval0) _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf(
.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_0,
.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_1,
.param .b64 _Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_2
)
{
.reg .pred %p<29>;
.reg .f32 %f<108>;
.reg .b32 %r<36>;
.reg .b64 %rd<34>;
ld.param.u64 %rd17, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_0];
ld.param.u64 %rd18, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_1];
ld.param.u64 %rd19, [_Z18shader_bssrdf_pickP10ShaderDataP6float3Pf_param_2];
cvta.to.local.u64 %rd1, %rd19;
add.s64 %rd2, %rd17, 328;
ld.u32 %r1, [%rd17+328];
setp.lt.s32 %p1, %r1, 2;
@%p1 bra BB57_31;
mov.f32 %f38, 0f00000000;
setp.lt.s32 %p2, %r1, 1;
mov.f32 %f94, %f38;
mov.f32 %f95, %f38;
@%p2 bra BB57_26;
and.b32 %r2, %r1, 3;
setp.eq.s32 %p3, %r2, 0;
mov.f32 %f94, 0f00000000;
mov.u32 %r33, 0;
mov.f32 %f95, %f94;
@%p3 bra BB57_15;
setp.eq.s32 %p4, %r2, 1;
mov.f32 %f94, 0f00000000;
mov.u32 %r32, 0;
mov.f32 %f95, %f94;
@%p4 bra BB57_12;
setp.eq.s32 %p5, %r2, 2;
mov.f32 %f94, 0f00000000;
mov.u32 %r31, 0;
@%p5 bra BB57_5;
bra.uni BB57_6;
BB57_5:
mov.f32 %f95, %f94;
bra.uni BB57_9;
BB57_6:
ld.u32 %r3, [%rd2+84];
mov.u32 %r31, 1;
setp.gt.s32 %p6, %r3, 44;
@%p6 bra BB57_7;
ld.f32 %f52, [%rd2+88];
add.ftz.f32 %f95, %f52, 0f00000000;
setp.gt.s32 %p7, %r3, 38;
selp.f32 %f94, %f95, 0f00000000, %p7;
bra.uni BB57_9;
BB57_7:
mov.f32 %f95, %f94;
BB57_9:
cvt.u64.u32 %rd3, %r31;
mul.wide.u32 %rd20, %r31, 96;
add.s64 %rd21, %rd17, %rd20;
add.s64 %rd4, %rd21, 412;
ld.u32 %r5, [%rd21+412];
setp.gt.s32 %p8, %r5, 44;
@%p8 bra BB57_11;
ld.f32 %f53, [%rd4+4];
add.ftz.f32 %f95, %f95, %f53;
add.ftz.f32 %f54, %f94, %f53;
setp.gt.s32 %p9, %r5, 38;
selp.f32 %f94, %f54, %f94, %p9;
BB57_11:
cvt.u32.u64 %r26, %rd3;
add.s32 %r32, %r26, 1;
BB57_12:
cvt.s64.s32 %rd5, %r32;
mul.wide.s32 %rd22, %r32, 96;
add.s64 %rd23, %rd17, %rd22;
add.s64 %rd6, %rd23, 412;
ld.u32 %r8, [%rd23+412];
setp.gt.s32 %p10, %r8, 44;
@%p10 bra BB57_14;
ld.f32 %f55, [%rd6+4];
add.ftz.f32 %f95, %f95, %f55;
add.ftz.f32 %f56, %f94, %f55;
setp.gt.s32 %p11, %r8, 38;
selp.f32 %f94, %f56, %f94, %p11;
BB57_14:
cvt.u32.u64 %r27, %rd5;
add.s32 %r33, %r27, 1;
BB57_15:
setp.lt.u32 %p12, %r1, 4;
@%p12 bra BB57_26;
mul.lo.s32 %r28, %r33, 96;
cvt.s64.s32 %rd24, %r28;
add.s64 %rd32, %rd17, %rd24;
BB57_17:
add.s64 %rd9, %rd32, 412;
ld.u32 %r12, [%rd32+412];
setp.gt.s32 %p13, %r12, 44;
@%p13 bra BB57_19;
ld.f32 %f57, [%rd9+4];
add.ftz.f32 %f95, %f95, %f57;
add.ftz.f32 %f58, %f94, %f57;
setp.gt.s32 %p14, %r12, 38;
selp.f32 %f94, %f58, %f94, %p14;
BB57_19:
ld.u32 %r13, [%rd9+96];
setp.gt.s32 %p15, %r13, 44;
@%p15 bra BB57_21;
ld.f32 %f59, [%rd9+100];
add.ftz.f32 %f95, %f95, %f59;
add.ftz.f32 %f60, %f94, %f59;
setp.gt.s32 %p16, %r13, 38;
selp.f32 %f94, %f60, %f94, %p16;
BB57_21:
ld.u32 %r14, [%rd9+192];
setp.gt.s32 %p17, %r14, 44;
@%p17 bra BB57_23;
ld.f32 %f61, [%rd9+196];
add.ftz.f32 %f95, %f95, %f61;
add.ftz.f32 %f62, %f94, %f61;
setp.gt.s32 %p18, %r14, 38;
selp.f32 %f94, %f62, %f94, %p18;
BB57_23:
ld.u32 %r15, [%rd9+288];
setp.gt.s32 %p19, %r15, 44;
@%p19 bra BB57_25;
ld.f32 %f63, [%rd9+292];
add.ftz.f32 %f95, %f95, %f63;
add.ftz.f32 %f64, %f94, %f63;
setp.gt.s32 %p20, %r15, 38;
selp.f32 %f94, %f64, %f94, %p20;
BB57_25:
add.s64 %rd32, %rd32, 384;
add.s32 %r33, %r33, 4;
setp.lt.s32 %p21, %r33, %r1;
@%p21 bra BB57_17;
BB57_26:
ld.local.f32 %f66, [%rd1];
mul.ftz.f32 %f37, %f95, %f66;
mov.u32 %r35, 0;
@%p2 bra BB57_31;
BB57_27:
cvt.s64.s32 %rd11, %r35;
mul.wide.s32 %rd25, %r35, 96;
add.s64 %rd26, %rd17, %rd25;
add.s64 %rd12, %rd26, 412;
ld.u32 %r18, [%rd26+412];
setp.gt.s32 %p23, %r18, 44;
@%p23 bra BB57_28;
ld.f32 %f39, [%rd12+4];
add.ftz.f32 %f107, %f38, %f39;
setp.lt.ftz.f32 %p24, %f37, %f107;
@%p24 bra BB57_33;
bra.uni BB57_30;
BB57_28:
mov.f32 %f107, %f38;
BB57_30:
cvt.u32.u64 %r30, %rd11;
add.s32 %r35, %r30, 1;
setp.lt.s32 %p25, %r35, %r1;
mov.f32 %f38, %f107;
@%p25 bra BB57_27;
BB57_31:
ld.u32 %r20, [%rd2+84];
mov.u64 %rd33, 0;
setp.lt.s32 %p26, %r20, 39;
@%p26 bra BB57_36;
add.s64 %rd28, %rd17, 400;
setp.lt.s32 %p27, %r20, 45;
selp.b64 %rd33, %rd28, 0, %p27;
BB57_36:
st.param.b64 [func_retval0+0], %rd33;
ret;
BB57_33:
add.s64 %rd33, %rd26, 400;
setp.lt.s32 %p28, %r18, 39;
@%p28 bra BB57_35;
bra.uni BB57_34;
BB57_35:
sub.ftz.f32 %f76, %f95, %f94;
div.approx.ftz.f32 %f77, %f95, %f76;
ld.f32 %f78, [%rd18];
mul.ftz.f32 %f79, %f77, %f78;
ld.f32 %f80, [%rd18+4];
mul.ftz.f32 %f81, %f77, %f80;
ld.f32 %f82, [%rd18+8];
mul.ftz.f32 %f83, %f77, %f82;
st.f32 [%rd18], %f79;
st.f32 [%rd18+4], %f81;
st.f32 [%rd18+8], %f83;
mov.u64 %rd33, 0;
bra.uni BB57_36;
BB57_34:
div.approx.ftz.f32 %f67, %f95, %f94;
ld.f32 %f68, [%rd18];
mul.ftz.f32 %f69, %f67, %f68;
ld.f32 %f70, [%rd18+4];
mul.ftz.f32 %f71, %f67, %f70;
ld.f32 %f72, [%rd18+8];
mul.ftz.f32 %f73, %f67, %f72;
st.f32 [%rd18], %f69;
st.f32 [%rd18+4], %f71;
st.f32 [%rd18+8], %f73;
sub.ftz.f32 %f74, %f37, %f38;
div.approx.ftz.f32 %f75, %f74, %f39;
st.local.f32 [%rd1], %f75;
bra.uni BB57_36;
}
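The interesting difference is at the 'break'/'return sc' point: in the working version, the sampled-closure pointer is computed right at the hit (BB57_33: add.s64 %rd33, %rd26, 400) and the BSSRDF path (BB57_34 there) jumps straight to the return with it, while in the buggy version the BSSRDF path falls through into the common tail (its BB57_34), which re-derives the pointer from %r39 and re-checks the type. As far as I can tell, the two listings are still semantically equivalent at the PTX level, so I suspect the miscompilation happens when ptxas turns the buggy control-flow shape into a cubin.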
I tested this with a GeForce GTX 980 Ti (CUDA compute capability 5.2) and a GTX 750 Ti (5.0) and got the same result on both.
The compiler options I used are as follows:
nvcc -keep -D NVCC -m 64 --use_fast_math -arch=sm_50 (or -arch=sm_52) --cubin --ptxas-options="-v" (…source and output info)
What makes this frustrating is that upgrading from CUDA 8.0 to 9.2 boosted rendering performance by more than 2.5x on scenes that show no glitches.
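In case it helps anyone reproduce this, below is a minimal host-side harness sketch showing how I check the return value. The ShaderData/ShaderClosure definitions and the float3 operator are simplified stand-ins (the real Cycles structs and math helpers are much larger), and the closure type IDs are the values implied by the PTX comparisons, so this may not trigger the exact same code generation:

#include <cstdio>
#include <cuda_runtime.h>

/* Simplified stand-ins -- the real Cycles structs have many more fields. */
struct ShaderClosure { int type; float sample_weight; };
struct ShaderData    { int num_closure; ShaderClosure closure[4]; };

/* Cycles defines its own float3 operators; a minimal stand-in: */
__device__ __inline__ void operator*=(float3 &a, float f)
{
    a.x *= f; a.y *= f; a.z *= f;
}

/* Type IDs implied by the PTX comparisons against 38 and 44 (assumption): */
enum {
    CLOSURE_BSDF_TRANSPARENT_ID              = 38,
    CLOSURE_BSSRDF_CUBIC_ID                  = 39,
    CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID = 44,
};

/* ... the three CLOSURE_IS_* macros and shader_bssrdf_pick() from above go here ... */

__global__ void test_pick(ShaderData *sd, float3 *throughput, float *randu, int *got_null)
{
    *got_null = (shader_bssrdf_pick(sd, throughput, randu) == NULL);
}

int main()
{
    ShaderData h_sd = {};
    h_sd.num_closure = 2;
    h_sd.closure[0].type = 38;  h_sd.closure[0].sample_weight = 0.5f; /* a BSDF   */
    h_sd.closure[1].type = 39;  h_sd.closure[1].sample_weight = 0.5f; /* a BSSRDF */
    float3 h_tp = make_float3(1.0f, 1.0f, 1.0f);
    float h_randu = 0.9f; /* r = 0.9*sum lands on the BSSRDF, so non-NULL is expected */

    ShaderData *d_sd; float3 *d_tp; float *d_ru; int *d_null;
    cudaMalloc(&d_sd, sizeof(h_sd));
    cudaMalloc(&d_tp, sizeof(h_tp));
    cudaMalloc(&d_ru, sizeof(h_randu));
    cudaMalloc(&d_null, sizeof(int));
    cudaMemcpy(d_sd, &h_sd, sizeof(h_sd), cudaMemcpyHostToDevice);
    cudaMemcpy(d_tp, &h_tp, sizeof(h_tp), cudaMemcpyHostToDevice);
    cudaMemcpy(d_ru, &h_randu, sizeof(h_randu), cudaMemcpyHostToDevice);

    test_pick<<<1, 1>>>(d_sd, d_tp, d_ru, d_null);

    int got_null = -1;
    cudaMemcpy(&got_null, d_null, sizeof(int), cudaMemcpyDeviceToHost);
    printf("shader_bssrdf_pick returned %s\n",
           got_null ? "NULL (the buggy behaviour)" : "the BSSRDF closure (expected)");
    return 0;
}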