Unexpected behavior with restrict keyword?

pedro-1.hermosilla-casaju · August 6, 2019, 1:59pm

Dear all,

I have some questions related to the restrict keyword. In the following function if I declare the input parameters with restrict the kernel generates wrong results. However, if I do not use it, the kernel behaves as expected.

template<int D>
__global__ void pre_proc_neighbors_gpu_kernel(
    const unsigned int pNeighOffset,
    const unsigned int pNumNeighbors,       
    const mccnn::fpoint<D>* __restrict__ pInPtsGPUPtr,
    const mccnn::fpoint<D>* __restrict__ pInSamplesGPUPtr,
    const int2* __restrict__ pInNeighborsGPUPtr,
    const int* __restrict__ pInSampleNeighIGPUPtr,
    const mccnn::fpoint<D>* __restrict__ pInInvRadiiGPUPtr,
    const float* __restrict__ pInPDFsGPUPtr,
    mccnn::fpoint<D>* __restrict__ pOutPtDiffsGPUPtr,
    float*  __restrict__ pOutPtWeightsGPUPtr)
{
    const unsigned int initThreadIndex = mccnn::compute_global_index_gpu_funct();
    const unsigned int totalNumThreads = mccnn::compute_total_threads_gpu_funct(); 

    for(unsigned int curIter = initThreadIndex; 
        curIter < pNumNeighbors; curIter += totalNumThreads)
    {
        int2 neighAndSampleIndices = pInNeighborsGPUPtr[curIter+pNeighOffset];

        mccnn::fpoint<D> ptDiff = (pInPtsGPUPtr[neighAndSampleIndices.x] - 
            pInSamplesGPUPtr[neighAndSampleIndices.y])*<b>pInInvRadiiGPUPtr[0]</b>;
                
        float weightVal = 1.0f - max(2.0f*mccnn::length(ptDiff) - 1.0f, 0.0f);
        weightVal = weightVal * weightVal * (3.0f - 2.0f * weightVal);

        float numPts = (float)((neighAndSampleIndices.y > 0)?
            pInSampleNeighIGPUPtr[neighAndSampleIndices.y] - 
            pInSampleNeighIGPUPtr[neighAndSampleIndices.y-1]:
            pInSampleNeighIGPUPtr[neighAndSampleIndices.y]);

        weightVal *= 1.0f/(pInPDFsGPUPtr[curIter+pNeighOffset]*numPts);

        pOutPtDiffsGPUPtr[curIter] = ptDiff;
        pOutPtWeightsGPUPtr[curIter] = weightVal;
    }
}

I generated the assembly code of such kernel for both cases and the only difference is that the kernel with restrict only loads variable pInInvRadiiGPUPtr once and uses the value in the register each iteration, while the kernel without the restrict loads it every iteration. The compiler is behaving as expected since assumes that the value of the variable is not changed. However, both version should produce correct results but this is not the case. This value is constant and loading it once should generate correct results, and even if this value is modified, loading it again should produce wrong results too because the value should be constant. In order to be sure this is the problem, I removed the restrict keyword from the parameter pInInvRadiiGPUPtr and one of the output parameters and then the compiler generates code that loads the value each iteration and the results are ok.

Here is the assembly code for the two versions of the kernel.

Assembly with restrict:

cvta.to.global.u64 	%rd11, %rd7;
	mov.u32 	%r13, %nctaid.x;
	mul.lo.s32 	%r3, %r13, %r1;
	[b]ld.global.f32 	%f1, [%rd11];
	ld.global.f32 	%f2, [%rd11+4];[/b]
	cvta.to.global.u64 	%rd12, %rd5;
	cvta.to.global.u64 	%rd16, %rd3;
	cvta.to.global.u64 	%rd19, %rd4;
	cvta.to.global.u64 	%rd22, %rd6;
	cvta.to.global.u64 	%rd24, %rd8;
	cvta.to.global.u64 	%rd27, %rd9;
	cvta.to.global.u64 	%rd30, %rd10;

BB0_2:
	add.s32 	%r14, %r20, %r9;
	cvt.u64.u32	%rd1, %r14;
	mul.wide.u32 	%rd13, %r14, 8;
	add.s64 	%rd14, %rd12, %rd13;
	ld.global.v2.u32 	{%r15, %r16}, [%rd14];
	cvt.s64.s32	%rd15, %r16;
	mul.wide.s32 	%rd17, %r15, 8;
	add.s64 	%rd18, %rd16, %rd17;
	mul.wide.s32 	%rd20, %r16, 8;
	add.s64 	%rd21, %rd19, %rd20;
	ld.global.f32 	%f6, [%rd21];
	ld.global.f32 	%f7, [%rd18];
	sub.f32 	%f8, %f7, %f6;
	ld.global.f32 	%f9, [%rd21+4];
	ld.global.f32 	%f10, [%rd18+4];
	sub.f32 	%f11, %f10, %f9;
	mul.f32 	%f3, %f8, %f1;
	mul.f32 	%f4, %f11, %f2;
	fma.rn.f32 	%f12, %f3, %f3, %f13;
	fma.rn.f32 	%f14, %f4, %f4, %f12;
	sqrt.rn.f32 	%f15, %f14;
	fma.rn.f32 	%f16, %f15, 0f40000000, 0fBF800000;
	mov.f32 	%f17, 0f00000000;
	max.f32 	%f5, %f16, %f17;
	mul.wide.s32 	%rd23, %r16, 4;
	add.s64 	%rd2, %rd22, %rd23;
	ld.global.u32 	%r21, [%rd2];
	setp.lt.s64	%p2, %rd15, 1;
	@%p2 bra 	BB0_4;

	ld.global.u32 	%r19, [%rd2+-4];
	sub.s32 	%r21, %r21, %r19;

BB0_4:
	mov.f32 	%f18, 0f3F800000;
	sub.f32 	%f19, %f18, %f5;
	mul.f32 	%f20, %f19, %f19;
	fma.rn.f32 	%f21, %f19, 0fC0000000, 0f40400000;
	mul.f32 	%f22, %f20, %f21;
	shl.b64 	%rd25, %rd1, 2;
	add.s64 	%rd26, %rd24, %rd25;
	ld.global.f32 	%f23, [%rd26];
	cvt.rn.f32.s32	%f24, %r21;
	mul.f32 	%f25, %f23, %f24;
	rcp.rn.f32 	%f26, %f25;
	mul.f32 	%f27, %f22, %f26;
	mul.wide.u32 	%rd28, %r20, 8;
	add.s64 	%rd29, %rd27, %rd28;
	st.global.f32 	[%rd29], %f3;
	st.global.f32 	[%rd29+4], %f4;
	mul.wide.u32 	%rd31, %r20, 4;
	add.s64 	%rd32, %rd30, %rd31;
	st.global.f32 	[%rd32], %f27;
	add.s32 	%r20, %r20, %r3;
	setp.lt.u32	%p3, %r20, %r10;
	@%p3 bra 	BB0_2;

BB0_5:
	ret;

Assembly without restrict:

cvta.to.global.u64 	%rd1, %rd12;
	cvta.to.global.u64 	%rd2, %rd11;
	mov.u32 	%r13, %nctaid.x;
	mul.lo.s32 	%r3, %r13, %r1;
	cvta.to.global.u64 	%rd13, %rd9;
	cvta.to.global.u64 	%rd14, %rd7;
	cvta.to.global.u64 	%rd18, %rd5;
	cvta.to.global.u64 	%rd21, %rd6;
	cvta.to.global.u64 	%rd24, %rd8;
	cvta.to.global.u64 	%rd26, %rd10;

BB0_2:
	add.s32 	%r14, %r20, %r9;
	cvt.u64.u32	%rd3, %r14;
	mul.wide.u32 	%rd15, %r14, 8;
	add.s64 	%rd16, %rd14, %rd15;
	ld.global.v2.u32 	{%r15, %r16}, [%rd16];
	cvt.s64.s32	%rd17, %r16;
	mul.wide.s32 	%rd19, %r15, 8;
	add.s64 	%rd20, %rd18, %rd19;
	mul.wide.s32 	%rd22, %r16, 8;
	add.s64 	%rd23, %rd21, %rd22;
	ld.global.f32 	%f4, [%rd23];
	ld.global.f32 	%f5, [%rd20];
	sub.f32 	%f6, %f5, %f4;
	ld.global.f32 	%f7, [%rd23+4];
	ld.global.f32 	%f8, [%rd20+4];
	sub.f32 	%f9, %f8, %f7;
	<b>ld.global.f32 	%f10, [%rd13];</b>
	mul.f32 	%f1, %f6, %f10;
	<b>ld.global.f32 	%f11, [%rd13+4];</b>
	mul.f32 	%f2, %f9, %f11;
	fma.rn.f32 	%f12, %f1, %f1, %f13;
	fma.rn.f32 	%f14, %f2, %f2, %f12;
	sqrt.rn.f32 	%f15, %f14;
	fma.rn.f32 	%f16, %f15, 0f40000000, 0fBF800000;
	mov.f32 	%f17, 0f00000000;
	max.f32 	%f3, %f16, %f17;
	mul.wide.s32 	%rd25, %r16, 4;
	add.s64 	%rd4, %rd24, %rd25;
	ld.global.u32 	%r21, [%rd4];
	setp.lt.s64	%p2, %rd17, 1;
	@%p2 bra 	BB0_4;

	ld.global.u32 	%r19, [%rd4+-4];
	sub.s32 	%r21, %r21, %r19;

BB0_4:
	mov.f32 	%f18, 0f3F800000;
	sub.f32 	%f19, %f18, %f3;
	mul.f32 	%f20, %f19, %f19;
	fma.rn.f32 	%f21, %f19, 0fC0000000, 0f40400000;
	mul.f32 	%f22, %f20, %f21;
	shl.b64 	%rd27, %rd3, 2;
	add.s64 	%rd28, %rd26, %rd27;
	ld.global.f32 	%f23, [%rd28];
	cvt.rn.f32.s32	%f24, %r21;
	mul.f32 	%f25, %f23, %f24;
	rcp.rn.f32 	%f26, %f25;
	mul.f32 	%f27, %f22, %f26;
	mul.wide.u32 	%rd29, %r20, 8;
	add.s64 	%rd30, %rd2, %rd29;
	st.global.f32 	[%rd30], %f1;
	st.global.f32 	[%rd30+4], %f2;
	mul.wide.u32 	%rd31, %r20, 4;
	add.s64 	%rd32, %rd1, %rd31;
	st.global.f32 	[%rd32], %f27;
	add.s32 	%r20, %r20, %r3;
	setp.lt.u32	%p3, %r20, %r10;
	@%p3 bra 	BB0_2;

BB0_5:
	ret;

I am working on a Linux (Ubuntu 16.04) system with and RTX 2080 and driver 410.57. I would really appreciate if someone can help me to understand what is going on.

Thank you in advance

Topic		Replies	Views
Restrict usage full overlapping element-by-element processing CUDA Programming and Performance	15	1336	March 11, 2021
Does CUDA harness the restrict functionality? CUDA Programming and Performance	9	2378	September 2, 2016
Using __restricct__ in CUDA is not giving any significant performance benifit CUDA Programming and Performance	2	588	February 9, 2023
Clarification of __restrict__ in cuda CUDA Programming and Performance compile	1	1104	September 12, 2020
__restrict__ keyword CUDA Programming and Performance	0	1572	August 24, 2010
__restrict__ seems to be ignored for base pointers in structs. having base pointers with restrict as kernel arguments directly works as expected CUDA Programming and Performance	12	2264	October 17, 2020
CUDA Pro Tip: Optimize for Pointer Aliasing Technical Blog	13	1172	April 11, 2019
Does the use of 16-bit, __restrict__ const kernel arguments hurt performance? CUDA Programming and Performance	4	4746	May 24, 2018
__restrict__ seems to be ignored for base pointers in structs. having base pointers with restrict as kernel arguments directly works as expected nvc, nvc++ and nvfortran cuda	1	607	March 10, 2021
Order of keywords (restrict) for loop dependencies Legacy PGI Compilers	1	3104	August 26, 2010

Unexpected behavior with __restrict__ keyword?

Related topics

Unexpected behavior with restrict keyword?