Unexpected behavior with __restrict__ keyword?

Dear all,

I have some questions related to the restrict keyword. In the following function if I declare the input parameters with restrict the kernel generates wrong results. However, if I do not use it, the kernel behaves as expected.

template<int D>
__global__ void pre_proc_neighbors_gpu_kernel(
    const unsigned int pNeighOffset,
    const unsigned int pNumNeighbors,       
    const mccnn::fpoint<D>* __restrict__ pInPtsGPUPtr,
    const mccnn::fpoint<D>* __restrict__ pInSamplesGPUPtr,
    const int2* __restrict__ pInNeighborsGPUPtr,
    const int* __restrict__ pInSampleNeighIGPUPtr,
    const mccnn::fpoint<D>* __restrict__ pInInvRadiiGPUPtr,
    const float* __restrict__ pInPDFsGPUPtr,
    mccnn::fpoint<D>* __restrict__ pOutPtDiffsGPUPtr,
    float*  __restrict__ pOutPtWeightsGPUPtr)
{
    const unsigned int initThreadIndex = mccnn::compute_global_index_gpu_funct();
    const unsigned int totalNumThreads = mccnn::compute_total_threads_gpu_funct(); 

    for(unsigned int curIter = initThreadIndex; 
        curIter < pNumNeighbors; curIter += totalNumThreads)
    {
        int2 neighAndSampleIndices = pInNeighborsGPUPtr[curIter+pNeighOffset];

        mccnn::fpoint<D> ptDiff = (pInPtsGPUPtr[neighAndSampleIndices.x] - 
            pInSamplesGPUPtr[neighAndSampleIndices.y])*<b>pInInvRadiiGPUPtr[0]</b>;
                
        float weightVal = 1.0f - max(2.0f*mccnn::length(ptDiff) - 1.0f, 0.0f);
        weightVal = weightVal * weightVal * (3.0f - 2.0f * weightVal);

        float numPts = (float)((neighAndSampleIndices.y > 0)?
            pInSampleNeighIGPUPtr[neighAndSampleIndices.y] - 
            pInSampleNeighIGPUPtr[neighAndSampleIndices.y-1]:
            pInSampleNeighIGPUPtr[neighAndSampleIndices.y]);

        weightVal *= 1.0f/(pInPDFsGPUPtr[curIter+pNeighOffset]*numPts);

        pOutPtDiffsGPUPtr[curIter] = ptDiff;
        pOutPtWeightsGPUPtr[curIter] = weightVal;
    }
}

I generated the assembly code of such kernel for both cases and the only difference is that the kernel with restrict only loads variable pInInvRadiiGPUPtr once and uses the value in the register each iteration, while the kernel without the restrict loads it every iteration. The compiler is behaving as expected since assumes that the value of the variable is not changed. However, both version should produce correct results but this is not the case. This value is constant and loading it once should generate correct results, and even if this value is modified, loading it again should produce wrong results too because the value should be constant. In order to be sure this is the problem, I removed the restrict keyword from the parameter pInInvRadiiGPUPtr and one of the output parameters and then the compiler generates code that loads the value each iteration and the results are ok.

Here is the assembly code for the two versions of the kernel.

Assembly with restrict:

cvta.to.global.u64 	%rd11, %rd7;
	mov.u32 	%r13, %nctaid.x;
	mul.lo.s32 	%r3, %r13, %r1;
	[b]ld.global.f32 	%f1, [%rd11];
	ld.global.f32 	%f2, [%rd11+4];[/b]
	cvta.to.global.u64 	%rd12, %rd5;
	cvta.to.global.u64 	%rd16, %rd3;
	cvta.to.global.u64 	%rd19, %rd4;
	cvta.to.global.u64 	%rd22, %rd6;
	cvta.to.global.u64 	%rd24, %rd8;
	cvta.to.global.u64 	%rd27, %rd9;
	cvta.to.global.u64 	%rd30, %rd10;

BB0_2:
	add.s32 	%r14, %r20, %r9;
	cvt.u64.u32	%rd1, %r14;
	mul.wide.u32 	%rd13, %r14, 8;
	add.s64 	%rd14, %rd12, %rd13;
	ld.global.v2.u32 	{%r15, %r16}, [%rd14];
	cvt.s64.s32	%rd15, %r16;
	mul.wide.s32 	%rd17, %r15, 8;
	add.s64 	%rd18, %rd16, %rd17;
	mul.wide.s32 	%rd20, %r16, 8;
	add.s64 	%rd21, %rd19, %rd20;
	ld.global.f32 	%f6, [%rd21];
	ld.global.f32 	%f7, [%rd18];
	sub.f32 	%f8, %f7, %f6;
	ld.global.f32 	%f9, [%rd21+4];
	ld.global.f32 	%f10, [%rd18+4];
	sub.f32 	%f11, %f10, %f9;
	mul.f32 	%f3, %f8, %f1;
	mul.f32 	%f4, %f11, %f2;
	fma.rn.f32 	%f12, %f3, %f3, %f13;
	fma.rn.f32 	%f14, %f4, %f4, %f12;
	sqrt.rn.f32 	%f15, %f14;
	fma.rn.f32 	%f16, %f15, 0f40000000, 0fBF800000;
	mov.f32 	%f17, 0f00000000;
	max.f32 	%f5, %f16, %f17;
	mul.wide.s32 	%rd23, %r16, 4;
	add.s64 	%rd2, %rd22, %rd23;
	ld.global.u32 	%r21, [%rd2];
	setp.lt.s64	%p2, %rd15, 1;
	@%p2 bra 	BB0_4;

	ld.global.u32 	%r19, [%rd2+-4];
	sub.s32 	%r21, %r21, %r19;

BB0_4:
	mov.f32 	%f18, 0f3F800000;
	sub.f32 	%f19, %f18, %f5;
	mul.f32 	%f20, %f19, %f19;
	fma.rn.f32 	%f21, %f19, 0fC0000000, 0f40400000;
	mul.f32 	%f22, %f20, %f21;
	shl.b64 	%rd25, %rd1, 2;
	add.s64 	%rd26, %rd24, %rd25;
	ld.global.f32 	%f23, [%rd26];
	cvt.rn.f32.s32	%f24, %r21;
	mul.f32 	%f25, %f23, %f24;
	rcp.rn.f32 	%f26, %f25;
	mul.f32 	%f27, %f22, %f26;
	mul.wide.u32 	%rd28, %r20, 8;
	add.s64 	%rd29, %rd27, %rd28;
	st.global.f32 	[%rd29], %f3;
	st.global.f32 	[%rd29+4], %f4;
	mul.wide.u32 	%rd31, %r20, 4;
	add.s64 	%rd32, %rd30, %rd31;
	st.global.f32 	[%rd32], %f27;
	add.s32 	%r20, %r20, %r3;
	setp.lt.u32	%p3, %r20, %r10;
	@%p3 bra 	BB0_2;

BB0_5:
	ret;

Assembly without restrict:

cvta.to.global.u64 	%rd1, %rd12;
	cvta.to.global.u64 	%rd2, %rd11;
	mov.u32 	%r13, %nctaid.x;
	mul.lo.s32 	%r3, %r13, %r1;
	cvta.to.global.u64 	%rd13, %rd9;
	cvta.to.global.u64 	%rd14, %rd7;
	cvta.to.global.u64 	%rd18, %rd5;
	cvta.to.global.u64 	%rd21, %rd6;
	cvta.to.global.u64 	%rd24, %rd8;
	cvta.to.global.u64 	%rd26, %rd10;

BB0_2:
	add.s32 	%r14, %r20, %r9;
	cvt.u64.u32	%rd3, %r14;
	mul.wide.u32 	%rd15, %r14, 8;
	add.s64 	%rd16, %rd14, %rd15;
	ld.global.v2.u32 	{%r15, %r16}, [%rd16];
	cvt.s64.s32	%rd17, %r16;
	mul.wide.s32 	%rd19, %r15, 8;
	add.s64 	%rd20, %rd18, %rd19;
	mul.wide.s32 	%rd22, %r16, 8;
	add.s64 	%rd23, %rd21, %rd22;
	ld.global.f32 	%f4, [%rd23];
	ld.global.f32 	%f5, [%rd20];
	sub.f32 	%f6, %f5, %f4;
	ld.global.f32 	%f7, [%rd23+4];
	ld.global.f32 	%f8, [%rd20+4];
	sub.f32 	%f9, %f8, %f7;
	<b>ld.global.f32 	%f10, [%rd13];</b>
	mul.f32 	%f1, %f6, %f10;
	<b>ld.global.f32 	%f11, [%rd13+4];</b>
	mul.f32 	%f2, %f9, %f11;
	fma.rn.f32 	%f12, %f1, %f1, %f13;
	fma.rn.f32 	%f14, %f2, %f2, %f12;
	sqrt.rn.f32 	%f15, %f14;
	fma.rn.f32 	%f16, %f15, 0f40000000, 0fBF800000;
	mov.f32 	%f17, 0f00000000;
	max.f32 	%f3, %f16, %f17;
	mul.wide.s32 	%rd25, %r16, 4;
	add.s64 	%rd4, %rd24, %rd25;
	ld.global.u32 	%r21, [%rd4];
	setp.lt.s64	%p2, %rd17, 1;
	@%p2 bra 	BB0_4;

	ld.global.u32 	%r19, [%rd4+-4];
	sub.s32 	%r21, %r21, %r19;

BB0_4:
	mov.f32 	%f18, 0f3F800000;
	sub.f32 	%f19, %f18, %f3;
	mul.f32 	%f20, %f19, %f19;
	fma.rn.f32 	%f21, %f19, 0fC0000000, 0f40400000;
	mul.f32 	%f22, %f20, %f21;
	shl.b64 	%rd27, %rd3, 2;
	add.s64 	%rd28, %rd26, %rd27;
	ld.global.f32 	%f23, [%rd28];
	cvt.rn.f32.s32	%f24, %r21;
	mul.f32 	%f25, %f23, %f24;
	rcp.rn.f32 	%f26, %f25;
	mul.f32 	%f27, %f22, %f26;
	mul.wide.u32 	%rd29, %r20, 8;
	add.s64 	%rd30, %rd2, %rd29;
	st.global.f32 	[%rd30], %f1;
	st.global.f32 	[%rd30+4], %f2;
	mul.wide.u32 	%rd31, %r20, 4;
	add.s64 	%rd32, %rd1, %rd31;
	st.global.f32 	[%rd32], %f27;
	add.s32 	%r20, %r20, %r3;
	setp.lt.u32	%p3, %r20, %r10;
	@%p3 bra 	BB0_2;

BB0_5:
	ret;

I am working on a Linux (Ubuntu 16.04) system with and RTX 2080 and driver 410.57. I would really appreciate if someone can help me to understand what is going on.

Thank you in advance