`unsigned int tea(unsigned int val0, unsigned int val1)` returns an incorrect result in a raygen program!

// Mixes (val0, val1) through N rounds of the loop below and returns the
// resulting first word. Presumably the Tiny Encryption Algorithm round
// function (the name and the 0x9e3779b9 increment suggest it) - confirm.
//
//   N    - number of rounds (compile-time template parameter).
//   val0 - initial value of the first state word, v0.
//   val1 - initial value of the second state word, v1.
//   p    - meant to enable the debug printf calls; it has no effect in this
//          version because every printf is commented out.
//
// Returns the final v0; v1 and s0 are only intermediate state.
// Everything is unsigned int, so the additions wrap around on overflow.
template<unsigned int N>
static __device__ __inline__ unsigned int tea( unsigned int val0, unsigned int val1,bool p=false )
{
  unsigned int v0 = val0;
  unsigned int v1 = val1;
  unsigned int s0 = 0;  // running sum, increased by a fixed constant each round

  // if(p)printf("Iteration %d: v0 = %d, v1 = %d, s0 = %d\n", N,v0, v1, s0);
  for( unsigned int n = 0; n < N; n++ )
  {
    s0 += 0x9e3779b9;
    // v0 is updated from v1 first; the v1 update below already sees the new v0.
    v0 += ((v1<<4)+0xa341316c)^(v1+s0)^((v1>>5)+0xc8013ea4);
    v1 += ((v0<<4)+0xad90777d)^(v0+s0)^((v0>>5)+0x7e95761e);
    // if(p)printf("Iteration %d: v0 = %u, v1 = %u, s0 = %u\n", n, v0, v1, s0);
  }

  // if(p)printf("over %d: v0 = %u, v1 = %u, s0 = %u\n", N,v0, v1, s0);
  return v0;
}

Calling `tea<16>(0, 1)`: the CPU result is 1892921073, but in the raygen program it returns 4118247010. Then I turned on printf, like this:

// Same computation as the listing with the printf calls commented out, but
// with the debug output active: mixes (val0, val1) through N rounds of the
// loop below and returns the resulting first word.
//
//   N    - number of rounds (compile-time template parameter).
//   val0 - initial value of the first state word, v0.
//   val1 - initial value of the second state word, v1.
//   p    - when true, prints the state before the loop, after every round,
//          and once more after the loop.
//
// Returns the final v0; v1 and s0 are only intermediate state.
// Everything is unsigned int, so the additions wrap around on overflow.
template<unsigned int N>
static __device__ __inline__ unsigned int tea( unsigned int val0, unsigned int val1,bool p=false )
{
  unsigned int v0 = val0;
  unsigned int v1 = val1;
  unsigned int s0 = 0;  // running sum, increased by a fixed constant each round

  // Prints the initial state. The number after "Iteration" on this line is the
  // round count N, not a round index.
  // NOTE(review): all arguments here are unsigned but are printed with %d;
  // %u would match their type (the later calls already use %u for v0/v1/s0).
  if(p)printf("Iteration %d: v0 = %d, v1 = %d, s0 = %d\n", N,v0, v1, s0);
  for( unsigned int n = 0; n < N; n++ )
  {
    s0 += 0x9e3779b9;
    // v0 is updated from v1 first; the v1 update below already sees the new v0.
    v0 += ((v1<<4)+0xa341316c)^(v1+s0)^((v1>>5)+0xc8013ea4);
    v1 += ((v0<<4)+0xad90777d)^(v0+s0)^((v0>>5)+0x7e95761e);
    // State after round n (n is zero-based).
    if(p)printf("Iteration %d: v0 = %u, v1 = %u, s0 = %u\n", n, v0, v1, s0);
  }

  // Final state; the v0 printed here is the value returned.
  if(p)printf("over %d: v0 = %u, v1 = %u, s0 = %u\n", N,v0, v1, s0);
  return v0;
}

Then things become different:

Iteration 16: v0 = 0, v1 = 1, s0 = 0
Iteration 0: v0 = 4118247010, v1 = 283647064, s0 = 2654435769
Iteration 1: v0 = 697629954, v1 = 1877591623, s0 = 1013904242
Iteration 2: v0 = 1247946618, v1 = 1715258152, s0 = 3668340011
Iteration 3: v0 = 1569133783, v1 = 941708474, s0 = 2027808484
Iteration 4: v0 = 19502569, v1 = 3735762688, s0 = 387276957
Iteration 5: v0 = 3525557839, v1 = 2890237912, s0 = 3041712726
Iteration 6: v0 = 660004408, v1 = 3560623501, s0 = 1401181199
Iteration 7: v0 = 353526449, v1 = 1535207764, s0 = 4055616968
Iteration 8: v0 = 2417009224, v1 = 1762521688, s0 = 2415085441
Iteration 9: v0 = 4167167376, v1 = 4236864917, s0 = 774553914
Iteration 10: v0 = 1698174388, v1 = 2914919014, s0 = 3428989683
Iteration 11: v0 = 257148189, v1 = 1374668584, s0 = 1788458156
Iteration 12: v0 = 1640083417, v1 = 1955196887, s0 = 147926629
Iteration 13: v0 = 2644043492, v1 = 3032958913, s0 = 2802362398
Iteration 14: v0 = 1974379242, v1 = 3852325290, s0 = 1161830871
Iteration 15: v0 = 1892921073, v1 = 1379537219, s0 = 3816266640
over 16: v0 = 1892921073, v1 = 1379537219, s0 = 3816266640

Now it’s the correct result. So why does this happen? I can’t understand… I am using OptiX 8 and the /fp:precise option.

I found a similar issue in another piece of my code:

// Sketch of the second symptom (not the literal code): the loop body runs for
// i = 0 and i = 1, so a should end up as 2; the poster reports seeing 0.
float a = 0;
for(i=0;i<2;i++){
   a+=1
}

It gives the wrong result, 0. But when I add some unrelated code:

// Same sketch with an extra accumulator b that nothing else depends on.
// The poster reports that with this addition a comes out correct.
float a = 0;
float b=0;
for(i=0;i<2;i++){
   a+=1
   b+=1;
}

The result of `a` is correct again. This was very annoying, and I had to keep using printf() to debug what went wrong. I strongly suspected that the compiler was generating incorrect code.

Hi @790989228,

This isn’t enough information to verify that something is wrong. Can you put together a complete and minimal reproducer and post the entire project somewhere, perhaps github?

The tea function is only integer math, so fp:precise won’t do anything here.

Is this code in your raygen program or something else?

Using printf might not be the best way to validate what’s wrong. Threads (and hence printf outputs from different threads) can execute in arbitrary order. Have you limited your GPU launch to 1 single thread?


David.

Also please try using a CUDA kernel with 1 thread instead of an OptiX kernel. This will allow you to verify whether OptiX is even involved in your issue, and it will allow you to use the CUDA debugger to step through your kernel instruction by instruction, follow the intermediate results, compare it to a CPU debugger session, and pinpoint whether and exactly where something is different between them.


David.

Yes, I can reproduce this result reliably with 1 thread. I tried to package it as a minimal reproducer, but I am not sure whether it will reproduce as reliably as it does now, because, as I said, when I add some unrelated code, the result becomes correct.

I was able to reproduce this problem in my project with minimal code, but not in another project (an OptiX sample):

// Minimal reproducer: a raygen entry point (per the __raygen__ name prefix)
// whose only work is a loop that should execute exactly once.
extern "C" __global__ void __raygen__ambient_probe_denoise()
{
	// i starts at 0 and the loop stops as soon as i == 1, so the expected
	// output is a single "i:0" line.
	for (int i = 0; i != 1; i++)
	{
		printf("i:%d\n",i);
	}
}

This is the C++ host code; I launch it with just 1 thread:

// Host-side launch of the raygen program (excerpt from a larger function).
// Presumably copies the host copy of the launch parameters to the device
// buffer passed to optixLaunch below - confirm against UpdateHostToDevice().
g_LaunchParams.UpdateHostToDevice();
    // Select the raygen record that belongs to the requested entry point.
    sbt.raygenRecord = m_RGRecordBuffer[EntryPointIndex].getCU();
    OPTIX_CHECK_LOG(optixLaunch( /*! pipeline we're launching: */
        pipeline,stream,
        /*! launch parameters (device pointer and size) and SBT */
        g_LaunchParams.d_param,
        sizeof(g_LaunchParams.data),
        &sbt,
        /*! dimensions of the launch: 1 x 1 x 1, i.e. a single launch index */
        1,
        1,
        1
    ));
    // sync - make sure the frame is rendered before we download and
    // display (obviously, for a high-performance application you
    // want to use streams and double-buffering, but for this simple
    // example, this will have to do)
    CUDA_SYNC_CHECK();

I ran it with a single thread, and it got stuck indefinitely in this loop. I used the same .cu code in another project and couldn’t reproduce it there.

i:0
i:2
i:2
i:2
i:2
i:2
i:2
i:2
i:2

This is the PTX code (the one that goes wrong):

	// .globl	__raygen__ambient_probe_denoise
// PTX for the reproducer entry point. The listing contains no label and no
// branch instruction: it is one straight-line sequence that makes a single
// vprintf call and then returns.
.visible .entry __raygen__ambient_probe_denoise()
{
	// 8-byte local buffer; used below to hold the printf argument.
	.local .align 8 .b8 	__local_depot24[8];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .b32 	%r<3>;
	.reg .b64 	%rd<5>;
	.loc	1 2744 0


	// %rd2 = local-space address of the buffer, %rd1 = its generic address.
	mov.u64 	%SPL, __local_depot24;
	cvta.local.u64 	%SP, %SPL;
	add.u64 	%rd1, %SP, 0;
	add.u64 	%rd2, %SPL, 0;
	.loc	1 2748 3
	// Store the constant 0 into the argument buffer.
	mov.u32 	%r1, 0;
	st.local.u32 	[%rd2], %r1;
	// %rd4 = generic address of $str$3 (presumably the "i:%d\n" literal).
	mov.u64 	%rd3, $str$3;
	cvta.global.u64 	%rd4, %rd3;
	// vprintf(%rd4, %rd1): first parameter is the string, second the buffer.
	{ // callseq 0, 0
	.reg .b32 temp_param_reg;
	.param .b64 param0;
	st.param.b64 	[param0+0], %rd4;
	.param .b64 param1;
	st.param.b64 	[param1+0], %rd1;
	.param .b32 retval0;
	call.uni (retval0), 
	vprintf, 
	(
	param0, 
	param1
	);
	// The return value is loaded into %r2 and not used afterwards.
	ld.param.b32 	%r2, [retval0+0];
	} // callseq 0
	.loc	1 2750 1
	ret;

}

And this is the PTX code from another project, which has no problem (this one is good):

// PTX for the entry point in the project that behaves correctly. The
// instruction sequence is one straight-line vprintf call followed by ret;
// the only labels are $L__func_* / $L__tmp* markers and there is no branch
// instruction.
.visible .entry __raygen__draw_solid_color()
{
	// 8-byte local buffer; used below to hold the printf argument.
	.local .align 8 .b8 	__local_depot0[8];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .b32 	%r<3>;
	.reg .b64 	%rd<5>;
	.loc	1 40 0
$L__func_begin0:
	.loc	1 40 0


	// %rd2 = local-space address of the buffer, %rd1 = its generic address.
	mov.u64 	%SPL, __local_depot0;
	cvta.local.u64 	%SP, %SPL;
	add.u64 	%rd1, %SP, 0;
	add.u64 	%rd2, %SPL, 0;
$L__tmp0:
	.loc	1 44 9
	// Store the constant 0 into the argument buffer.
	mov.u32 	%r1, 0;
	st.local.u32 	[%rd2], %r1;
	// %rd4 = generic address of $str (presumably the printf format literal).
	mov.u64 	%rd3, $str;
	cvta.global.u64 	%rd4, %rd3;
	// vprintf(%rd4, %rd1): first parameter is the string, second the buffer.
	{ // callseq 0, 0
	.reg .b32 temp_param_reg;
	.param .b64 param0;
	st.param.b64 	[param0+0], %rd4;
	.param .b64 param1;
	st.param.b64 	[param1+0], %rd1;
	.param .b32 retval0;
	call.uni (retval0), 
	vprintf, 
	(
	param0, 
	param1
	);
	// The return value is loaded into %r2 and not used afterwards.
	ld.param.b32 	%r2, [retval0+0];
	} // callseq 0
$L__tmp1:
	.loc	1 47 1
	ret;
$L__tmp2:
$L__func_end0:

}

Thanks for your help!

The previous error happened in the debug configuration. Today I built the release configuration, then rebuilt debug, and ran the debug program. Without any code modification, it returned to normal and was no longer trapped in an endless loop, but I was really close to a breakdown. Compared to the current OptiX 8, OptiX 6.5 was stable with no surprises. I can only pray that this kind of problem won’t happen again.

Have you tried using a CUDA kernel?


David.