atomicAdd introduces errors even when not executed

Below are two versions of a small part of my ray-intersection code, running on CUDA 11.7. The first one uses atomicAdd to increment the request counter, whereas the second one skips the atomicAdd (accepting the resulting race condition), so I just removed the inner if check around it altogether.

When running version 1 I encountered visual errors such as missed intersections, or intersections where there should not be any. These do not occur when running version 2. Even more puzzling to me is the fact that the (nodeIndex < 0) branch is never executed in either version, yet it still seems to cause problems; a small debug check that confirms this is sketched after the two versions below.

Version 1:

GpuTraceResult Intersect(...)
{
    // Irrelevant code...
    const int nodeIndex = cache[id];
    if (nodeIndex < 0)
    {
        if (atomicAdd(&cache->ObjectRequestCounter, 1U) == 0)
        {
            cache->ObjectRequestId = id;
        }
        return GpuTraceResult();
    }
    // Some more irrelevant code...
    GpuTraceResult result = Trace(...);
    return result;
}

Version 2:

GpuTraceResult Intersect(...)
{
    // Irrelevant code...
    const int nodeIndex = cache[id];
    if (nodeIndex < 0)
    {
        cache->ObjectRequestCounter++;
        cache->ObjectRequestId = id;
        return GpuTraceResult();
    }
    // Some more irrelevant code...
    GpuTraceResult result = Trace(...);
    return result;
}
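
As an aside, the kind of check that confirms the (nodeIndex < 0) branch is never taken looks roughly like this (gDebugBranchTaken is purely illustrative and not part of my actual cache struct):

// Illustrative device-side counter, bumped whenever the branch is entered.
__device__ unsigned int gDebugBranchTaken = 0;

// Inside Intersect():
//     if (nodeIndex < 0)
//     {
//         atomicAdd(&gDebugBranchTaken, 1U);
//         ...
//     }

// Host side, after the kernel has finished:
unsigned int taken = 0;
cudaMemcpyFromSymbol(&taken, gDebugBranchTaken, sizeof(taken));
// taken reads back as 0, i.e. the branch is never entered.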

The PTX with the source code interleaved:

Version 1:

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:25 		const int nodeIndex = cache[id];
	.loc	3 25 23, function_name $L__info_string16, inlined_at 3 99 35
	ld.global.u32 	%r105, [%rd15+-8];
	ld.global.u64 	%rd52, [%rd2+96];
	mul.wide.u32 	%rd53, %r105, 4;
	add.s64 	%rd54, %rd52, %rd53;
	ld.u32 	%r214, [%rd54];

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:26 		if (nodeIndex < 0)
	.loc	3 26 3, function_name $L__info_string16, inlined_at 3 99 35
	setp.lt.s32 	%p24, %r214, 0;

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:47 		return result;
	.loc	3 47 3, function_name $L__info_string16, inlined_at 3 99 35
	mov.f32 	%f341, 0fBF800000;

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:26 		if (nodeIndex < 0)
	.loc	3 26 3, function_name $L__info_string16, inlined_at 3 99 35
	@%p24 bra 	$L__BB0_43;

//... skipped some parts here
$L__BB0_43:

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:28 			if (atomicAdd(&cache->ObjectRequestCounter, 1U) == 0)
	.loc	3 28 8, function_name $L__info_string16, inlined_at 3 98 35

//C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\include\device_atomic_functions.hpp:82   return __uAtomicAdd(address, val);
	.loc	2 82 10, function_name $L__info_string0, inlined_at 3 28 8
	add.s64 	%rd110, %rd2, 120;
	atom.global.add.u32 	%r186, [%rd110], 1;

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:28 			if (atomicAdd(&cache->ObjectRequestCounter, 1U) == 0)
	.loc	3 28 8, function_name $L__info_string16, inlined_at 3 98 35
	setp.ne.s32 	%p58, %r186, 0;
	mov.u32 	%r214, 0;
	@%p58 bra 	$L__BB0_45;


//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:30 				cache->ObjectRequestId = id;
	.loc	3 30 5, function_name $L__info_string16, inlined_at 3 98 35
	ld.global.u32 	%r188, [%rd15+-8];
	st.global.u32 	[%rd2+116], %r188;
	bra.uni 	$L__BB0_45;

Version 2:

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:26 		const int nodeIndex = cache[id];
	.loc	3 26 23, function_name $L__info_string16, inlined_at 3 98 35
	ld.global.u32 	%r105, [%rd15+-8];
	ld.global.u64 	%rd52, [%rd2+96];
	mul.wide.u32 	%rd53, %r105, 4;
	add.s64 	%rd54, %rd52, %rd53;
	ld.u32 	%r214, [%rd54];

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:27 		if (nodeIndex < 0)
	.loc	3 27 3, function_name $L__info_string16, inlined_at 3 98 35
	setp.lt.s32 	%p24, %r214, 0;
	@%p24 bra 	$L__BB0_43;

//... skipped some parts here
$L__BB0_43:

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:29 			cache->ObjectRequestCounter++;
	.loc	3 29 4, function_name $L__info_string16, inlined_at 3 98 35
	ld.global.u32 	%r186, [%rd2+120];
	add.s32 	%r187, %r186, 1;
	st.global.u32 	[%rd2+120], %r187;

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:30 			cache->ObjectRequestId = id;
	.loc	3 30 4, function_name $L__info_string16, inlined_at 3 98 35
	ld.global.u32 	%r188, [%rd15+-8];
	st.global.u32 	[%rd2+116], %r188;
	mov.f32 	%f339, 0fBF800000;
	mov.u32 	%r214, 0;
	bra.uni 	$L__BB0_44;

I can’t seem to find the culprit from looking at the PTX files. Version 1 sets the default for the GpuTraceResult (mov.f32 %f341, 0fBF800000;, i.e. -1.0f) before taking the branch, but apart from the parts above both versions end up executing the same code (when diffing the two files).

I’m hoping that someone could shed some light on this issue for me.

Command line used to generate the PTX (modified from the Visual Studio command line that generates the final code):

nvcc.exe -ptx -src-in-ptx --generate-line-info -gencode=arch=compute_75,code=\"sm_75,compute_75\" -ccbin "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.32.31326\bin\HostX64\x64" -x cu -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\include" --keep-dir x64\DebugRelease -maxrregcount=0 --machine 64 -D_MBCS -Xcompiler "/EHsc /W1 /nologo /O2 /FS /Zi /MD" -o  "RayTracingKernels.ptx" "RayTracingKernels.cu"

I found the cause of the problem: in the code that calls these Intersect functions I had warp-aggregated atomic operations, as explained here: CUDA Pro Tip: Optimized Filtering with Warp-Aggregated Atomics | NVIDIA Technical Blog

Since I had moved to a Volta architecture (sm_70+), these were redundant, and I had also made an error by not realizing that some of the operations were not executed by the whole warp at the same time. After replacing them with a single atomicAdd (since the compiler now performs the warp aggregation itself), the issue described above went away completely.
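
For context, the warp-aggregated increment described in that blog post looks roughly like the sketch below (illustrative helper name, not my exact code); my mistake was that my hand-rolled equivalent assumed full-warp participation even though it was called from paths where only part of the warp was active.

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Sketch of the warp-aggregated increment from the blog post.
__device__ unsigned int warpAggregatedAdd(unsigned int* counter)
{
    // Group of lanes that are actually active at this call site.
    cg::coalesced_group g = cg::coalesced_threads();

    unsigned int base = 0;
    if (g.thread_rank() == 0)
    {
        // One atomic on behalf of all active lanes.
        base = atomicAdd(counter, g.size());
    }
    // Broadcast the leader's result and give each lane its own slot.
    return g.shfl(base, 0) + g.thread_rank();
}

// On sm_70+ the compiler performs this aggregation itself, so the fix was
// to drop the hand-rolled helper and simply do:
//     const unsigned int slot = atomicAdd(counter, 1U);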
