atomicAdd introduces errors even when not executed

Below are two versions of a small part of my ray-intersection code, running on CUDA 11.7. The first one uses atomicAdd to increment the request counter, whereas the second one skips the atomicAdd (accepting the resulting race condition), so I just removed the inner if check around it altogether.

When running version 1 I encountered visual errors such as missed intersections, or intersections where there should not be any. These do not occur when running version 2. Even more puzzling to me is the fact that the (nodeIndex < 0) branch is never executed in either version, yet it still seems to cause problems; a small debug check that confirms this is sketched after the two versions below.

Version 1:

GpuTraceResult Intersect(...)
{
    // Irrelevant code...
    const int nodeIndex = cache[id];
    if (nodeIndex < 0)
    {
        if (atomicAdd(&cache->ObjectRequestCounter, 1U) == 0)
        {
            cache->ObjectRequestId = id;
        }
        return GpuTraceResult();
    }
    // Some more irrelevant code...
    GpuTraceResult result = Trace(...);
    return result;
}

Version 2:

GpuTraceResult Intersect(...)
{
    // Irrelevant code...
    const int nodeIndex = cache[id];
    if (nodeIndex < 0)
    {
        cache->ObjectRequestCounter++;
        cache->ObjectRequestId = id;
        return GpuTraceResult();
    }
    // Some more irrelevant code...
    GpuTraceResult result = Trace(...);
    return result;
}
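
As an aside, the kind of check that confirms the (nodeIndex < 0) branch is never taken looks roughly like this (gDebugBranchTaken is purely illustrative and not part of my actual cache struct):

// Illustrative device-side counter, bumped whenever the branch is entered.
__device__ unsigned int gDebugBranchTaken = 0;

// Inside Intersect():
//     if (nodeIndex < 0)
//     {
//         atomicAdd(&gDebugBranchTaken, 1U);
//         ...
//     }

// Host side, after the kernel has finished:
unsigned int taken = 0;
cudaMemcpyFromSymbol(&taken, gDebugBranchTaken, sizeof(taken));
// taken reads back as 0, i.e. the branch is never entered.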

The PTX with the source code interleaved:

Version 1:

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:25 		const int nodeIndex = cache[id];
	.loc	3 25 23, function_name $L__info_string16, inlined_at 3 99 35
	ld.global.u32 	%r105, [%rd15+-8];
	ld.global.u64 	%rd52, [%rd2+96];
	mul.wide.u32 	%rd53, %r105, 4;
	add.s64 	%rd54, %rd52, %rd53;
	ld.u32 	%r214, [%rd54];

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:26 		if (nodeIndex < 0)
	.loc	3 26 3, function_name $L__info_string16, inlined_at 3 99 35
	setp.lt.s32 	%p24, %r214, 0;

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:47 		return result;
	.loc	3 47 3, function_name $L__info_string16, inlined_at 3 99 35
	mov.f32 	%f341, 0fBF800000;

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:26 		if (nodeIndex < 0)
	.loc	3 26 3, function_name $L__info_string16, inlined_at 3 99 35
	@%p24 bra 	$L__BB0_43;

//... skipped some parts here
$L__BB0_43:

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:28 			if (atomicAdd(&cache->ObjectRequestCounter, 1U) == 0)
	.loc	3 28 8, function_name $L__info_string16, inlined_at 3 98 35

//C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\include\device_atomic_functions.hpp:82   return __uAtomicAdd(address, val);
	.loc	2 82 10, function_name $L__info_string0, inlined_at 3 28 8
	add.s64 	%rd110, %rd2, 120;
	atom.global.add.u32 	%r186, [%rd110], 1;

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:28 			if (atomicAdd(&cache->ObjectRequestCounter, 1U) == 0)
	.loc	3 28 8, function_name $L__info_string16, inlined_at 3 98 35
	setp.ne.s32 	%p58, %r186, 0;
	mov.u32 	%r214, 0;
	@%p58 bra 	$L__BB0_45;


//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:30 				cache->ObjectRequestId = id;
	.loc	3 30 5, function_name $L__info_string16, inlined_at 3 98 35
	ld.global.u32 	%r188, [%rd15+-8];
	st.global.u32 	[%rd2+116], %r188;
	bra.uni 	$L__BB0_45;

Version 2:

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:26 		const int nodeIndex = cache[id];
	.loc	3 26 23, function_name $L__info_string16, inlined_at 3 98 35
	ld.global.u32 	%r105, [%rd15+-8];
	ld.global.u64 	%rd52, [%rd2+96];
	mul.wide.u32 	%rd53, %r105, 4;
	add.s64 	%rd54, %rd52, %rd53;
	ld.u32 	%r214, [%rd54];

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:27 		if (nodeIndex < 0)
	.loc	3 27 3, function_name $L__info_string16, inlined_at 3 98 35
	setp.lt.s32 	%p24, %r214, 0;
	@%p24 bra 	$L__BB0_43;

//... skipped some parts here
$L__BB0_43:

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:29 			cache->ObjectRequestCounter++;
	.loc	3 29 4, function_name $L__info_string16, inlined_at 3 98 35
	ld.global.u32 	%r186, [%rd2+120];
	add.s32 	%r187, %r186, 1;
	st.global.u32 	[%rd2+120], %r187;

//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:30 			cache->ObjectRequestId = id;
	.loc	3 30 4, function_name $L__info_string16, inlined_at 3 98 35
	ld.global.u32 	%r188, [%rd15+-8];
	st.global.u32 	[%rd2+116], %r188;
	mov.f32 	%f339, 0fBF800000;
	mov.u32 	%r214, 0;
	bra.uni 	$L__BB0_44;

I can’t seem to find the culprit from looking at the PTX files. Version 1 sets the default for the GpuTraceResult (mov.f32 %f341, 0fBF800000;, i.e. -1.0f) before taking the branch, but apart from the parts above both versions end up executing the same code (when diffing the two files).

I’m hoping that someone could shed some light on this issue for me.

Command line used to generate the PTX (modified from the Visual Studio command line that generates the final code):

nvcc.exe -ptx -src-in-ptx --generate-line-info -gencode=arch=compute_75,code=\"sm_75,compute_75\" -ccbin "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.32.31326\bin\HostX64\x64" -x cu -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\include" --keep-dir x64\DebugRelease -maxrregcount=0 --machine 64 -D_MBCS -Xcompiler "/EHsc /W1 /nologo /O2 /FS /Zi /MD" -o  "RayTracingKernels.ptx" "RayTracingKernels.cu"

I found the cause of the problem: in the code that calls these Intersect functions I had warp-aggregated atomic operations, as explained here: CUDA Pro Tip: Optimized Filtering with Warp-Aggregated Atomics | NVIDIA Technical Blog

Since I had moved to a Volta architecture (sm_70+), these were redundant, and I had also made an error by not realizing that some of the operations were not executed by the whole warp at the same time. After replacing them with a single atomicAdd (since the compiler now performs the warp aggregation itself), the issue described above went away completely.
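
For context, the warp-aggregated increment described in that blog post looks roughly like the sketch below (illustrative helper name, not my exact code); my mistake was that my hand-rolled equivalent assumed full-warp participation even though it was called from paths where only part of the warp was active.

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Sketch of the warp-aggregated increment from the blog post.
__device__ unsigned int warpAggregatedAdd(unsigned int* counter)
{
    // Group of lanes that are actually active at this call site.
    cg::coalesced_group g = cg::coalesced_threads();

    unsigned int base = 0;
    if (g.thread_rank() == 0)
    {
        // One atomic on behalf of all active lanes.
        base = atomicAdd(counter, g.size());
    }
    // Broadcast the leader's result and give each lane its own slot.
    return g.shfl(base, 0) + g.thread_rank();
}

// On sm_70+ the compiler performs this aggregation itself, so the fix was
// to drop the hand-rolled helper and simply do:
//     const unsigned int slot = atomicAdd(counter, 1U);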
