Below are two versions of a small part of my ray intersection code, which runs on CUDA 11.7.
The first one uses `atomicAdd` to increase the request counter. The second one skips the `atomicAdd` and would therefore run into a race condition, so in that version I simply removed the inner `if` check altogether and update the counter and request id unconditionally.
When running version 1 I encountered visual errors akin to missed intersections, or intersections where none should occur. These do not occur when running version 2. Even more puzzling to me is the fact that this branch (`nodeIndex < 0`) is never taken (in either version), yet it seems to create problems for me.
Version 1:
// Version 1: reads the node index for `id` from the cache. A negative index
// takes the request path (atomic counter bump, then an early return with a
// default-constructed result); otherwise the function returns Trace's result.
GpuTraceResult Intersect(...)
{
// Irrelevant code...
const int nodeIndex = cache[id];
// Negative index: take the request path instead of tracing.
if (nodeIndex < 0)
{
// atomicAdd returns the counter's previous value, so only the caller that
// observes 0 goes on to record its id.
if (atomicAdd(&cache->ObjectRequestCounter, 1U) == 0)
{
cache->ObjectRequestId = id;
}
// Early exit with a default-constructed result.
return GpuTraceResult();
}
// Some more irrelevant code...
GpuTraceResult result = Trace(...);
return result;
}
Version 2:
// Version 2: same lookup and early return as version 1, but the negative-index
// path updates the request counter and request id with plain, non-atomic writes.
GpuTraceResult Intersect(...)
{
// Irrelevant code...
const int nodeIndex = cache[id];
// Negative index: take the request path instead of tracing.
if (nodeIndex < 0)
{
// NOTE(review): plain read-modify-write followed by an unconditional store.
// Concurrent callers reaching this point can lose increments and overwrite
// each other's id -- this is the race condition the surrounding text mentions.
cache->ObjectRequestCounter++;
cache->ObjectRequestId = id;
// Early exit with a default-constructed result.
return GpuTraceResult();
}
// Some more irrelevant code...
GpuTraceResult result = Trace(...);
return result;
}
The ptx with source code interleaved:
Version 1:
// Version 1 PTX excerpt: cache lookup, sign test on nodeIndex, and the
// atomicAdd request path at $L__BB0_43.
// NOTE(review): the .loc directives up to the first branch say "inlined_at 3 99 35",
// while those after $L__BB0_43 say "inlined_at 3 98 35" -- the two halves of this
// excerpt may come from different builds; confirm before comparing line by line.
//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:25 const int nodeIndex = cache[id];
.loc 3 25 23, function_name $L__info_string16, inlined_at 3 99 35
// %r105 = 32-bit value loaded from [%rd15-8] (presumably `id` -- confirm);
// %rd52 = 64-bit pointer loaded from [%rd2+96].
ld.global.u32 %r105, [%rd15+-8];
ld.global.u64 %rd52, [%rd2+96];
// %rd54 = %rd52 + %r105 * 4: address of the 4-byte element selected by %r105.
mul.wide.u32 %rd53, %r105, 4;
add.s64 %rd54, %rd52, %rd53;
// %r214 = nodeIndex; ld without a state space is a generic-address load.
ld.u32 %r214, [%rd54];
//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:26 if (nodeIndex < 0)
.loc 3 26 3, function_name $L__info_string16, inlined_at 3 99 35
// %p24 = (nodeIndex < 0), signed comparison.
setp.lt.s32 %p24, %r214, 0;
//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:47 return result;
.loc 3 47 3, function_name $L__info_string16, inlined_at 3 99 35
// 0fBF800000 is the IEEE-754 single-precision bit pattern for -1.0f. In this
// version the constant is loaded before the branch on %p24, i.e. on both paths.
mov.f32 %f341, 0fBF800000;
//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:26 if (nodeIndex < 0)
.loc 3 26 3, function_name $L__info_string16, inlined_at 3 99 35
// Jump to the request path when nodeIndex is negative.
@%p24 bra $L__BB0_43;
//... skipped some parts here
$L__BB0_43:
//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:28 if (atomicAdd(&cache->ObjectRequestCounter, 1U) == 0)
.loc 3 28 8, function_name $L__info_string16, inlined_at 3 98 35
//C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\include\device_atomic_functions.hpp:82 return __uAtomicAdd(address, val);
.loc 2 82 10, function_name $L__info_string0, inlined_at 3 28 8
// Atomically add 1 to the 32-bit word at %rd2+120; %r186 receives the old value.
add.s64 %rd110, %rd2, 120;
atom.global.add.u32 %r186, [%rd110], 1;
//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:28 if (atomicAdd(&cache->ObjectRequestCounter, 1U) == 0)
.loc 3 28 8, function_name $L__info_string16, inlined_at 3 98 35
// %p58 = (old value != 0): when set, the store of the id below is skipped.
setp.ne.s32 %p58, %r186, 0;
// %r214 (which held nodeIndex) is overwritten with 0 before leaving this path.
mov.u32 %r214, 0;
@%p58 bra $L__BB0_45;
//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:30 cache->ObjectRequestId = id;
.loc 3 30 5, function_name $L__info_string16, inlined_at 3 98 35
// Reload the value at [%rd15-8] and store it to [%rd2+116].
ld.global.u32 %r188, [%rd15+-8];
st.global.u32 [%rd2+116], %r188;
// Unconditional branch to $L__BB0_45, marked uniform (.uni).
bra.uni $L__BB0_45;
Version 2:
// Version 2 PTX excerpt: cache lookup, sign test on nodeIndex, and the
// non-atomic request path at $L__BB0_43.
//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:26 const int nodeIndex = cache[id];
.loc 3 26 23, function_name $L__info_string16, inlined_at 3 98 35
// %r105 = 32-bit value loaded from [%rd15-8] (presumably `id` -- confirm);
// %rd52 = 64-bit pointer loaded from [%rd2+96].
ld.global.u32 %r105, [%rd15+-8];
ld.global.u64 %rd52, [%rd2+96];
// %rd54 = %rd52 + %r105 * 4: address of the 4-byte element selected by %r105.
mul.wide.u32 %rd53, %r105, 4;
add.s64 %rd54, %rd52, %rd53;
// %r214 = nodeIndex; ld without a state space is a generic-address load.
ld.u32 %r214, [%rd54];
//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:27 if (nodeIndex < 0)
.loc 3 27 3, function_name $L__info_string16, inlined_at 3 98 35
// %p24 = (nodeIndex < 0), signed comparison; jump to the request path when set.
setp.lt.s32 %p24, %r214, 0;
@%p24 bra $L__BB0_43;
//... skipped some parts here
$L__BB0_43:
//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:29 cache->ObjectRequestCounter++;
.loc 3 29 4, function_name $L__info_string16, inlined_at 3 98 35
// Plain load / add 1 / store on the 32-bit word at [%rd2+120]; not atomic.
ld.global.u32 %r186, [%rd2+120];
add.s32 %r187, %r186, 1;
st.global.u32 [%rd2+120], %r187;
//E:\Projects\RyneEngine\Cuda\BVHCacheTracing.h:30 cache->ObjectRequestId = id;
.loc 3 30 4, function_name $L__info_string16, inlined_at 3 98 35
// Reload the value at [%rd15-8] and store it to [%rd2+116], unconditionally.
ld.global.u32 %r188, [%rd15+-8];
st.global.u32 [%rd2+116], %r188;
// 0fBF800000 is the IEEE-754 single-precision bit pattern for -1.0f. Here the
// constant is loaded inside this block (register %f339), after the stores.
mov.f32 %f339, 0fBF800000;
// %r214 (which held nodeIndex) is overwritten with 0 before leaving this path.
mov.u32 %r214, 0;
// Unconditional branch to $L__BB0_44, marked uniform (.uni).
bra.uni $L__BB0_44;
I can’t seem to find the culprit by looking at the PTX files. Version 1 seems to set the default value for the GpuTraceResult (`mov.f32 %f341, 0fBF800000;`) before taking the branch, but apart from the parts shown above, both versions end up executing the same code (checked by diffing both files).
I’m hoping that someone could shed some light on this issue for me.
Command line used to generate ptx (this was modified from the VS cmd line to generate the final code):
nvcc.exe -ptx -src-in-ptx --generate-line-info -gencode=arch=compute_75,code=\"sm_75,compute_75\" -ccbin "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.32.31326\bin\HostX64\x64" -x cu -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\include" --keep-dir x64\DebugRelease -maxrregcount=0 --machine 64 -D_MBCS -Xcompiler "/EHsc /W1 /nologo /O2 /FS /Zi /MD" -o "RayTracingKernels.ptx" "RayTracingKernels.cu"