Consider the following code:
#include <cuda_runtime_api.h>
template<class F>
void invoke(F&& f) {f();}
template<class F>
__global__ void invoke_kernel(F f){
f();
}
int main() {
int* ptr = nullptr;
cudaMalloc(&ptr, sizeof(int));
invoke_kernel<<<1,1>>>([=](){
invoke([&]() {
*myptr = 1;
});
});
}
Firstly, nvc++ -cuda -std=c++17
emits the following warning:
"nvcxx-test.cpp", line 15: warning: function "lambda []" captures local object "ptr" by reference, will likely cause an illegal memory access when run on the device
invoke([&]() {
However, I believe this to be a false positive as the original lambda captures by value, and the nested lambda should only capture the already captured-by-value copy by reference. This is also how other heterogeneous compilers (e.g. clang) behave.
To resolve the warning, I tried to force it to use the captured copy:
#include <cuda_runtime_api.h>
template<class F>
void invoke(F&& f) {f();}
template<class F>
__global__ void invoke_kernel(F f){
f();
}
int main() {
int* ptr = nullptr;
cudaMalloc(&ptr, sizeof(int));
invoke_kernel<<<1,1>>>([myptr=ptr](){
invoke([&]() {
*myptr = 1;
});
});
}
This however results in an ICE:
nvc++-Fatal-/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/compilers/bin/tools/cpp1 TERMINATED by signal 11
Arguments to /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/compilers/bin/tools/cpp1
/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4A__ -D__ABM__ -D__SSE4_2__ -D__AVX__ -D__AVX2__ -D__FMA__ -D__XSAVE__ -D__POPCNT__ -D__FXSR__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++17 -I- --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/compilers/include-stdpar --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/11.4/include --sys_include /usr/include/c++/11.1.0 --sys_include /usr/include/c++/11.1.0/x86_64-pc-linux-gnu --sys_include /usr/include/c++/11.1.0/backward --sys_include /usr/lib/gcc/x86_64-pc-linux-gnu/11.1.0/include --sys_include /usr/local/include --sys_include /usr/lib/gcc/x86_64-pc-linux-gnu/11.1.0/include-fixed --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -DCUDA_VERSION=11040 -DPGI_TESLA_TARGET -D__PGI_CUDA_ARCH__=600 -D__NVCOMPILER_CUDA_ARCH__=600 -D_CUDA -DCUDA_VERSION=11040 --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110100 -D__pgnu_vsn=110100 --cuda -D__CUDACC__ -D_NVHPC_CUDA_CPP -D__NV_NO_HOST_COMPILER_CHECK --preinclude _cuda_preinclude.h --cudacap=60 --cudacap=61 --cudacap=70 --cudacap=75 --cudacap=80 --cudacap=86 -D_NVHPC_RDC -q -o /tmp/nvc++YKPqw0iv4Tlf.il nvcxx-test.cpp
A similar pattern using nested lambdas in my larger production code also results in an ICE. There I get
"/usr/include/c++/11.1.0/array", line 51: internal error: assertion failed: process_fill_in: specified fill-in not found (edg_error.cpp, line 3593 in process_fill_in)
typedef _Tp _Type[_Nm];
^
1 catastrophic error detected in the compilation of "extensions.cpp".
Compilation aborted.
nvc++-Fatal-/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/compilers/bin/tools/cpp1 TERMINATED by signal 6
Arguments to /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/compilers/bin/tools/cpp1
/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4A__ -D__ABM__ -D__SSE4_2__ -D__AVX__ -D__AVX2__ -D__FMA__ -D__XSAVE__ -D__POPCNT__ -D__FXSR__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++17 -I- -I/usr/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/compilers/include-stdpar --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/11.4/include --sys_include /usr/include/c++/11.1.0 --sys_include /usr/include/c++/11.1.0/x86_64-pc-linux-gnu --sys_include /usr/include/c++/11.1.0/backward --sys_include /usr/lib/gcc/x86_64-pc-linux-gnu/11.1.0/include --sys_include /usr/local/include --sys_include /usr/lib/gcc/x86_64-pc-linux-gnu/11.1.0/include-fixed --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -DCUDA_VERSION=11040 -DPGI_TESLA_TARGET -D__HIPSYCL__ -D__HIPSYCL_ENABLE_OMPHOST_TARGET__ -D__PGI_CUDA_ARCH__=610 -D__NVCOMPILER_CUDA_ARCH__=610 -D_CUDA -DCUDA_VERSION=11040 -D__HIPSYCL_ENABLE_CUDA_TARGET__ -U__FLOAT128__ -U__SIZEOF_FLOAT128__ -DBOOST_ALL_NO_LIB -DBOOST_UNIT_TEST_FRAMEWORK_DYN_LINK -DHIPSYCL_DEBUG_LEVEL=2 --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110100 -D__pgnu_vsn=110100 --mp -D_OPENMP=202011 -DCUDA_VERSION=11040 --cuda -D__CUDACC__ -D_NVHPC_CUDA_CPP -D__NV_NO_HOST_COMPILER_CHECK --preinclude _cuda_preinclude.h --cudacap=61 -g --dwarf2 --dependencies_target_to_stdout CMakeFiles/extensions.cpp.o -D_NVHPC_RDC --dependencies_to_file CMakeFiles/extensions.cpp.o.d -q -o /tmp/nvc++64VoUrCqqB3p.il extensions.cpp
However, I could not yet find a minimal reproducer for this particular error message. If I replace nested capture-by-reference lambdas with capture-by-value the ICE disappears.
Are nested lambdas in kernels not yet supported, or am I looking at a bug?