NVCC fails to build with ARM neon instructions (.cpp vs. cu)

Hi,

i am trying to compile a library (bitsandbytes GitHub - TimDettmers/bitsandbytes: 8-bit CUDA functions for PyTorch) on Jetson Xavier (ARM) with nvcc (CUDA 11.4.315) but it fails, not finding some builtin functions:

..
/usr/lib/gcc/aarch64-linux-gnu/9/include/arm_neon.h(72): error: identifier "__fp16" is undefined
/usr/lib/gcc/aarch64-linux-gnu/9/include/arm_neon.h(795): error: identifier "__builtin_aarch64_saddlv8qi" is undefined
/usr/lib/gcc/aarch64-linux-gnu/9/include/arm_neon.h(802): error: identifier "__builtin_aarch64_saddlv4hi" is undefined
..

It seems like nvcc does not recognize the builtin arm functions defined in arm_neon.h, BUT gcc (9.4.0) does.

Here is a minimal example which illustrates this beautifully:

TestNeon.cu:
#include <arm_neon.h>

int main () { }

NVCC of TestNeon.cu (fail):

nvcc -v TestNeon.cu
#$ _NVVM_BRANCH_=nvvm
#$ _SPACE_= 
#$ _CUDART_=cudart
#$ _HERE_=/usr/local/cuda-11.4/bin
#$ _THERE_=/usr/local/cuda-11.4/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_DIR_=targets/aarch64-linux
#$ TOP=/usr/local/cuda-11.4/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda-11.4/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda-11.4/bin/../lib:/usr/local/cuda-11.4/lib64:
#$ PATH=/usr/local/cuda-11.4/bin/../nvvm/bin:/usr/local/cuda-11.4/bin:/home/g/.local/bin:/usr/local/cuda-11.4/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
#$ INCLUDES="-I/usr/local/cuda-11.4/bin/../targets/aarch64-linux/include"  
#$ LIBRARIES=  "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib/stubs" "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -D__CUDA_ARCH__=520 -E -x c++  -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__  "-I/usr/local/cuda-11.4/bin/../targets/aarch64-linux/include"    -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=4 -D__CUDACC_VER_BUILD__=315 -D__CUDA_API_VER_MAJOR__=11 -D__CUDA_API_VER_MINOR__=4 -include "cuda_runtime.h" "TestNeon.cu" -o "/tmp/tmpxft_00001e9c_00000000-9_TestNeon.cpp1.ii" 
#$ cicc --c++14 --gnu_version=90400 --orig_src_file_name "TestNeon.cu" --allow_managed --unsigned_chars --unsigned_wchar_t   -arch compute_52 -m64 --no-version-ident -ftz=0 -prec_div=1 -prec_sqrt=1 -fmad=1 --include_file_name "tmpxft_00001e9c_00000000-3_TestNeon.fatbin.c" -tused --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00001e9c_00000000-4_TestNeon.module_id" --gen_c_file_name "/tmp/tmpxft_00001e9c_00000000-6_TestNeon.cudafe1.c" --stub_file_name "/tmp/tmpxft_00001e9c_00000000-6_TestNeon.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00001e9c_00000000-6_TestNeon.cudafe1.gpu"  "/tmp/tmpxft_00001e9c_00000000-9_TestNeon.cpp1.ii" -o "/tmp/tmpxft_00001e9c_00000000-6_TestNeon.ptx"
/usr/lib/gcc/aarch64-linux-gnu/9/include/arm_neon.h(38): error: identifier "__Int8x8_t" is undefined
/usr/lib/gcc/aarch64-linux-gnu/9/include/arm_neon.h(39): error: identifier "__Int16x4_t" is undefined
/usr/lib/gcc/aarch64-linux-gnu/9/include/arm_neon.h(40): error: identifier "__Int32x2_t" is undefined
/usr/lib/gcc/aarch64-linux-gnu/9/include/arm_neon.h(41): error: identifier "__Int64x1_t" is undefined
... (full output in attachment below)

nvcc_fail_full.txt (13.3 KB)

NVCC of TestNeon.cpp (WORKS):

If I rename TestNeon.cu to TestNeon.cpp, it successfully compiles without complaint:

nvcc -v TestNeon.cpp
#$ _NVVM_BRANCH_=nvvm
#$ _SPACE_= 
#$ _CUDART_=cudart
#$ _HERE_=/usr/local/cuda-11.4/bin
#$ _THERE_=/usr/local/cuda-11.4/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_DIR_=targets/aarch64-linux
#$ TOP=/usr/local/cuda-11.4/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda-11.4/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda-11.4/bin/../lib:/usr/local/cuda-11.4/lib64:
#$ PATH=/usr/local/cuda-11.4/bin/../nvvm/bin:/usr/local/cuda-11.4/bin:/home/g/.local/bin:/usr/local/cuda-11.4/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
#$ INCLUDES="-I/usr/local/cuda-11.4/bin/../targets/aarch64-linux/include"  
#$ LIBRARIES=  "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib/stubs" "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -c -x c++ -D__NVCC__  "-I/usr/local/cuda-11.4/bin/../targets/aarch64-linux/include"    -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=4 -D__CUDACC_VER_BUILD__=315 -D__CUDA_API_VER_MAJOR__=11 -D__CUDA_API_VER_MINOR__=4 "TestNeon.cpp" -o "/tmp/tmpxft_00001ea4_00000000-5_TestNeon.o" 
#$ nvlink -m64 --arch=sm_52 --register-link-binaries="/tmp/tmpxft_00001ea4_00000000-3_a_dlink.reg.c"    "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib/stubs" "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib" -cpu-arch=AARCH64 "/tmp/tmpxft_00001ea4_00000000-5_TestNeon.o"  -lcudadevrt  -o "/tmp/tmpxft_00001ea4_00000000-6_a_dlink.sm_52.cubin"
#$ fatbinary -64 -no-asm -link "--image3=kind=elf,sm=52,file=/tmp/tmpxft_00001ea4_00000000-6_a_dlink.sm_52.cubin" --embedded-fatbin="/tmp/tmpxft_00001ea4_00000000-4_a_dlink.fatbin.c" 
#$ rm /tmp/tmpxft_00001ea4_00000000-4_a_dlink.fatbin
#$ gcc -c -x c++ -DFATBINFILE="\"/tmp/tmpxft_00001ea4_00000000-4_a_dlink.fatbin.c\"" -DREGISTERLINKBINARYFILE="\"/tmp/tmpxft_00001ea4_00000000-3_a_dlink.reg.c\"" -I. -D__NV_EXTRA_INITIALIZATION= -D__NV_EXTRA_FINALIZATION= -D__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__  "-I/usr/local/cuda-11.4/bin/../targets/aarch64-linux/include"    -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=4 -D__CUDACC_VER_BUILD__=315 -D__CUDA_API_VER_MAJOR__=11 -D__CUDA_API_VER_MINOR__=4 "/usr/local/cuda-11.4/bin/crt/link.stub" -o "/tmp/tmpxft_00001ea4_00000000-7_a_dlink.o" 
#$ g++ -Wl,--start-group "/tmp/tmpxft_00001ea4_00000000-7_a_dlink.o" "/tmp/tmpxft_00001ea4_00000000-5_TestNeon.o"   "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib/stubs" "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib"  -lcudadevrt  -lcudart_static  -lrt -lpthread  -ldl  -Wl,--end-group -o "a.out" 

Is this behaviour expected?
Is it somehow possible to make nvcc recognize arm functions in code which also uses CUDA routines, ie. a .cu-file?

Note: the bitsandbytes library does not use arm_neon.h by default, it’s a modification to make it build on Jetson Xavier.

My Setup:

gcc --version
gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Sun_Oct_23_22:16:07_PDT_2022
Cuda compilation tools, release 11.4, V11.4.315
Build cuda_11.4.r11.4/compiler.31964100_0

Linux ubuntu 5.10.104-tegra #1 SMP PREEMPT Tue Jan 24 15:09:44 PST 2023 aarch64 aarch64 aarch64 GNU/Linux

Hi gerald.stampfel,
NVCC doesn’t support built-ins defined in “arm_neon.h” so far , we are working on improving our support for the ARM platform.

Encountered similar issue using SIMDe to port code with x86 intrinsics to ARM.

Still fails with CUDA 12.3

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Wed_Nov_22_11:03:34_PST_2023
Cuda compilation tools, release 12.3, V12.3.107
Build cuda_12.3.r12.3/compiler.33567101_0