Hi,
i am trying to compile a library (bitsandbytes GitHub - TimDettmers/bitsandbytes: 8-bit CUDA functions for PyTorch) on Jetson Xavier (ARM) with nvcc (CUDA 11.4.315) but it fails, not finding some builtin functions:
..
/usr/lib/gcc/aarch64-linux-gnu/9/include/arm_neon.h(72): error: identifier "__fp16" is undefined
/usr/lib/gcc/aarch64-linux-gnu/9/include/arm_neon.h(795): error: identifier "__builtin_aarch64_saddlv8qi" is undefined
/usr/lib/gcc/aarch64-linux-gnu/9/include/arm_neon.h(802): error: identifier "__builtin_aarch64_saddlv4hi" is undefined
..
It seems like nvcc does not recognize the builtin arm functions defined in arm_neon.h
, BUT gcc (9.4.0) does.
Here is a minimal example which illustrates this beautifully:
TestNeon.cu:
#include <arm_neon.h>
int main () { }
NVCC of TestNeon.cu (fail):
nvcc -v TestNeon.cu
#$ _NVVM_BRANCH_=nvvm
#$ _SPACE_=
#$ _CUDART_=cudart
#$ _HERE_=/usr/local/cuda-11.4/bin
#$ _THERE_=/usr/local/cuda-11.4/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_DIR_=targets/aarch64-linux
#$ TOP=/usr/local/cuda-11.4/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda-11.4/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda-11.4/bin/../lib:/usr/local/cuda-11.4/lib64:
#$ PATH=/usr/local/cuda-11.4/bin/../nvvm/bin:/usr/local/cuda-11.4/bin:/home/g/.local/bin:/usr/local/cuda-11.4/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
#$ INCLUDES="-I/usr/local/cuda-11.4/bin/../targets/aarch64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib/stubs" "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -D__CUDA_ARCH__=520 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ "-I/usr/local/cuda-11.4/bin/../targets/aarch64-linux/include" -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=4 -D__CUDACC_VER_BUILD__=315 -D__CUDA_API_VER_MAJOR__=11 -D__CUDA_API_VER_MINOR__=4 -include "cuda_runtime.h" "TestNeon.cu" -o "/tmp/tmpxft_00001e9c_00000000-9_TestNeon.cpp1.ii"
#$ cicc --c++14 --gnu_version=90400 --orig_src_file_name "TestNeon.cu" --allow_managed --unsigned_chars --unsigned_wchar_t -arch compute_52 -m64 --no-version-ident -ftz=0 -prec_div=1 -prec_sqrt=1 -fmad=1 --include_file_name "tmpxft_00001e9c_00000000-3_TestNeon.fatbin.c" -tused --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00001e9c_00000000-4_TestNeon.module_id" --gen_c_file_name "/tmp/tmpxft_00001e9c_00000000-6_TestNeon.cudafe1.c" --stub_file_name "/tmp/tmpxft_00001e9c_00000000-6_TestNeon.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00001e9c_00000000-6_TestNeon.cudafe1.gpu" "/tmp/tmpxft_00001e9c_00000000-9_TestNeon.cpp1.ii" -o "/tmp/tmpxft_00001e9c_00000000-6_TestNeon.ptx"
/usr/lib/gcc/aarch64-linux-gnu/9/include/arm_neon.h(38): error: identifier "__Int8x8_t" is undefined
/usr/lib/gcc/aarch64-linux-gnu/9/include/arm_neon.h(39): error: identifier "__Int16x4_t" is undefined
/usr/lib/gcc/aarch64-linux-gnu/9/include/arm_neon.h(40): error: identifier "__Int32x2_t" is undefined
/usr/lib/gcc/aarch64-linux-gnu/9/include/arm_neon.h(41): error: identifier "__Int64x1_t" is undefined
... (full output in attachment below)
nvcc_fail_full.txt (13.3 KB)
NVCC of TestNeon.cpp (WORKS):
If I rename TestNeon.cu
to TestNeon.cpp
, it successfully compiles without complaint:
nvcc -v TestNeon.cpp
#$ _NVVM_BRANCH_=nvvm
#$ _SPACE_=
#$ _CUDART_=cudart
#$ _HERE_=/usr/local/cuda-11.4/bin
#$ _THERE_=/usr/local/cuda-11.4/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_DIR_=targets/aarch64-linux
#$ TOP=/usr/local/cuda-11.4/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda-11.4/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda-11.4/bin/../lib:/usr/local/cuda-11.4/lib64:
#$ PATH=/usr/local/cuda-11.4/bin/../nvvm/bin:/usr/local/cuda-11.4/bin:/home/g/.local/bin:/usr/local/cuda-11.4/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
#$ INCLUDES="-I/usr/local/cuda-11.4/bin/../targets/aarch64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib/stubs" "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -c -x c++ -D__NVCC__ "-I/usr/local/cuda-11.4/bin/../targets/aarch64-linux/include" -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=4 -D__CUDACC_VER_BUILD__=315 -D__CUDA_API_VER_MAJOR__=11 -D__CUDA_API_VER_MINOR__=4 "TestNeon.cpp" -o "/tmp/tmpxft_00001ea4_00000000-5_TestNeon.o"
#$ nvlink -m64 --arch=sm_52 --register-link-binaries="/tmp/tmpxft_00001ea4_00000000-3_a_dlink.reg.c" "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib/stubs" "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib" -cpu-arch=AARCH64 "/tmp/tmpxft_00001ea4_00000000-5_TestNeon.o" -lcudadevrt -o "/tmp/tmpxft_00001ea4_00000000-6_a_dlink.sm_52.cubin"
#$ fatbinary -64 -no-asm -link "--image3=kind=elf,sm=52,file=/tmp/tmpxft_00001ea4_00000000-6_a_dlink.sm_52.cubin" --embedded-fatbin="/tmp/tmpxft_00001ea4_00000000-4_a_dlink.fatbin.c"
#$ rm /tmp/tmpxft_00001ea4_00000000-4_a_dlink.fatbin
#$ gcc -c -x c++ -DFATBINFILE="\"/tmp/tmpxft_00001ea4_00000000-4_a_dlink.fatbin.c\"" -DREGISTERLINKBINARYFILE="\"/tmp/tmpxft_00001ea4_00000000-3_a_dlink.reg.c\"" -I. -D__NV_EXTRA_INITIALIZATION= -D__NV_EXTRA_FINALIZATION= -D__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ "-I/usr/local/cuda-11.4/bin/../targets/aarch64-linux/include" -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=4 -D__CUDACC_VER_BUILD__=315 -D__CUDA_API_VER_MAJOR__=11 -D__CUDA_API_VER_MINOR__=4 "/usr/local/cuda-11.4/bin/crt/link.stub" -o "/tmp/tmpxft_00001ea4_00000000-7_a_dlink.o"
#$ g++ -Wl,--start-group "/tmp/tmpxft_00001ea4_00000000-7_a_dlink.o" "/tmp/tmpxft_00001ea4_00000000-5_TestNeon.o" "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib/stubs" "-L/usr/local/cuda-11.4/bin/../targets/aarch64-linux/lib" -lcudadevrt -lcudart_static -lrt -lpthread -ldl -Wl,--end-group -o "a.out"
Is this behaviour expected?
Is it somehow possible to make nvcc recognize arm functions in code which also uses CUDA routines, ie. a .cu
-file?
Note: the bitsandbytes library does not use arm_neon.h
by default, it’s a modification to make it build on Jetson Xavier.
My Setup:
gcc --version
gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Sun_Oct_23_22:16:07_PDT_2022
Cuda compilation tools, release 11.4, V11.4.315
Build cuda_11.4.r11.4/compiler.31964100_0
Linux ubuntu 5.10.104-tegra #1 SMP PREEMPT Tue Jan 24 15:09:44 PST 2023 aarch64 aarch64 aarch64 GNU/Linux