Hey, I have been working on cross compiling OpenCV 4.5 (currently 4.5.0 for compatibility reasons, but the goal is to upgrade to the latest 4.5.x and then to 4.6).
I have also posted this question on the OpenCV forum.
Questions
- Is it possible to statically link libcudart while cross compiling? (see the check after this list)
- Is it possible to statically link libcublas while cross compiling?
- Does it make sense to statically link these libraries, or should I use only dynamic linking and load CUDA from a proper ARM64 device?
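From what I can tell, the CUDA 10.2 toolkit ships static variants of both libraries under the aarch64 target directory, so the question is mostly whether OpenCV's build can be pointed at them. A quick check in the build container (assuming the toolkit layout from the NVIDIA base image) should list them:
# List the static CUDA archives shipped for the aarch64 target.
# libcudart_static.a and libcublas_static.a should appear if static
# linking is an option at all; libculibos.a is the companion archive
# that the static cuBLAS requires.
ls /usr/local/cuda-10.2/targets/aarch64-linux/lib/libcudart_static.a \
   /usr/local/cuda-10.2/targets/aarch64-linux/lib/libcublas_static.a \
   /usr/local/cuda-10.2/targets/aarch64-linux/lib/libculibos.a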
Description
I am using the following versions:
CUDA: 10.2
OpenCV: 4.5.0
cmake: 3.18
Ubuntu: 18.04
I have managed to cross compile it successfully. However, at the moment, if I want to use the generated binaries, I have to have CUDA available in my docker container; otherwise I end up with:
/My_Test: error while loading shared libraries: libcudart.so.10.2: cannot open shared object file: No such file or directory
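This is expected for a dynamically linked binary: libcudart.so.10.2 is recorded as a NEEDED entry in the ELF dynamic section, so the loader refuses to start without it. It can be confirmed with:
# Show the shared libraries the binary declares as hard dependencies.
readelf -d ./My_Test | grep NEEDED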
Steps Taken
- I have built a docker image using a base image of FROM nvidia/cuda:10.2-devel-ubuntu1804.
- I have followed this example from NVIDIA: TensorRT/ubuntu-cross-aarch64.Dockerfile at main · NVIDIA/TensorRT · GitHub
- I used GitHub - dockcross/dockcross: Cross compiling toolchains in Docker images to set up the basic requirements for cross compiling OpenCV.
- I am using the following cmake command to build OpenCV:
cmake .. \
-DCMAKE_TOOLCHAIN_FILE=../cmake_aarch64.toolchain \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=/output \
-DBUILD_SHARED_LIBS=OFF \
-DBUILD_ZLIB=ON \
-DBUILD_PNG=ON \
-DWITH_CUDA=ON \
-DWITH_OPENEXR=OFF \
-DWITH_WEBP=OFF \
-DWITH_OPENCL=OFF \
-DWITH_1394=OFF \
-DWITH_GTK=ON \
-DWITH_FFMPEG=ON \
-DCUDA_INC_PATH=/usr/local/cuda-${DESIRED_CUDA_VERSION}/targets/aarch64-linux/include \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda \
-DCMAKE_FIND_ROOT_PATH=/opencv/opencv/cmake \
-DOPENCV_EXTRA_MODULES_PATH=/opencv/opencv/opencv_contrib-${OPENCV_VERSION}/modules \
-DBUILD_TIFF=ON \
-DBUILD_TBB=ON \
-DWITH_LAPACK=OFF \
-DBUILD_NEW_PYTHON_SUPPORT=ON \
-DBUILD_JPEG=ON \
-DBUILD_JASPER=ON \
-DBUILD_EXAMPLES=OFF \
-DBUILD_JAVA=OFF \
-DBUILD_opencv_python2=ON \
-DBUILD_opencv_python3=ON \
-DCUDA_NVCC_FLAGS="-D_FORCE_INLINES" \
-DENABLE_NEON=ON \
-DWITH_OPENMP=OFF \
-DWITH_GSTREAMER=OFF \
-DWITH_GSTREAMER_0_10=OFF \
-DWITH_VTK=OFF \
-DWITH_TBB=ON \
-DCUDA_ARCH_BIN=7.2 \
-DCUDA_ARCH_PTX="" \
-DINSTALL_C_EXAMPLES=ON \
-DBUILD_PERF_TESTS=OFF \
-DBUILD_TESTS=OFF \
-DINSTALL_TESTS=OFF \
-DOPENCV_ENABLE_NONFREE=ON \
-DBUILD_opencv_xobjdetect=OFF
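If static linking is feasible, my understanding is that it would go through FindCUDA's static-runtime switch, i.e. something like the additions below. This is untested on my side, and I do not know whether OpenCV 4.5's CUDA build honors these when cross compiling; the library path is an assumption based on the toolkit layout:
# Hypothetical extra flags for the cmake command above (a sketch, not verified):
-DCUDA_USE_STATIC_CUDA_RUNTIME=ON \
-DCUDA_cublas_LIBRARY=/usr/local/cuda-10.2/targets/aarch64-linux/lib/libcublas_static.a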
cat ../cmake_aarch64.toolchain
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR aarch64)
set(TRT_PLATFORM_ID "aarch64")
set(CUDA_PLATFORM_ID "aarch64-linux")
set(CMAKE_CROSSCOMPILING TRUE)
set(CMAKE_C_COMPILER_TARGET aarch64)
set(CMAKE_CXX_COMPILER_TARGET aarch64)
set(CMAKE_C_COMPILER /usr/xcc/aarch64-unknown-linux-gnueabi/bin/aarch64-unknown-linux-gnueabi-gcc)
set(CMAKE_CXX_COMPILER /usr/xcc/aarch64-unknown-linux-gnueabi/bin/aarch64-unknown-linux-gnueabi-g++)
set(DISABLE_SWIG TRUE)
set(CUDA_LIB_PATH /usr/local/cuda-$ENV{CUDA_VERSION}/targets/aarch64-linux/)
set(CUDA_BIN_PATH /usr/local/cuda/)
set(CUDA_TARGET_TRIPLET aarch64-linux)
set(CUDA_PATH /usr/local/cuda-$ENV{CUDA_VERSION}/targets/aarch64-linux/)
set(ADDITIONAL_PLATFORM_LIB_FLAGS -L${CUDA_LIB_PATH} -lcublas -lcudart -lstdc++ -lm)
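My naive attempt at static linking on the toolchain side would be to swap the shared libraries in ADDITIONAL_PLATFORM_LIB_FLAGS for their static archives. As far as I know, the static runtime additionally needs rt, pthread and dl, and the static cuBLAS needs culibos; treat this as an unverified sketch:
# Unverified sketch: link the static CUDA archives instead of the shared libs.
set(ADDITIONAL_PLATFORM_LIB_FLAGS -L${CUDA_LIB_PATH}
    -lcublas_static -lculibos            # static cuBLAS plus its companion lib
    -lcudart_static -lrt -lpthread -ldl  # static runtime and its system deps
    -lstdc++ -lm)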
Running My_Test in the same container where I built OpenCV and my own My_Test binary works as expected.
When I run My_Test in an nvcr.io/nvidia/l4t-base:r32.4.3 docker container with CUDA, it also works as expected.
Only when I run My_Test on a machine without CUDA does it fail, because of the missing libcudart.so.10.2 mentioned above:
ldd ./My_Test
libGCBase_gcc_v3_1_Basler_pylon.so => /usr/local/pylon/lib/libGCBase_gcc_v3_1_Basler_pylon.so (0x00000055022a3000)
libpylonbase-6.2.0.so => /usr/local/pylon/lib/libpylonbase-6.2.0.so (0x00000055022d0000)
libGenApi_gcc_v3_1_Basler_pylon.so => /usr/local/pylon/lib/libGenApi_gcc_v3_1_Basler_pylon.so (0x0000005503042000)
libpylonutility-6.2.0.so => /usr/local/pylon/lib/libpylonutility-6.2.0.so (0x00000055033a3000)
librt.so.1 => /lib/aarch64-linux-gnu/librt.so.1 (0x0000005503599000)
libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x00000055035b0000)
libdl.so.2 => /lib/aarch64-linux-gnu/libdl.so.2 (0x00000055035de000)
libcudart.so.10.2 => not found
libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x00000055035f3000)
libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x00000055036ac000)
/lib/ld-linux-aarch64.so.1 (0x0000005500000000)
libstdc++.so.6 => /usr/lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000005503805000)
libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x000000550399b000)
libLog_gcc_v3_1_Basler_pylon.so => /usr/local/pylon/lib/libLog_gcc_v3_1_Basler_pylon.so (0x00000055039bf000)
libMathParser_gcc_v3_1_Basler_pylon.so => /usr/local/pylon/lib/libMathParser_gcc_v3_1_Basler_pylon.so (0x00000055039da000)
libXmlParser_gcc_v3_1_Basler_pylon.so => /usr/local/pylon/lib/libXmlParser_gcc_v3_1_Basler_pylon.so (0x00000055039f5000)
libNodeMapData_gcc_v3_1_Basler_pylon.so => /usr/local/pylon/lib/libNodeMapData_gcc_v3_1_Basler_pylon.so (0x0000005503b30000)
Also the output of nm -e ./My_Test | grep -i cuda:
...
U __cudaPopCallConfiguration@@libcudart.so.10.2
U __cudaPushCallConfiguration@@libcudart.so.10.2
U __cudaRegisterFatBinary@@libcudart.so.10.2
U __cudaRegisterFatBinaryEnd@@libcudart.so.10.2
U __cudaRegisterFunction@@libcudart.so.10.2
U __cudaRegisterVar@@libcudart.so.10.2
U __cudaUnregisterFatBinary@@libcudart.so.10.2
U cudaDeviceReset@@libcudart.so.10.2
U cudaDeviceSynchronize@@libcudart.so.10.2
U cudaDriverGetVersion@@libcudart.so.10.2
U cudaEventCreateWithFlags@@libcudart.so.10.2
U cudaEventDestroy@@libcudart.so.10.2
U cudaEventElapsedTime@@libcudart.so.10.2
U cudaEventQuery@@libcudart.so.10.2
U cudaEventRecord@@libcudart.so.10.2
U cudaEventSynchronize@@libcudart.so.10.2
U cudaFree@@libcudart.so.10.2
U cudaFreeHost@@libcudart.so.10.2
U cudaGetDevice@@libcudart.so.10.2
U cudaGetDeviceCount@@libcudart.so.10.2
U cudaGetDeviceProperties@@libcudart.so.10.2
U cudaGetErrorString@@libcudart.so.10.2
U cudaGetLastError@@libcudart.so.10.2
U cudaHostAlloc@@libcudart.so.10.2
U cudaHostGetDevicePointer@@libcudart.so.10.2
U cudaHostRegister@@libcudart.so.10.2
U cudaHostUnregister@@libcudart.so.10.2
U cudaLaunchKernel@@libcudart.so.10.2
U cudaMalloc@@libcudart.so.10.2
U cudaMallocPitch@@libcudart.so.10.2
U cudaMemGetInfo@@libcudart.so.10.2
U cudaMemcpy2D@@libcudart.so.10.2
U cudaMemcpy2DAsync@@libcudart.so.10.2
U cudaMemset2D@@libcudart.so.10.2
U cudaMemset2DAsync@@libcudart.so.10.2
U cudaRuntimeGetVersion@@libcudart.so.10.2
U cudaSetDevice@@libcudart.so.10.2
U cudaStreamAddCallback@@libcudart.so.10.2
U cudaStreamCreate@@libcudart.so.10.2
U cudaStreamDestroy@@libcudart.so.10.2
U cudaStreamQuery@@libcudart.so.10.2
U cudaStreamSynchronize@@libcudart.so.10.2
U cudaStreamWaitEvent@@libcudart.so.10.2
While I cannot show the content of My_Test, it basically does not make any use of CUDA at the moment (it will in the future), but it does use OpenCV built with CUDA enabled, and therefore it fails.
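In the meantime, the obvious stopgap seems to be to ship libcudart.so.10.2 next to the binary on the non-CUDA machine and point the loader at it (the /opt/mytest path below is just a placeholder):
# Copy the aarch64 runtime library from the build container next to the binary,
# then tell the dynamic loader where to look for it.
cp /usr/local/cuda-10.2/targets/aarch64-linux/lib/libcudart.so.10.2 /opt/mytest/
LD_LIBRARY_PATH=/opt/mytest ./My_Test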