Build current versions of UCX, Open MPI, and PyTorch with CUDA and distributed support.
Step 1: compile and install UCX with CUDA support.
sudo apt update
sudo apt install -y build-essential git pkg-config \
  autoconf automake libtool m4 \
  libnuma-dev hwloc libhwloc-dev

export CUDA_HOME="/usr/local/cuda"
export UCX_PREFIX="/opt/ucx-1.20.0"
export PATH="${CUDA_HOME}/bin:${UCX_PREFIX}/bin:$PATH"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${UCX_PREFIX}/lib:$LD_LIBRARY_PATH"

# -p makes this idempotent: a plain mkdir errors out if the prefix already
# exists from a previous run.
sudo mkdir -p "$UCX_PREFIX"

git clone https://github.com/openucx/ucx.git && cd ucx || exit 1

# Runtime UCX settings — not required for the build itself.
# NOTE(review): UCX_TLS="cuda" by itself disables every network transport at
# run time; for actual multi-node jobs you usually want something like
# UCX_TLS="tcp,cuda_copy,cuda_ipc". Confirm against your deployment.
export UCX_TLS="cuda"
export UCX_NET_DEVICES="eno1" # change to your preferred NIC.

# Jetson/L4T ships the NVIDIA driver libraries under the tegra directory, so
# the linker needs this extra search path.
export LDFLAGS="-L/usr/lib/aarch64-linux-gnu/tegra"

./autogen.sh
./configure --prefix="$UCX_PREFIX" \
  --with-cuda="/usr/local/cuda" \
  --enable-mt \
  --disable-assertions \
  --disable-debug \
  --disable-params-check
make -j"$(nproc)"   # use all cores instead of a hard-coded -j6
sudo make install

# Persist the UCX paths for future login shells.
sudo tee /etc/profile.d/ucx.sh >/dev/null <<'UCXEOF'
export PATH=/opt/ucx-1.20.0/bin:$PATH
export LD_LIBRARY_PATH=/opt/ucx-1.20.0/lib:$LD_LIBRARY_PATH
UCXEOF
source /etc/profile.d/ucx.sh
cd ..
Step 2: compile and install Open MPI with CUDA support.
wget https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.8.tar.gz
tar xfz openmpi-5.0.8.tar.gz
cd openmpi-5.0.8 || exit 1

export OMPI_PREFIX="/opt/openmpi-5.0.8"
# -p: no error if the prefix already exists (safe to re-run).
sudo mkdir -p "$OMPI_PREFIX"

# --with-cuda-libdir must point at the directory holding the real libcuda.so;
# on Jetson (JetPack 6.x) that is /usr/lib/aarch64-linux-gnu, not the CUDA
# toolkit's stubs directory. See the note at the bottom of this file.
./configure --prefix="$OMPI_PREFIX" \
  --with-cuda="$CUDA_HOME" \
  --with-ucx="$UCX_PREFIX" \
  --with-ucx-libdir="$UCX_PREFIX/lib" \
  --with-cuda-libdir=/usr/lib/aarch64-linux-gnu \
  --enable-mpirun-prefix-by-default
make -j"$(nproc)"   # use all cores instead of a hard-coded -j6
sudo make install

# Persist the Open MPI paths for future login shells.
sudo tee /etc/profile.d/openmpi.sh >/dev/null <<'EOF'
export PATH=/opt/openmpi-5.0.8/bin:$PATH
export LD_LIBRARY_PATH=/opt/openmpi-5.0.8/lib:$LD_LIBRARY_PATH
EOF
source /etc/profile.d/openmpi.sh
Step 3: build pytorch/pytorch with MPI and distributed support.
Add "-b release/2.8" after "git clone" below, or just use main,
which is currently 2.9.0a0 (as of today, 2.8 is not quite released).
git clone https://github.com/pytorch/pytorch
cd pytorch || exit 1
git submodule sync
git submodule update --init --recursive
pip install -r requirements.txt

# NVTX headers for the installed CUDA version. Adjust the package suffix
# (12-6, 12-8, 12-9, ...) to match `nvcc --version`; the original line used
# a placeholder that is not valid shell.
sudo apt install -y cuda-nvtx-12-9

export MAX_JOBS=6                   # cap parallel compile jobs (Jetson RAM limit)
export TORCH_CUDA_ARCH_LIST="8.7"   # Orin GPU architecture (sm_87)
export USE_CUDA=1
export USE_CUDNN=1
export USE_PRIORITIZED_TEXT_FOR_LD=1
# Double quotes so $LD_LIBRARY_PATH actually expands — the original used
# single quotes, which stored the literal string '$LD_LIBRARY_PATH' and
# discarded the UCX/Open MPI paths set earlier.
export LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH"
export USE_DISTRIBUTED=1
export USE_MPI=1
# A pkg-config search directory belongs in PKG_CONFIG_PATH; appending it to
# PATH (as the original did) has no effect on pkg-config lookups.
export PKG_CONFIG_PATH="/usr/lib/pkgconfig:${PKG_CONFIG_PATH:-}"
export USE_ROCM=0
python -m pip install --no-build-isolation -v .
To also build a wheel you can keep, run the following (it takes a couple of minutes):
# Build a reusable wheel into ./dist without re-resolving build deps.
python -m pip wheel -v --no-build-isolation --wheel-dir dist .
Your pip wheel can then be found at ./dist/torch*.whl
Step 4: test the torch + Open MPI build with a small all-reduce script:
# Minimal MPI all-reduce smoke test for the torch build. The pasted original
# had lost its Python indentation; restored here, plus a clean process-group
# shutdown so PyTorch does not warn about a leaked process group on exit.
cat > allreduce_mpi.py <<'EOF'
import torch
import torch.distributed as dist


def main():
    # Fall back to CPU so the script still runs where CUDA is unavailable.
    gpu_ok = torch.cuda.is_available()
    device = torch.device("cuda:0" if gpu_ok else "cpu")
    dtype = torch.float32  # <= keep it FP32

    dist.init_process_group("mpi")
    rank = dist.get_rank()
    world = dist.get_world_size()

    tensor = torch.ones(4, device=device, dtype=dtype) * rank
    dist.all_reduce(tensor)  # default reduce op is SUM

    # SUM across ranks 0..world-1 puts 0+1+...+(world-1) in every slot.
    expected = torch.ones(4, device=device, dtype=dtype) * sum(range(world))
    assert torch.allclose(tensor, expected), f"rank {rank}: got {tensor}, expected {expected}"
    print(f"[{rank}/{world}] All-reduce OK → {tensor[0].item()}")

    # Tear down cleanly so every rank exits without warnings.
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
EOF

mpirun -n 4 python allreduce_mpi.py
It took a couple of hours and several recompilations to finally discover that this
is what Open MPI 5.x.x wanted in order to enable CUDA:
--with-cuda-libdir=/usr/lib/aarch64-linux-gnu
Locations of libcuda.so on JetPack 6.2.1 with CUDA 12.9 (per dpkg):
dpkg -S $(find /usr -name libcuda.so)
cuda-driver-dev-12-9: /usr/local/cuda-12.9/targets/aarch64-linux/lib/stubs/libcuda.so
cuda-compat-12-9: /usr/local/cuda-12.9/compat/libcuda.so
nvidia-l4t-cuda: /usr/lib/aarch64-linux-gnu/nvidia/libcuda.so
nvidia-l4t-cuda: /usr/lib/aarch64-linux-gnu/libcuda.so