Low Torch all-reduce performance under CUDA 12.8 compatibility (driver 535)

We have a setup with 8 L20 GPUs (driver 535) attached directly to 4th-gen Intel Xeon CPUs (SNC4 configuration, 8 NUMA nodes in total). Inside the NVIDIA CUDA container we see extremely low all-reduce performance; it looks as if the NCCL staging buffers are allocated only on sub-NUMA node 0. Note that nccl-tests does not seem to suffer from this issue. Reproduction steps below:

sudo docker run -it --name nv_cuda_2204 --gpus all --net=host --shm-size=64g --privileged -v /data00:/data00:rw nvidia/cuda:12.8.0-runtime-ubuntu22.04 bash 

sudo apt update
sudo apt install software-properties-common -y 
sudo add-apt-repository ppa:deadsnakes/ppa
sudo apt update
sudo apt install python3.11
sudo apt install python3.11-venv python3.11-dev python3-pip

python3.11 -m venv nv_cuda
source nv_cuda/bin/activate
pip install torch==2.7.0 --index-url https://download.pytorch.org/whl/cu128
torchrun --nproc_per_node=8 all_reduce.py --init=0

all_reduce.py

import os
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--init", type=int, default=None, help="init type")
parser.add_argument("--rank", type=int, default=None, help="Global rank of the process.")
parser.add_argument("--world-size", type=int, default=None, help="Total number of processes.")
parser.add_argument("--local-rank", type=int, default=None, help="Local rank on the node.")

parser.add_argument('--tensor_size', type=int, default=256*1024*1024,
                    help='Size of the tensor in number of float32 elements (default: 256M, i.e., 1GB)')

parser.add_argument('--iterations', type=int, default=20,
                    help='Number of iterations for performance measurement')
                    
parser.add_argument('--warmup', type=int, default=5,
                    help='Number of warmup iterations')

args = parser.parse_args()

# Pin this process to a single GPU before torch is imported. torchrun passes the
# local rank via the LOCAL_RANK env var (args.local_rank stays None), so only
# restrict CUDA_VISIBLE_DEVICES when --local-rank was given on the command line.
if args.local_rank is not None:
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.local_rank)
    # With only one GPU visible, the device index inside this process is 0.
    args.local_rank = 0

import torch
import torch.distributed as dist
import time



def init_process():
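    # torchrun path (--init=0): torchrun exports RANK, WORLD_SIZE and LOCAL_RANK
    # in the environment, so the 'env://' init method and the reads below just work.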
    dist.init_process_group(backend='nccl', init_method='env://')
    rank = int(os.environ['RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
    local_rank = int(os.environ['LOCAL_RANK'])
    
    print("rank {} world_size {} local_rank {}".format(rank,world_size,local_rank))

    torch.cuda.set_device(local_rank)
    
    return rank, world_size, local_rank

def init_process_with_input(args):
  
    rank = args.rank
    world_size = args.world_size
    local_rank = args.local_rank
    # Sanity check
    if rank is None or world_size is None or local_rank is None:
        raise ValueError("For manual launch, --rank, --world-size, and --local-rank must be provided.")
    # For the 'env://' init method, PyTorch REQUIRES these env vars to be set.
    # So, we set them manually from our args.
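    # Example manual launch on one 8-GPU node (one process per rank), e.g.:
    #   python all_reduce.py --init=1 --world-size=8 --rank=0 --local-rank=0
    #   ... and likewise for ranks 1..7.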
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500' # A default free port
    os.environ['RANK'] = str(rank)
    os.environ['WORLD_SIZE'] = str(world_size)
    os.environ['LOCAL_RANK'] = str(local_rank)
    
    dist.init_process_group(backend='nccl', init_method='env://')
     
    torch.cuda.set_device(local_rank)
    
    return rank, world_size, local_rank



def run_test(args):

    if args.init == 0:
        rank, world_size, local_rank = init_process()
    else:
        rank, world_size, local_rank = init_process_with_input(args)

    if rank == 0:
        print("="*40)
        print(f"PyTorch NCCL All-Reduce Test on {world_size} GPUs")
        print(f"Target Hardware: NVIDIA L20")
        print(f"PyTorch Version: {torch.__version__}")
        print(f"CUDA Version: {torch.version.cuda}")
        print(f"NCCL Version: {torch.cuda.nccl.version()}")
        print("="*40)
        
    tensor_size_elements = args.tensor_size
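    # float32 elements are 4 bytes each, so the default 256*1024*1024 elements
    # correspond to a 1 GiB tensor per GPU.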
    
    tensor_size_bytes = tensor_size_elements * 4
    tensor_size_gb = tensor_size_bytes / (1024**3)

    if rank == 0:
        print(f"Tensor Size (per GPU): {tensor_size_gb:.3f} GB")
        print(f"Iterations: {args.iterations}")
        print(f"Warmup Iterations: {args.warmup}")
        print("-" * 40)

    with torch.autograd.profiler.record_function("data_creation"):
        tensor = torch.ones(tensor_size_elements, device=f'cuda:{local_rank}', dtype=torch.float32)
    

    verify_tensor = tensor.clone()
    dist.all_reduce(verify_tensor, op=dist.ReduceOp.SUM)
    
    expected_value = float(world_size)
   
    if torch.allclose(verify_tensor, torch.full_like(verify_tensor, expected_value)):
        if rank == 0:
            print(f"[Rank {rank}] Verification PASSED. Tensor elements are all ~{expected_value}.")
    else:
       
        if rank == 0:
            print(f"[Rank {rank}] Verification FAILED! Expected {expected_value}, got {verify_tensor[0]}.")
        
        dist.destroy_process_group()
        return

   
    dist.barrier()
    
    if rank == 0:
        print("Starting warmup...")
    with torch.autograd.profiler.record_function("warm_up"):
        for _ in range(args.warmup):
            dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    
    
    torch.cuda.synchronize()
    
    if rank == 0:
        print("Warmup finished. Starting performance measurement...")
    
    
    dist.barrier()
    start_time = time.time()
    
   
    with torch.autograd.profiler.record_function("all_reduce_time_loop"):
        for _ in range(args.iterations):
            dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    
    torch.cuda.synchronize()
    
    end_time = time.time()
    total_time = end_time - start_time
    
    if rank == 0:
        avg_time_per_iter = total_time / args.iterations
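        # Bus bandwidth as defined by nccl-tests: in a ring all-reduce each byte
        # crosses the interconnect 2*(N-1)/N times, hence the scaling factor below.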
        
        bus_bw_gbps = (2 * (world_size - 1) / world_size * tensor_size_bytes) / avg_time_per_iter / 1e9
        
        print("-" * 40)
        print("Performance Results (from Rank 0):")
        print(f"Average time per All-Reduce: {avg_time_per_iter * 1000:.4f} ms")
        print(f"Estimated Bus Bandwidth: {bus_bw_gbps:.4f} GB/s")
        print("="*40)

    dist.destroy_process_group()


if __name__ == "__main__":
    
    run_test(args)


You will get a busbw of only around 2.4 GB/s (by the formula above, roughly 0.8 s per all-reduce of the 1 GiB tensor), with pcm-memory output as below:

NOTE: with a freshly pulled plain Ubuntu image (not the NVIDIA CUDA image) and the same torch version, you will get the performance shown below:

In the NVIDIA CUDA container we also tried numactl and verified the bindings, with exactly the same results. So the question is: what is wrong with the CUDA 12.8 compatibility layer on driver 535 that leaves the NCCL buffers seemingly restricted to sub-NUMA node 0? Thank you.
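For reference, one way to cross-check the placement from the OS side is a small tally of /proc/<pid>/numa_maps while the benchmark is looping. The sketch below is illustrative only (the file name and helper are ours): it sums the N<node>=<pages> counters for every mapping of the given PID, so it reports all of the process's resident host pages, not specifically NCCL's staging buffers.

numa_check.py

import re
import sys
from collections import Counter

def numa_page_counts(pid: int) -> Counter:
    # Sum resident page counts per NUMA node from /proc/<pid>/numa_maps.
    # Counts are in pages (different mappings may use different page sizes).
    counts = Counter()
    with open(f"/proc/{pid}/numa_maps") as f:
        for line in f:
            # Each mapping line carries fields like "N0=16384" = 16384 pages on node 0.
            for node, pages in re.findall(r"\bN(\d+)=(\d+)", line):
                counts[int(node)] += int(pages)
    return counts

if __name__ == "__main__":
    pid = int(sys.argv[1])  # PID of one of the all_reduce.py ranks
    counts = numa_page_counts(pid)
    total = sum(counts.values()) or 1
    for node in sorted(counts):
        print(f"NUMA node {node}: {counts[node]} pages ({100.0 * counts[node] / total:.1f}%)")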

Additional information for your reference:

Host OS/driver version. Note that both Debian 9 and Debian 10 hosts share the same issue; below is a Debian 9 machine:

root@dc05-p13-t300-n017:~# cat /etc/*release
PRETTY_NAME="Debian GNU/Linux 9 (stretch)"
NAME="Debian GNU/Linux"
VERSION_ID="9"
VERSION="9 (stretch)"
VERSION_CODENAME=stretch
ID=debian
HOME_URL="https://www.debian.org/"
SUPPORT_URL="https://www.debian.org/support"
BUG_REPORT_URL="https://bugs.debian.org/"
root@dc05-p13-t300-n017:~# nvidia-smi
Thu Aug 21 17:19:56 2025
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.08             Driver Version: 535.161.08   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+

NV CUDA container (nvidia-smi inside the container reports CUDA Version 12.8, which suggests the container's CUDA 12.8 forward-compatibility libraries are loaded on top of the host's CUDA 12.2 driver stack):

root@dc05-p13-t300-n017:~# docker exec -it nv_cuda_2204 bash
root@dc05-p13-t300-n017:/# cat /etc/*release
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=22.04
DISTRIB_CODENAME=jammy
DISTRIB_DESCRIPTION="Ubuntu 22.04.5 LTS"
PRETTY_NAME="Ubuntu 22.04.5 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.5 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=jammy
root@dc05-p13-t300-n017:/# nvidia-smi
Thu Aug 21 17:23:09 2025
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.08             Driver Version: 535.161.08   CUDA Version: 12.8     |
|-----------------------------------------+----------------------+----------------------+