Low Torch all-reduce performance under CUDA 12.8 compatibility (driver 535)

We have a setup with 8 L20 GPUs (driver 535) attached directly to 4th-gen Intel Xeon CPUs (SNC4 configuration, 8 NUMA nodes in total). Inside the NVIDIA CUDA container we see extremely low all-reduce performance; it looks as if the NCCL staging buffers are allocated only on sub-NUMA node 0. Note that nccl-tests does not seem to suffer from this issue. Reproduction steps below:

sudo docker run -it --name nv_cuda_2204 --gpus all --net=host --shm-size=64g --privileged -v /data00:/data00:rw nvidia/cuda:12.8.0-runtime-ubuntu22.04 bash 

sudo apt update
sudo apt install software-properties-common -y 
sudo add-apt-repository ppa:deadsnakes/ppa
sudo apt update
sudo apt install python3.11
sudo apt install python3.11-venv python3.11-dev python3-pip

python3.11 -m venv nv_cuda
source nv_cuda/bin/activate
pip install torch==2.7.0 --index-url https://download.pytorch.org/whl/cu128
torchrun --nproc_per_node=8 all_reduce.py --init=0

all_reduce.py

import os
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--init", type=int, default=None, help="init type")
parser.add_argument("--rank", type=int, default=None, help="Global rank of the process.")
parser.add_argument("--world-size", type=int, default=None, help="Total number of processes.")
parser.add_argument("--local-rank", type=int, default=None, help="Local rank on the node.")

parser.add_argument('--tensor_size', type=int, default=256*1024*1024,
                    help='Size of the tensor in number of float32 elements (default: 256M, i.e., 1GB)')

parser.add_argument('--iterations', type=int, default=20,
                    help='Number of iterations for performance measurement')
                    
parser.add_argument('--warmup', type=int, default=5,
                    help='Number of warmup iterations')

args = parser.parse_args()

# Pin this process to a single GPU before torch is imported. torchrun passes the
# local rank via the LOCAL_RANK env var (args.local_rank stays None), so only
# restrict CUDA_VISIBLE_DEVICES when --local-rank was given on the command line.
if args.local_rank is not None:
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.local_rank)
    # With only one GPU visible, the device index inside this process is 0.
    args.local_rank = 0

import torch
import torch.distributed as dist
import time



def init_process():
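    # torchrun path (--init=0): torchrun exports RANK, WORLD_SIZE and LOCAL_RANK
    # in the environment, so the 'env://' init method and the reads below just work.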
    dist.init_process_group(backend='nccl', init_method='env://')
    rank = int(os.environ['RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
    local_rank = int(os.environ['LOCAL_RANK'])
    
    print("rank {} world_size {} local_rank {}".format(rank,world_size,local_rank))

    torch.cuda.set_device(local_rank)
    
    return rank, world_size, local_rank

def init_process_with_input(args):
  
    rank = args.rank
    world_size = args.world_size
    local_rank = args.local_rank
    # Sanity check
    if rank is None or world_size is None or local_rank is None:
        raise ValueError("For manual launch, --rank, --world-size, and --local-rank must be provided.")
    # For the 'env://' init method, PyTorch REQUIRES these env vars to be set.
    # So, we set them manually from our args.
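    # Example manual launch on one 8-GPU node (one process per rank), e.g.:
    #   python all_reduce.py --init=1 --world-size=8 --rank=0 --local-rank=0
    #   ... and likewise for ranks 1..7.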
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500' # A default free port
    os.environ['RANK'] = str(rank)
    os.environ['WORLD_SIZE'] = str(world_size)
    os.environ['LOCAL_RANK'] = str(local_rank)
    
    dist.init_process_group(backend='nccl', init_method='env://')
     
    torch.cuda.set_device(local_rank)
    
    return rank, world_size, local_rank



def run_test(args):

    if args.init == 0:
        rank, world_size, local_rank = init_process()
    else:
        rank, world_size, local_rank = init_process_with_input(args)

    if rank == 0:
        print("="*40)
        print(f"PyTorch NCCL All-Reduce Test on {world_size} GPUs")
        print(f"Target Hardware: NVIDIA L20")
        print(f"PyTorch Version: {torch.__version__}")
        print(f"CUDA Version: {torch.version.cuda}")
        print(f"NCCL Version: {torch.cuda.nccl.version()}")
        print("="*40)
        
    tensor_size_elements = args.tensor_size
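    # float32 elements are 4 bytes each, so the default 256*1024*1024 elements
    # correspond to a 1 GiB tensor per GPU.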
    
    tensor_size_bytes = tensor_size_elements * 4
    tensor_size_gb = tensor_size_bytes / (1024**3)

    if rank == 0:
        print(f"Tensor Size (per GPU): {tensor_size_gb:.3f} GB")
        print(f"Iterations: {args.iterations}")
        print(f"Warmup Iterations: {args.warmup}")
        print("-" * 40)

    with torch.autograd.profiler.record_function("data_creation"):
        tensor = torch.ones(tensor_size_elements, device=f'cuda:{local_rank}', dtype=torch.float32)
    

    verify_tensor = tensor.clone()
    dist.all_reduce(verify_tensor, op=dist.ReduceOp.SUM)
    
    expected_value = float(world_size)
   
    if torch.allclose(verify_tensor, torch.full_like(verify_tensor, expected_value)):
        if rank == 0:
            print(f"[Rank {rank}] Verification PASSED. Tensor elements are all ~{expected_value}.")
    else:
       
        if rank == 0:
            print(f"[Rank {rank}] Verification FAILED! Expected {expected_value}, got {verify_tensor[0]}.")
        
        dist.destroy_process_group()
        return

   
    dist.barrier()
    
    if rank == 0:
        print("Starting warmup...")
    with torch.autograd.profiler.record_function("warm_up"):
        for _ in range(args.warmup):
            dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    
    
    torch.cuda.synchronize()
    
    if rank == 0:
        print("Warmup finished. Starting performance measurement...")
    
    
    dist.barrier()
    start_time = time.time()
    
   
    with torch.autograd.profiler.record_function("all_reduce_time_loop"):
        for _ in range(args.iterations):
            dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    
    torch.cuda.synchronize()
    
    end_time = time.time()
    total_time = end_time - start_time
    
    if rank == 0:
        avg_time_per_iter = total_time / args.iterations
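        # Bus bandwidth as defined by nccl-tests: in a ring all-reduce each byte
        # crosses the interconnect 2*(N-1)/N times, hence the scaling factor below.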
        
        bus_bw_gbps = (2 * (world_size - 1) / world_size * tensor_size_bytes) / avg_time_per_iter / 1e9
        
        print("-" * 40)
        print("Performance Results (from Rank 0):")
        print(f"Average time per All-Reduce: {avg_time_per_iter * 1000:.4f} ms")
        print(f"Estimated Bus Bandwidth: {bus_bw_gbps:.4f} GB/s")
        print("="*40)

    dist.destroy_process_group()


if __name__ == "__main__":
    
    run_test(args)


You will get a busbw of only around 2.4 GB/s (by the formula above, roughly 0.8 s per all-reduce of the 1 GiB tensor), with pcm-memory output as below:

NOTE: with a freshly pulled plain Ubuntu image (not the NVIDIA CUDA image) and the same torch version, you will get the performance shown below:

In the NVIDIA CUDA container we also tried numactl and verified the bindings, with exactly the same results. So the question is: what is wrong with the CUDA 12.8 compatibility layer on driver 535 that leaves the NCCL buffers seemingly restricted to sub-NUMA node 0? Thank you.
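For reference, one way to cross-check the placement from the OS side is a small tally of /proc/<pid>/numa_maps while the benchmark is looping. The sketch below is illustrative only (the file name and helper are ours): it sums the N<node>=<pages> counters for every mapping of the given PID, so it reports all of the process's resident host pages, not specifically NCCL's staging buffers.

numa_check.py

import re
import sys
from collections import Counter

def numa_page_counts(pid: int) -> Counter:
    # Sum resident page counts per NUMA node from /proc/<pid>/numa_maps.
    # Counts are in pages (different mappings may use different page sizes).
    counts = Counter()
    with open(f"/proc/{pid}/numa_maps") as f:
        for line in f:
            # Each mapping line carries fields like "N0=16384" = 16384 pages on node 0.
            for node, pages in re.findall(r"\bN(\d+)=(\d+)", line):
                counts[int(node)] += int(pages)
    return counts

if __name__ == "__main__":
    pid = int(sys.argv[1])  # PID of one of the all_reduce.py ranks
    counts = numa_page_counts(pid)
    total = sum(counts.values()) or 1
    for node in sorted(counts):
        print(f"NUMA node {node}: {counts[node]} pages ({100.0 * counts[node] / total:.1f}%)")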

Additional information for your reference:

Host OS/driver version. Note that both Debian 9 and Debian 10 hosts share the same issue; below is a Debian 9 machine:

root@dc05-p13-t300-n017:~# cat /etc/*release
PRETTY_NAME="Debian GNU/Linux 9 (stretch)"
NAME="Debian GNU/Linux"
VERSION_ID="9"
VERSION="9 (stretch)"
VERSION_CODENAME=stretch
ID=debian
HOME_URL="https://www.debian.org/"
SUPPORT_URL="https://www.debian.org/support"
BUG_REPORT_URL="https://bugs.debian.org/"
root@dc05-p13-t300-n017:~# nvidia-smi
Thu Aug 21 17:19:56 2025
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.08             Driver Version: 535.161.08   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+

NV CUDA container (nvidia-smi inside the container reports CUDA Version 12.8, which suggests the container's CUDA 12.8 forward-compatibility libraries are loaded on top of the host's CUDA 12.2 driver stack):

root@dc05-p13-t300-n017:~# docker exec -it nv_cuda_2204 bash
root@dc05-p13-t300-n017:/# cat /etc/*release
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=22.04
DISTRIB_CODENAME=jammy
DISTRIB_DESCRIPTION="Ubuntu 22.04.5 LTS"
PRETTY_NAME="Ubuntu 22.04.5 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.5 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=jammy
root@dc05-p13-t300-n017:/# nvidia-smi
Thu Aug 21 17:23:09 2025
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.08             Driver Version: 535.161.08   CUDA Version: 12.8     |
|-----------------------------------------+----------------------+----------------------+