Per the blog:

Meanwhile, the run-time complexity of the cudaEnablePeerAccess API is roughly O(N * lg(N)), where N is the number of allocations made on the source device that need to be mapped to the destination device. Often this is called for each device pair to enable full bidirectional peer access, for a total of O(D * D * N * lg(N)), where D is the number of devices. Also, as mentioned earlier, cudaMalloc must now map its allocations to all devices with peer access enabled. This means that the runtime complexity now scales as O(D * lg(N)).
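For reference, the pairwise pattern the blog is talking about would look roughly like the sketch below. This is only my understanding, expressed through ctypes with the runtime API cudaDeviceEnablePeerAccess (which is what the blog's cudaEnablePeerAccess refers to); enable_full_peer_access is just an illustrative helper name:

import ctypes

libcudart = ctypes.CDLL("libcudart.so")
libcudart.cudaSetDevice.argtypes = [ctypes.c_int]
libcudart.cudaDeviceCanAccessPeer.argtypes = [ctypes.POINTER(ctypes.c_int), ctypes.c_int, ctypes.c_int]
libcudart.cudaDeviceEnablePeerAccess.argtypes = [ctypes.c_int, ctypes.c_uint]

def enable_full_peer_access(num_devices):
    # one cudaDeviceEnablePeerAccess call per ordered device pair, i.e. D * (D - 1)
    # calls in total, each of which the blog says costs O(N * lg(N))
    for src in range(num_devices):
        assert libcudart.cudaSetDevice(src) == 0
        for dst in range(num_devices):
            if dst == src:
                continue
            can = ctypes.c_int(0)
            assert libcudart.cudaDeviceCanAccessPeer(ctypes.byref(can), src, dst) == 0
            if can.value:
                # the flags argument is reserved and must be 0; this lets the
                # current device (src) access allocations made on dst
                assert libcudart.cudaDeviceEnablePeerAccess(dst, 0) == 0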
This means that if I enable P2P access between 2 GPUs, cudaMalloc should afterwards be roughly twice as slow, because each new allocation has to be mapped on both devices. I tried to verify this claim: if it is true, I need to adjust how I share memory between processes. This is what I do:
import os
import ctypes
import time
import torch
import torch.distributed as dist
# Load CUDA runtime library
libcudart = ctypes.CDLL("libcudart.so")
# Define function prototypes in ctypes
cudaMalloc = libcudart.cudaMalloc
cudaMalloc.restype = ctypes.c_int
cudaMalloc.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]
cudaFree = libcudart.cudaFree
cudaFree.restype = ctypes.c_int
cudaFree.argtypes = [ctypes.c_void_p]
class CudaIpcMemHandle(ctypes.Structure):
    # cudaIpcMemHandle_t is an opaque 64-byte struct; 128 bytes is a safe over-allocation
    _fields_ = [("reserved", ctypes.c_byte * 128)]
cudaIpcMemLazyEnablePeerAccess = 1
cudaIpcGetMemHandle = libcudart.cudaIpcGetMemHandle
cudaIpcGetMemHandle.restype = ctypes.c_int
cudaIpcGetMemHandle.argtypes = [ctypes.POINTER(CudaIpcMemHandle), ctypes.c_void_p]
cudaIpcOpenMemHandle = libcudart.cudaIpcOpenMemHandle
cudaIpcOpenMemHandle.restype = ctypes.c_int
cudaIpcOpenMemHandle.argtypes = [ctypes.POINTER(ctypes.c_void_p), CudaIpcMemHandle, ctypes.c_uint]
cudaMemcpy = libcudart.cudaMemcpy
cudaMemcpy.restype = ctypes.c_int
cudaMemcpy.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int]
cudaDeviceSynchronize = libcudart.cudaDeviceSynchronize
cudaDeviceSynchronize.restype = ctypes.c_int
# Helper functions for CUDA memory management
def alloc(size):
    ptr = ctypes.c_void_p()
    result = cudaMalloc(ctypes.byref(ptr), size)
    if result != 0:
        raise Exception("cudaMalloc failed")
    return ptr

def free(ptr):
    result = cudaFree(ptr)
    if result != 0:
        raise Exception("cudaFree failed")

def synchronize():
    cudaDeviceSynchronize()
def worker(ipc=False):
    dist.init_process_group(backend="gloo")
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    torch.cuda.set_device(rank)

    # warmup device
    size_in_bytes = 1024 * 1024
    n_elements = size_in_bytes // 4
    ptr = alloc(size_in_bytes)

    if ipc:
        # share this rank's buffer and open every other rank's buffer via CUDA IPC,
        # which lazily enables peer access between the devices
        handle = CudaIpcMemHandle()
        assert cudaIpcGetMemHandle(ctypes.byref(handle), ptr) == 0
        ptrs = []
        for i in range(world_size):
            if i == rank:
                ptrs.append(ptr)
                dist.broadcast_object_list([handle], src=i)
            else:
                recv = [None]
                dist.broadcast_object_list(recv, src=i)
                recv_handle = recv[0]
                recv_ptr = ctypes.c_void_p()
                assert cudaIpcOpenMemHandle(ctypes.byref(recv_ptr), recv_handle, cudaIpcMemLazyEnablePeerAccess) == 0
                ptrs.append(recv_ptr)

    synchronize()
    dist.barrier()

    # time 2000 cudaMalloc calls of 1 MiB each
    start = time.time()
    data = []
    for i in range(2000):
        data.append(alloc(size_in_bytes))
    synchronize()
    end = time.time()
    elapsed = end - start
    print(f"time for cudaMalloc: {elapsed}")

    dist.destroy_process_group()

if __name__ == "__main__":
    ipc = bool(int(os.getenv("IPC", "0")))
    print(f"ipc: {ipc}")
    worker(ipc=ipc)
Basically, I use PyTorch (with the gloo backend) to broadcast the IPC handle between processes, and call the CUDA runtime APIs directly through ctypes.
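As a side note, the script declares cudaMemcpy but never calls it; a quick sanity check that the IPC-opened pointers are actually usable could be dropped in right after the handle exchange, something like the sketch below (check_peer_copy is just an illustrative name, reusing the definitions above; cudaMemcpyDeviceToDevice is 3 in the runtime's cudaMemcpyKind enum):

cudaMemcpyDeviceToDevice = 3

def check_peer_copy(ptrs, rank, size_in_bytes):
    # copy this rank's buffer into every other rank's buffer through the
    # IPC-mapped pointers; a non-zero return code means the mapping is not usable
    for i, remote_ptr in enumerate(ptrs):
        if i == rank:
            continue
        result = cudaMemcpy(remote_ptr, ptrs[rank], size_in_bytes, cudaMemcpyDeviceToDevice)
        assert result == 0, f"cudaMemcpy to rank {i} failed with error {result}"
    synchronize()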
Run it with torchrun --nproc-per-node 2 test.py, with either export IPC=0 or export IPC=1. The results are:
ipc: False
ipc: False
time for cudaMalloc: 0.3364677429199219
time for cudaMalloc: 0.336867094039917
ipc: True
ipc: True
time for cudaMalloc: 0.33277273178100586
time for cudaMalloc: 0.33387041091918945
This means that after I turn on P2P access through IPC, cudaMalloc is roughly the same speed, not twice as slow.
I’m running the program on a DGX V100 machine.
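Doing the arithmetic on the 2-GPU numbers above:

# per-call cudaMalloc latency from the totals above (2000 calls each)
for label, total in [("IPC=0", 0.3364677429199219), ("IPC=1", 0.33277273178100586)]:
    print(f"{label}: {total / 2000 * 1e6:.0f} us per cudaMalloc")
# both come out around 165-170 us per call, so there is no visible doubling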
I even ran the program with 4 GPUs (torchrun --nproc-per-node 4 test.py):
ipc: False
ipc: False
ipc: False
ipc: False
time for cudaMalloc: 0.5533695220947266
time for cudaMalloc: 0.6247894763946533
time for cudaMalloc: 0.6472084522247314
time for cudaMalloc: 0.6500265598297119
ipc: True
ipc: True
ipc: True
ipc: True
time for cudaMalloc: 0.5264256000518799
time for cudaMalloc: 0.5990443229675293
time for cudaMalloc: 0.6208875179290771
time for cudaMalloc: 0.6369361877441406
Still roughly the same, not four times slower.
It seems enabling P2P access does not hurt cudaMalloc at all. Is this the general case, or is it just a special case for my machine?
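One thing I have not checked yet is whether the lazy IPC open actually registers as peer access being enabled from the runtime's point of view. A rough, untested sketch I could add to worker right after the handle exchange (reusing libcudart from above; I don't know yet what it will report):

cudaDeviceEnablePeerAccess = libcudart.cudaDeviceEnablePeerAccess
cudaDeviceEnablePeerAccess.restype = ctypes.c_int
cudaDeviceEnablePeerAccess.argtypes = [ctypes.c_int, ctypes.c_uint]

cudaGetErrorString = libcudart.cudaGetErrorString
cudaGetErrorString.restype = ctypes.c_char_p
cudaGetErrorString.argtypes = [ctypes.c_int]

def probe_peer_access(rank, world_size):
    # try to enable peer access explicitly after cudaIpcOpenMemHandle;
    # "peer access is already enabled" would suggest the lazy IPC path
    # turned it on, while success would mean it had not been enabled yet
    for peer in range(world_size):
        if peer == rank:
            continue
        err = cudaDeviceEnablePeerAccess(peer, 0)
        print(f"rank {rank} -> peer {peer}: {cudaGetErrorString(err).decode()}")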