Fast memcpy micro-benchmarking: CUDA-Python wrapper multi-GPU seg fault

Main reference: George Hotz's AMD GPU memcpy benchmark using the HIP Python wrapper.

The goal of this micro-benchmark is a convenient, quick-to-prototype script that uses the CUDA Python wrapper to measure memory bandwidth (GB/s) through plain allocations (no tensors or n-d arrays). The script covers three scenarios: CPU, single GPU, and multi-GPU. The CPU and single-GPU paths work, but I cannot get the multi-GPU path to run: it dies with a segmentation fault, presumably because I am not rigorous enough in the low-level manipulation of raw device pointers. I cannot properly isolate the underlying cause, but I suspect that the way I handle devices and contexts is not appropriate.
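
My working suspicion is that each device needs its own driver context made current before any call that touches that device's memory or streams. A minimal sketch of the pattern I have in mind (untested; ctxs and num_devices are placeholder names, checkCudaErrors is the helper from the cuda-python examples):

ctxs = []
for dev_id in range(num_devices):
    dev = checkCudaErrors(cuda.cuDeviceGet(dev_id))
    ctxs.append(checkCudaErrors(cuda.cuCtxCreate(0, dev)))  # the new context becomes current
    # ... per-device cuMemAlloc / cuStreamCreate / copies here ...
# later, before touching device dev_id again:
checkCudaErrors(cuda.cuCtxSetCurrent(ctxs[dev_id]))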

Example output:

CPU copy 6.20 ms, 16.24 GB/s
GPU copy 5.55 ms, 18.15 GB/s
Segmentation fault (core dumped)

memcpy.py

import time
import ctypes
from cuda import cuda, cudart
# see main cuda-python repo: https://github.com/NVIDIA/cuda-python/tree/dfd31fa609b9c81bcff925824f38531ab3c96706/examples/common
from common import common
from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV


def timeit(fxn):
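    # best-of-10 wall clock: run fxn 10 times and keep the minimum to reduce noise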
    tms = []
    for _ in range(10):
        st = time.perf_counter()
        fxn()
        tms.append(time.perf_counter() - st)
    return min(tms)

# total transfer size: 4096*4096*2 bytes = 32 MiB (the reference script bases this on world_size)
sz_bytes = 4096 * 4096 * 2

# Initialize
checkCudaErrors(cuda.cuInit(0))
# Create a context
cuDevice = findCudaDeviceDRV()
cuContext = checkCudaErrors(cuda.cuCtxCreate(0, cuDevice))
"""
inp = driver.pagelocked_empty(sz_bytes, dtype='byte')
out = driver.pagelocked_empty(sz_bytes, dtype='byte')

cuResult_inp, inp_ptr_raw = cuda.cuMemHostAlloc(sz_bytes, cuda.CU_MEMHOSTALLOC_DEVICEMAP)
inp = ctypes.c_void_p(inp_ptr_raw)  # Convert raw pointer to ctypes pointer
"""
inp_ptr_raw = checkCudaErrors(cuda.cuMemHostAlloc(sz_bytes, cuda.CU_MEMHOSTALLOC_DEVICEMAP))
out_ptr_raw = checkCudaErrors(cuda.cuMemHostAlloc(sz_bytes, cuda.CU_MEMHOSTALLOC_DEVICEMAP))
inp = ctypes.c_void_p(inp_ptr_raw)  # raw host address -> ctypes pointer
out = ctypes.c_void_p(out_ptr_raw)
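# NOTE: these pinned host buffers are reused by all three sections below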
# ***** CPU timing *****

def cpu_memcpy():
    ctypes.memmove(inp, out, sz_bytes)

print(f"CPU copy {(tm:=timeit(cpu_memcpy))*1000:.2f} ms, {sz_bytes*1e-9/tm:.2f} GB/s")

# ***** GPU timing *****

STREAMS = 16
sz_bytes_chunk = sz_bytes // STREAMS
# cuMemAlloc / cuStreamCreate return (CUresult, handle) tuples; checkCudaErrors unwraps them
buf = [checkCudaErrors(cuda.cuMemAlloc(sz_bytes_chunk)) for _ in range(STREAMS)]
streams = [checkCudaErrors(cuda.cuStreamCreate(0)) for _ in range(STREAMS)]

def gpu_roundtrip():
    for i in range(STREAMS):
        # Calculate the offset for each chunk
        offset_inp = inp.value + sz_bytes_chunk * i
        offset_out = out.value + sz_bytes_chunk * i
        # Perform the memory copy operations
        checkCudaErrors(cuda.cuMemcpyHtoDAsync(buf[i], offset_inp, sz_bytes_chunk, streams[i]))
        checkCudaErrors(cuda.cuMemcpyDtoHAsync(offset_out, buf[i], sz_bytes_chunk, streams[i]))

    for stream in streams:
        checkCudaErrors(cuda.cuStreamSynchronize(stream))
print(f"GPU copy {(tm:=timeit(gpu_roundtrip))*1000:.2f} ms, {sz_bytes*1e-9/tm:.2f} GB/s")

# Cleanup
for buf_mem in buf:
    checkCudaErrors(cuda.cuMemFree(buf_mem))

checkCudaErrors(cuda.cuMemFreeHost(inp_ptr_raw))
checkCudaErrors(cuda.cuMemFreeHost(out_ptr_raw))
checkCudaErrors(cuda.cuCtxDestroy(cuContext))
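
# NOTE: inp/out were freed and their context destroyed just above, but the
# multi-GPU section below still reads inp.value / out.value; I am not sure
# whether that alone can explain the crash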

# ***** multiGPU timing *****
STREAMS = 4
NUM_DEVICES = checkCudaErrors(cuda.cuDeviceGetCount())

# allocation properties (set up here but currently unused below)
prop = cuda.CUmemAllocationProp()
prop.type = cuda.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
prop.location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
sz_bytes_chunk = sz_bytes//(STREAMS*NUM_DEVICES)
# per-device resources: STREAMS buffers and STREAMS streams on each device
buf, streams = [], []
for device_id in range(NUM_DEVICES):
    checkCudaErrors(cudart.cudaSetDevice(device_id))
    buf.append([checkCudaErrors(cuda.cuMemAlloc(sz_bytes_chunk)) for _ in range(STREAMS)])
    streams.append([checkCudaErrors(cuda.cuStreamCreate(0)) for _ in range(STREAMS)])

device_contexts = []
for device_id in range(NUM_DEVICES):
    device = checkCudaErrors(cuda.cuDeviceGet(device_id))
    ctx = checkCudaErrors(cuda.cuCtxCreate(0, device))
    device_contexts.append(ctx)
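
# I am not sure mixing the runtime API (cudaSetDevice) for the allocations
# above with fresh driver-API contexts here is sound: which context do the
# buffers and streams actually belong to?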
    
def multigpu_roundtrip():
    for i in range(STREAMS):
        for device_id in range(NUM_DEVICES):
            checkCudaErrors(cudart.cudaSetDevice(device_id))
            # each (device, stream) pair gets its own slice of the host buffers
            offset = sz_bytes_chunk * (device_id*STREAMS + i)
            checkCudaErrors(cuda.cuMemcpyHtoDAsync(buf[device_id][i], inp.value + offset, sz_bytes_chunk, streams[device_id][i]))
            checkCudaErrors(cuda.cuMemcpyDtoHAsync(out.value + offset, buf[device_id][i], sz_bytes_chunk, streams[device_id][i]))
    for i in range(STREAMS):
        for device_id in range(NUM_DEVICES):
            checkCudaErrors(cudart.cudaSetDevice(device_id))
            checkCudaErrors(cuda.cuStreamSynchronize(streams[device_id][i]))
print(f"GPU  {NUM_DEVICES}x  {(tm:=timeit(multigpu_roundtrip))*1000:.2f} ms, {sz_bytes*1e-9/tm:.2f} GB/s")

Thanks in advance to everybody for your time.