I am experiencing weird behavior with cudaMemcpyPeerAsync depending on the hardware I am using (exact same code, hardware of the same major generation, running the same executable built for architecture sm_80). The first system is a DGX A100, thus with NVLink. The second is a node with 8 RTX A5000 GPUs attached to an Intel CPU, without NVLink. The behavior is as expected on the former, while the latter fails to produce the correct result.
The code performs the equivalent of an MPI_Alltoall across all devices, where each device sends (and receives) a buffer of a fixed size to (from) every device. I first check and enable peer-to-peer access across devices using cudaDeviceCanAccessPeer and cudaDeviceEnablePeerAccess. The communication then looks as follows (a minimal reproducible example is provided below):
for (int srcDevice=0; srcDevice<num_devices; ++srcDevice) {
    cudaSetDevice(srcDevice);
    for (int dstDevice=0; dstDevice<num_devices; ++dstDevice) {
        if (dstDevice==srcDevice) {
            cudaMemcpyAsync(
                &d_b[dstDevice][n*srcDevice],
                &d_a[srcDevice][n*dstDevice],
                n*sizeof(int),
                cudaMemcpyDeviceToDevice,
                streams[srcDevice]);
        } else {
            cudaMemcpyPeerAsync(
                &d_b[dstDevice][n*srcDevice],
                dstDevice,
                &d_a[srcDevice][n*dstDevice],
                srcDevice,
                n*sizeof(int),
                streams[srcDevice]);
        }
    }
}
for (int dev_id=0; dev_id<num_devices; ++dev_id) {
    cudaStreamSynchronize(streams[dev_id]);
}
Note that I implicitly assume that cudaStreamSynchronize(streams[dev_id]) returns only once all destination devices have entirely received the buffers coming from device dev_id, whose copies were enqueued on stream streams[dev_id], and not merely once the source device dev_id has finished sending its buffers (I don't know the details of how this works, since I am choosing a stream on the source device, not one on the destination device).
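If that assumption is wrong, the ordering could be made explicit with events; a minimal sketch of what I have in mind (hypothetical, not part of the reproducer below):

// Hypothetical sketch: make the destination stream wait on the source stream.
// After enqueuing all copies originating from srcDevice on streams[srcDevice]:
cudaEvent_t sent;
cudaSetDevice(srcDevice);
cudaEventCreateWithFlags(&sent, cudaEventDisableTiming);
cudaEventRecord(sent, streams[srcDevice]);
// ... and before device dstDevice consumes its received chunk:
cudaSetDevice(dstDevice);
cudaStreamWaitEvent(streams[dstDevice], sent, 0);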
On the DGX A100 system, everything works perfectly. On the A5000 node, the code runs without error, but the results are wrong. In fact, I observe that the peer-to-peer copies do not work at all (the buffers are never actually transferred between peers). Moreover, the profiler timeline looks strange.
DGX A100 profiler timeline (2 GPUs):
DGX A100 profiler timeline (3 GPUs):
A5000 profiler timeline (2 GPUs):
A5000 profiler timeline (3 GPUs):
Question
Is there anything special I should take care of on the A5000 node, different from the DGX A100 system, that could explain why the code works on the DGX A100 but fails to produce the correct result on the A5000 node?
Sub-question
Why is there around 50 ms of “nothing” on the timeline of the A5000 system, between the peer-to-peer copies and the on-device copies (the big blank area(s) in the figures above)? From the CUDA API row of the profiler, it looks like cudaStreamSynchronize is waiting for work to finish, but that work (here, it can only be memcopies) is not captured by the profiler.
What I tried
I tried placing several cudaDeviceSynchronize() calls around the communication steps. I also tried halting the host right after issuing a memcopy, using std::this_thread::sleep_for(std::chrono::milliseconds(5000));, to give the CUDA calls time to complete (on both the send and the receive side). No effect.
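For reference, the brute-force variant I tried looked roughly like this (a sketch, not the exact code):

// Sketch: drain every device before and after the all-to-all.
for (int dev_id=0; dev_id<num_devices; ++dev_id) {
    cudaSetDevice(dev_id);
    cudaDeviceSynchronize();  // wait for all preceding work on this device
}
// ... issue the cudaMemcpyAsync / cudaMemcpyPeerAsync calls as above ...
std::this_thread::sleep_for(std::chrono::milliseconds(5000));  // give the copies time
for (int dev_id=0; dev_id<num_devices; ++dev_id) {
    cudaSetDevice(dev_id);
    cudaDeviceSynchronize();  // wait again after the all-to-all
}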
Topology
Here is the output of nvidia-smi topo -m
on the A5000 node (here on an allocation with 2 out of the 8 GPUs):
        GPU0  GPU1  NIC0  NIC1  CPU Affinity  NUMA Affinity
GPU0     X    PXB   SYS   SYS   0-1           0-1
GPU1    PXB    X    SYS   SYS   0-1           0-1
NIC0    SYS   SYS    X    PIX
NIC1    SYS   SYS   PIX    X
Legend:
X = Self
SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
PIX = Connection traversing at most a single PCIe bridge
NV# = Connection traversing a bonded set of # NVLinks
NIC Legend:
NIC0: mlx5_0
NIC1: mlx5_1
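Independently of nvidia-smi, the link can also be probed programmatically; a minimal sketch (not part of the reproducer) using cudaDeviceGetP2PAttribute on devices 0 and 1:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
    int supported = 0, rank = 0;
    // Is peer access between devices 0 and 1 supported at all?
    cudaDeviceGetP2PAttribute(&supported, cudaDevP2PAttrAccessSupported, 0, 1);
    // Relative performance rank of the link between the two devices.
    cudaDeviceGetP2PAttribute(&rank, cudaDevP2PAttrPerformanceRank, 0, 1);
    printf("P2P 0->1: supported=%d, performance rank=%d\n", supported, rank);
    return 0;
}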
Full reproducible example:
/*
Compilation: nvcc -arch=sm_80 main.cu -o main.x
(nvcc version 12.1.1)
*/
#include <cuda_runtime.h>
#include <cstdlib>   // std::abort
#include <iostream>
#include <stdio.h>
inline void gpuCheck(cudaError_t code, const char *file, const int line, const bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) {
            fprintf(stderr, "aborting\n");
            std::abort();
        }
    }
}
#define CUDA_CHECK_ERROR(code) { gpuCheck((code), __FILE__, __LINE__); }
inline void gpuCheckLast(const char *file, const int line) {
    gpuCheck(cudaPeekAtLastError(), file, line);
    gpuCheck(cudaDeviceSynchronize(), file, line);
}
#define CUDA_CHECK_KERNEL_ERROR { gpuCheckLast(__FILE__, __LINE__); }
__host__
void checkAndEnableP2P(int num_devices) {
    for (int i=0; i<num_devices; i++) {
        CUDA_CHECK_ERROR( cudaSetDevice(i) );
        for (int j=0; j<num_devices; j++) {
            int canAccessPeer;
            if (i!=j) {
                CUDA_CHECK_ERROR( cudaDeviceCanAccessPeer(&canAccessPeer, i, j) );
                if (canAccessPeer==0) {
                    std::cerr << "Problem: device " << i << " cannot access peer device " << j << std::endl;
                    std::abort();
                }
                std::cerr << "Enabling access to peer device " << j << " from device " << i << std::endl;
                CUDA_CHECK_ERROR( cudaDeviceEnablePeerAccess(j, 0) );
            }
        }
    }
}
__global__
void kernelInit(int *d_v, int const size, int const val) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i<size) {
        d_v[i] = val;
    }
}
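// kernelA is launched from main with size == N and t == 2*(dev_id+1); since
// num_devices*i/size equals the chunk index j = i/n, chunk j of d_a on device dev
// holds (dev+1)*(2*(dev+1)+j) after the kernel.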
__global__
void kernelA(int *d_a, int const size, int const num_devices, int const t) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i<size) {
        d_a[i] *= (t + num_devices*i/size);
    }
}
__global__
void kernelB(int *d_b, int const size, int const u) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i<size) {
        d_b[i] += u;
    }
}
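// Expected values: chunk j of d_b on device dev_id was received from device j, so it
// holds (j+1)*(2*(j+1)+dev_id) after the all-to-all (see the note above kernelA);
// kernelB then adds base = 100 * 10^dev_id on device dev_id.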
__host__
void get_exact_values(int *checkVals, int const num_devices) {
    for (int dev_id=0; dev_id<num_devices; ++dev_id) {
        int base = 100;
        for (int did=0; did<dev_id; ++did) {base*=10;}
        for (int j=0; j<num_devices; ++j) {
            checkVals[dev_id*num_devices + j] = base + (j+1)*(2*(j+1)+dev_id);
        }
    }
}
__global__
void kernelCheckValue(int const *d_b, int const size, int const checkVal, int *d_cW) {
    // count the number of values in d_b which differ from checkVal; d_cW is the output count
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i<size) {
        if (d_b[i]!=checkVal) {
            atomicAdd(&d_cW[0], 1);
        }
    }
}
__global__
void printArray(int *d_v, int const size) {
    for (int i=0; i<size; ++i) {
        printf("array[%d] = %d\n", i, d_v[i]);
    }
}
int main() {
    int num_devices;
    CUDA_CHECK_ERROR( cudaGetDeviceCount(&num_devices) );
    checkAndEnableP2P(num_devices);

    const int n = 1<<26;
    const int N = num_devices * n;
    std::cout << "num_devices = " << num_devices << std::endl;
    std::cout << "n = " << n << std::endl;
    std::cout << "N = " << N << std::endl;

    const int blockSize = 256;
    const int numBlocks = (N + blockSize - 1)/blockSize;

    cudaStream_t streams[num_devices];
    int *d_a[num_devices];
    int *d_b[num_devices];
    int *d_countWrong[num_devices];
    for (int dev_id=0; dev_id<num_devices; ++dev_id) {
        CUDA_CHECK_ERROR( cudaSetDevice(dev_id) );
        CUDA_CHECK_ERROR( cudaStreamCreateWithFlags(&streams[dev_id], cudaStreamDefault) );
        CUDA_CHECK_ERROR( cudaMalloc((void**)&d_a[dev_id], N*sizeof(int)) );
        CUDA_CHECK_ERROR( cudaMalloc((void**)&d_b[dev_id], N*sizeof(int)) );
        CUDA_CHECK_ERROR( cudaMalloc((void**)&d_countWrong[dev_id], num_devices*sizeof(int)) );
    }

    // buffers initialization
    for (int dev_id=0; dev_id<num_devices; ++dev_id) {
        CUDA_CHECK_ERROR( cudaSetDevice(dev_id) );
        kernelInit<<<numBlocks, blockSize, 0, streams[dev_id]>>>(d_a[dev_id], N, dev_id+1);
    }
    // let's synchronize to start kernelA "at the same time" on all devices
    for (int dev_id=0; dev_id<num_devices; ++dev_id) {
        CUDA_CHECK_ERROR( cudaStreamSynchronize(streams[dev_id]) );
    }

    // kernelA
    for (int dev_id=0; dev_id<num_devices; ++dev_id) {
        CUDA_CHECK_ERROR( cudaSetDevice(dev_id) );
        kernelA<<<numBlocks, blockSize, 0, streams[dev_id]>>>(d_a[dev_id], N, num_devices, 2*(dev_id+1));
    }

    // Communication Device to Device
    for (int srcDevice=0; srcDevice<num_devices; ++srcDevice) {
        CUDA_CHECK_ERROR( cudaSetDevice(srcDevice) );
        for (int dstDevice=0; dstDevice<num_devices; ++dstDevice) {
            if (dstDevice==srcDevice) {
                CUDA_CHECK_ERROR( cudaMemcpyAsync(
                    &d_b[dstDevice][n*srcDevice],
                    &d_a[srcDevice][n*dstDevice],
                    n*sizeof(int),
                    cudaMemcpyDeviceToDevice,
                    streams[srcDevice]) );
            } else {
                CUDA_CHECK_ERROR( cudaMemcpyPeerAsync(
                    &d_b[dstDevice][n*srcDevice],
                    dstDevice,
                    &d_a[srcDevice][n*dstDevice],
                    srcDevice,
                    n*sizeof(int),
                    streams[srcDevice]) );
            }
        }
    }
    // make sure communication is over on all streams
    for (int dev_id=0; dev_id<num_devices; ++dev_id) {
        CUDA_CHECK_ERROR( cudaStreamSynchronize(streams[dev_id]) );
    }

    // kernelB
    for (int dev_id=0; dev_id<num_devices; ++dev_id) {
        CUDA_CHECK_ERROR( cudaSetDevice(dev_id) );
        int adder = 100;
        for (int did=0; did<dev_id; ++did) {adder*=10;} // adder <== 10**(2+dev_id)
        kernelB<<<numBlocks, blockSize, 0, streams[dev_id]>>>(d_b[dev_id], N, adder);
    }

    // Compute the true exact values that each device must have
    int *checkVals;
    CUDA_CHECK_ERROR( cudaMallocHost((void**)&checkVals, num_devices*num_devices*sizeof(int)) );
    get_exact_values(checkVals, num_devices);

    // synchronize before printing
    for (int dev_id=0; dev_id<num_devices; ++dev_id) {
        CUDA_CHECK_ERROR( cudaStreamSynchronize(streams[dev_id]) );
    }

    // Print values [from the default stream]
    for (int dev_id=0; dev_id<num_devices; ++dev_id) {
        CUDA_CHECK_ERROR( cudaSetDevice(dev_id) );
        for (int j=0; j<num_devices; ++j) {
            std::cout << "Printing d_b[" << dev_id << "] from index " << n*j << std::endl;
            std::cout << "expected result: " << checkVals[dev_id*num_devices+j] << std::endl;
            printArray<<<1, 1, 0, 0>>>(&d_b[dev_id][n*j], 4);
            CUDA_CHECK_ERROR( cudaDeviceSynchronize() );
        }
    }

    // check the values [on the default streams]
    for (int dev_id=0; dev_id<num_devices; ++dev_id) {
        CUDA_CHECK_ERROR( cudaSetDevice(dev_id) );
        CUDA_CHECK_ERROR( cudaMemset(d_countWrong[dev_id], 0, num_devices*sizeof(int)) );
        for (int j=0; j<num_devices; ++j) {
            const int numBlocksR = (n + blockSize - 1)/blockSize;
            kernelCheckValue<<<numBlocksR, blockSize, 0, 0>>>(
                &d_b[dev_id][n*j],
                n,
                checkVals[dev_id*num_devices+j],
                &d_countWrong[dev_id][j]);
            CUDA_CHECK_KERNEL_ERROR
            int h_countWrong = 0;
            CUDA_CHECK_ERROR( cudaMemcpy(&h_countWrong, &d_countWrong[dev_id][j], sizeof(int), cudaMemcpyDeviceToHost) );
            if (h_countWrong>0) {
                std::cout << "device " << dev_id << ", part " << j << " has " << h_countWrong << " WRONG value(s)." << std::endl;
            } else {
                std::cout << "device " << dev_id << ", part " << j << " has all CORRECT values." << std::endl;
            }
        }
    }

    // All cleaning
    for (int dev_id=0; dev_id<num_devices; ++dev_id) {
        CUDA_CHECK_ERROR( cudaSetDevice(dev_id) );
        CUDA_CHECK_ERROR( cudaDeviceSynchronize() );
        CUDA_CHECK_ERROR( cudaStreamDestroy(streams[dev_id]) );
        CUDA_CHECK_ERROR( cudaFree(d_a[dev_id]) );
        CUDA_CHECK_ERROR( cudaFree(d_b[dev_id]) );
        CUDA_CHECK_ERROR( cudaFree(d_countWrong[dev_id]) );
    }
    CUDA_CHECK_ERROR( cudaFreeHost(checkVals) );
    return 0;
}