Hi,
I have the problem that in a dual-GPU system, memcpies from one device to another don’t seem to work, if peer access is enabled between the devices.
My system:
- Windows 11
- 2x Nvidia RTX Quadro 4500 Ada Generation, both running in TCC driver mode
- Latest Nvidia driver (566.14), although also tested with 553.24
- Cuda 12.4 and 12.6
Following is a minimal code example, where I allocate a buffer on each device and two buffers on the host. I then copy data like this:
Host Buffer 0 → Device 0 → Device 1 → Host Buffer 1
My expectation is that afterwards the content of Host Buffers 0 and 1 are identical. However, they are not. The content of Host Buffer 1 is still zero (initial values).
#include <iostream>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
if (result)
{
std::cerr << "CUDA error at " << file << ": " << line << "\n" << static_cast<unsigned int>(result);
exit(EXIT_FAILURE);
}
}
#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
int main(int argc, char **argv)
{
int gpu_n;
CHECK_CUDA_ERROR(cudaGetDeviceCount(&gpu_n));
if (gpu_n < 2)
{
std::cout << "Less than 2 GPUs found\n";
return 0;
}
int can_access_peer01, can_access_peer10;
CHECK_CUDA_ERROR(cudaDeviceCanAccessPeer(&can_access_peer01, 0, 1));
CHECK_CUDA_ERROR(cudaDeviceCanAccessPeer(&can_access_peer10, 1, 0));
if (!can_access_peer01 || !can_access_peer10)
{
std::cout << "No P2P available\n";
return 0;
}
// Enable P2P access
CHECK_CUDA_ERROR(cudaSetDevice(0));
CHECK_CUDA_ERROR(cudaDeviceEnablePeerAccess(1, 0));
CHECK_CUDA_ERROR(cudaSetDevice(1));
CHECK_CUDA_ERROR(cudaDeviceEnablePeerAccess(0, 0));
const size_t float_n = 1024 * 1024 * 16;
const size_t buf_size = float_n * sizeof(float);
// Create buffers, one on each device and two on the host
float* d0, * d1, * h0, * h1;
CHECK_CUDA_ERROR(cudaSetDevice(0));
CHECK_CUDA_ERROR(cudaMalloc(&d0, buf_size));
CHECK_CUDA_ERROR(cudaSetDevice(1));
CHECK_CUDA_ERROR(cudaMalloc(&d1, buf_size));
CHECK_CUDA_ERROR(cudaMallocHost(&h0, buf_size));
CHECK_CUDA_ERROR(cudaMallocHost(&h1, buf_size));
// Fill first host buffer with random values, second with zeros.
for (size_t i = 0; i < float_n; ++i)
{
h0[i] = (float)rand() / RAND_MAX;
h1[i] = 0.f;
}
// Copy h0 -> d0
CHECK_CUDA_ERROR(cudaSetDevice(0));
CHECK_CUDA_ERROR(cudaMemcpy(d0, h0, buf_size, cudaMemcpyHostToDevice));
// P2P copy d0 -> d1
CHECK_CUDA_ERROR(cudaSetDevice(1)); // Tested with both device 0 and 1
CHECK_CUDA_ERROR(cudaMemcpy(d1, d0, buf_size, cudaMemcpyDefault)); // Also tested with cudaMemcpyPeer
// Copy d1 -> h1
CHECK_CUDA_ERROR(cudaSetDevice(1));
CHECK_CUDA_ERROR(cudaMemcpy(h1, d1, buf_size, cudaMemcpyDeviceToHost));
// Compare h1 to h0
bool equal = true;
for (size_t i = 0; i < float_n; ++i)
{
if (h0[i] != h1[i])
{
equal = false;
}
}
std::cout << "h0 == h1: " << std::boolalpha << equal << std::endl;
CHECK_CUDA_ERROR(cudaSetDevice(0));
CHECK_CUDA_ERROR(cudaFree(d0));
CHECK_CUDA_ERROR(cudaSetDevice(1));
CHECK_CUDA_ERROR(cudaFree(d1));
CHECK_CUDA_ERROR(cudaFreeHost(h0));
CHECK_CUDA_ERROR(cudaFreeHost(h1));
CHECK_CUDA_ERROR(cudaSetDevice(0));
CHECK_CUDA_ERROR(cudaDeviceDisablePeerAccess(1));
CHECK_CUDA_ERROR(cudaSetDevice(1));
CHECK_CUDA_ERROR(cudaDeviceDisablePeerAccess(0));
return 0;
}
A couple of observations:
- The memcpy shows up in Nsight Systems and looks exactly how I would expect.
- If I remove the following two lines, the copy suddenly works as expected. However, in my real program I would like to bi-directional access between the devices, so that is not really an option. Also I would expect some form of error from the API if bi-directional access is not possible.
// Removed
//CHECK_CUDA_ERROR(cudaSetDevice(0));
//CHECK_CUDA_ERROR(cudaDeviceEnablePeerAccess(1, 0));
- If I replace the device to device memcpy with a custom Memcpy-Kernel, which directly accesses the other device’s memory, everything works as expected:
__global__ void MemcpyKernel(float* src, float* dst)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
dst[idx] = src[idx];
}
// ...
// P2P copy d0 -> d1
#if 0
CHECK_CUDA_ERROR(cudaSetDevice(1));
CHECK_CUDA_ERROR(cudaMemcpy(d1, d0, buf_size, cudaMemcpyDefault)); // This does not work
#else
CHECK_CUDA_ERROR(cudaSetDevice(1)); // Interestingly only works with device 1, not with device 0.
MemcpyKernel<<<float_n / 512, 512>>>(d0, d1); // This works
#endif
My questions:
- Am I running into expected behavior here?
- Why does the memcpy only work if the peer to peer access is uni-directional?
- Why does the kernel work? Why does it only work if started on device 1, not if started on device 0?
I am deeply grateful for all help and each suggestion.
Best regards,
Philipp