Peer to peer (UVA) memcpy not working

Hi,

I have the problem that in a dual-GPU system, memcpies from one device to another don’t seem to work, if peer access is enabled between the devices.

My system:

  • Windows 11
  • 2x Nvidia RTX Quadro 4500 Ada Generation, both running in TCC driver mode
  • Latest Nvidia driver (566.14), although also tested with 553.24
  • Cuda 12.4 and 12.6

Following is a minimal code example, where I allocate a buffer on each device and two buffers on the host. I then copy data like this:
Host Buffer 0 → Device 0 → Device 1 → Host Buffer 1
My expectation is that afterwards the content of Host Buffers 0 and 1 are identical. However, they are not. The content of Host Buffer 1 is still zero (initial values).

#include <iostream>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

template <typename T>
void check(T result, char const* const func, const char* const file, int const line) 
{
    if (result) 
    {
        std::cerr << "CUDA error at " << file << ": " << line << "\n" << static_cast<unsigned int>(result);
        exit(EXIT_FAILURE);
    }
}

#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)

int main(int argc, char **argv) 
{
    int gpu_n;
    CHECK_CUDA_ERROR(cudaGetDeviceCount(&gpu_n));

    if (gpu_n < 2)
    {
        std::cout << "Less than 2 GPUs found\n";
        return 0;
    }

    int can_access_peer01, can_access_peer10;
    CHECK_CUDA_ERROR(cudaDeviceCanAccessPeer(&can_access_peer01, 0, 1));
    CHECK_CUDA_ERROR(cudaDeviceCanAccessPeer(&can_access_peer10, 1, 0));

    if (!can_access_peer01 || !can_access_peer10)
    {
        std::cout << "No P2P available\n";
        return 0;
    }

    // Enable P2P access
    CHECK_CUDA_ERROR(cudaSetDevice(0));
    CHECK_CUDA_ERROR(cudaDeviceEnablePeerAccess(1, 0));
    CHECK_CUDA_ERROR(cudaSetDevice(1));
    CHECK_CUDA_ERROR(cudaDeviceEnablePeerAccess(0, 0));

    
    const size_t float_n = 1024 * 1024 * 16;
    const size_t buf_size = float_n * sizeof(float);

    // Create buffers, one on each device and two on the host
    float* d0, * d1, * h0, * h1;
    CHECK_CUDA_ERROR(cudaSetDevice(0));
    CHECK_CUDA_ERROR(cudaMalloc(&d0, buf_size));
    CHECK_CUDA_ERROR(cudaSetDevice(1));
    CHECK_CUDA_ERROR(cudaMalloc(&d1, buf_size));
    CHECK_CUDA_ERROR(cudaMallocHost(&h0, buf_size));
    CHECK_CUDA_ERROR(cudaMallocHost(&h1, buf_size));

    // Fill first host buffer with random values, second with zeros.
    for (size_t i = 0; i < float_n; ++i)
    {
        h0[i] = (float)rand() / RAND_MAX;
        h1[i] = 0.f;
    }

    // Copy h0 -> d0
    CHECK_CUDA_ERROR(cudaSetDevice(0));
    CHECK_CUDA_ERROR(cudaMemcpy(d0, h0, buf_size, cudaMemcpyHostToDevice));

    // P2P copy d0 -> d1
    CHECK_CUDA_ERROR(cudaSetDevice(1)); // Tested with both device 0 and 1
    CHECK_CUDA_ERROR(cudaMemcpy(d1, d0, buf_size, cudaMemcpyDefault)); // Also tested with cudaMemcpyPeer
    
    // Copy d1 -> h1
    CHECK_CUDA_ERROR(cudaSetDevice(1));
    CHECK_CUDA_ERROR(cudaMemcpy(h1, d1, buf_size, cudaMemcpyDeviceToHost));


    // Compare h1 to h0
    bool equal = true;
    for (size_t i = 0; i < float_n; ++i)
    {
        if (h0[i] != h1[i])
        {
            equal = false;
        }
    }

    std::cout << "h0 == h1: " << std::boolalpha << equal << std::endl;

    CHECK_CUDA_ERROR(cudaSetDevice(0));
    CHECK_CUDA_ERROR(cudaFree(d0));
    CHECK_CUDA_ERROR(cudaSetDevice(1));
    CHECK_CUDA_ERROR(cudaFree(d1));
    CHECK_CUDA_ERROR(cudaFreeHost(h0));
    CHECK_CUDA_ERROR(cudaFreeHost(h1));

    CHECK_CUDA_ERROR(cudaSetDevice(0));
    CHECK_CUDA_ERROR(cudaDeviceDisablePeerAccess(1));
    CHECK_CUDA_ERROR(cudaSetDevice(1));
    CHECK_CUDA_ERROR(cudaDeviceDisablePeerAccess(0));

    return 0;
}

A couple of observations:

  • The memcpy shows up in Nsight Systems and looks exactly how I would expect.
  • If I remove the following two lines, the copy suddenly works as expected. However, in my real program I would like to bi-directional access between the devices, so that is not really an option. Also I would expect some form of error from the API if bi-directional access is not possible.
// Removed
//CHECK_CUDA_ERROR(cudaSetDevice(0));
//CHECK_CUDA_ERROR(cudaDeviceEnablePeerAccess(1, 0));
  • If I replace the device to device memcpy with a custom Memcpy-Kernel, which directly accesses the other device’s memory, everything works as expected:
__global__ void MemcpyKernel(float* src, float* dst)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    dst[idx] = src[idx];
}

// ...

    // P2P copy d0 -> d1
#if 0
    CHECK_CUDA_ERROR(cudaSetDevice(1));
    CHECK_CUDA_ERROR(cudaMemcpy(d1, d0, buf_size, cudaMemcpyDefault)); // This does not work
#else
    CHECK_CUDA_ERROR(cudaSetDevice(1)); // Interestingly only works with device 1, not with device 0.
    MemcpyKernel<<<float_n / 512, 512>>>(d0, d1); // This works
#endif

My questions:

  • Am I running into expected behavior here?
  • Why does the memcpy only work if the peer to peer access is uni-directional?
  • Why does the kernel work? Why does it only work if started on device 1, not if started on device 0?

I am deeply grateful for all help and each suggestion.

Best regards,
Philipp

what results do you observe if you run the simpleP2P cuda sample code?