Unified memory not working in multi-GPU system

I’ve been debugging a unified memory issue in my application that I’ve been able to reproduce with the following simple test code:

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>

// Abort on the first CUDA API error.
void checkError(cudaError_t err)
{
    if (cudaSuccess != err) {
        printf("CUDA error: '%s'\n", cudaGetErrorString(err));
        exit(1);
    }
}

// Each thread prints one element of the managed array from the given device.
__global__ void printVals(int *ptr, int device, int size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        printf("device %d, ptr[%d] = %d\n", device, i, ptr[i]);
    }
}

int main(int argc, char const *argv[])
{
    int count;
    checkError(cudaGetDeviceCount(&count));

    // Report the unified-memory capabilities of every device.
    for (int device = 0; device < count; ++device) {
        checkError(cudaSetDevice(device));
        cudaDeviceProp prop;
        checkError(cudaGetDeviceProperties(&prop, device));
        std::cout << "unifiedAddressing = " << prop.unifiedAddressing << "\n";
        std::cout << "concurrentManagedAccess = " << prop.concurrentManagedAccess << "\n";
    }

    // Allocate managed memory and initialize it from the host.
    checkError(cudaSetDevice(0));
    int *ptr = nullptr;
    checkError(cudaMallocManaged(&ptr, 10 * sizeof(int)));
    for (int i = 0; i < 10; ++i) {
        ptr[i] = 9 - i;
    }

    // Launch the print kernel on every GPU, then synchronize all devices.
    for (int device = 0; device < count; ++device) {
        checkError(cudaSetDevice(device));
        printVals<<<1, 32>>>(ptr, device, 10);
    }
    for (int device = 0; device < count; ++device) {
        checkError(cudaSetDevice(device));
        checkError(cudaDeviceSynchronize());
    }
    return 0;
}
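
In case it’s relevant, the test can be built and run with a plain nvcc command line like the following (the file and binary names here are just placeholders):

nvcc -o uvm_test uvm_test.cu
./uvm_test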

All this test is trying to do is initialize managed memory on the CPU and then read it from every GPU in the system. I tested it on two systems. The first has three 2080 Tis, and there the test works exactly as I would expect, with the following output:

unifiedAddressing = 1
concurrentManagedAccess = 1
unifiedAddressing = 1
concurrentManagedAccess = 1
unifiedAddressing = 1
concurrentManagedAccess = 1
device 0, ptr[0] = 9
device 0, ptr[1] = 8
device 0, ptr[2] = 7
device 0, ptr[3] = 6
device 0, ptr[4] = 5
device 0, ptr[5] = 4
device 0, ptr[6] = 3
device 0, ptr[7] = 2
device 0, ptr[8] = 1
device 0, ptr[9] = 0
device 1, ptr[0] = 9
device 1, ptr[1] = 8
device 1, ptr[2] = 7
device 1, ptr[3] = 6
device 1, ptr[4] = 5
device 1, ptr[5] = 4
device 1, ptr[6] = 3
device 1, ptr[7] = 2
device 1, ptr[8] = 1
device 1, ptr[9] = 0
device 2, ptr[0] = 9
device 2, ptr[1] = 8
device 2, ptr[2] = 7
device 2, ptr[3] = 6
device 2, ptr[4] = 5
device 2, ptr[5] = 4
device 2, ptr[6] = 3
device 2, ptr[7] = 2
device 2, ptr[8] = 1
device 2, ptr[9] = 0

The other system has two 4080s, but on that system I only ever get the correct values on one of the GPUs:

unifiedAddressing = 1
concurrentManagedAccess = 1
unifiedAddressing = 1
concurrentManagedAccess = 1
device 0, ptr[0] = 9
device 0, ptr[1] = 8
device 0, ptr[2] = 7
device 0, ptr[3] = 6
device 0, ptr[4] = 5
device 0, ptr[5] = 4
device 0, ptr[6] = 3
device 0, ptr[7] = 2
device 0, ptr[8] = 1
device 0, ptr[9] = 0
device 1, ptr[0] = 0
device 1, ptr[1] = 0
device 1, ptr[2] = 0
device 1, ptr[3] = 0
device 1, ptr[4] = 0
device 1, ptr[5] = 0
device 1, ptr[6] = 0
device 1, ptr[7] = 0
device 1, ptr[8] = 0
device 1, ptr[9] = 0

It also seems that whichever device I launch the printVals kernel on first gets the correct values. So if I reverse the order of the launch loop (see the snippet below), so that the kernel runs on device 1 in the first iteration and on device 0 in the second, then device 1 gets the correct values and device 0 prints all zeros.
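
Concretely, the only change in that experiment is reversing the launch loop, something like this (same allocation and kernel as above):

for (int device = count - 1; device >= 0; --device) {
    checkError(cudaSetDevice(device));
    printVals<<<1, 32>>>(ptr, device, 10);
}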

Both systems are running Ubuntu 22.04 with CUDA 12.0. The 2080 Ti system is on driver version 525.85.12, and the 4080 system is on 525.60.13.

Is there something I’m missing, or is this unexpected behavior?

To be sure, I just updated the 4080 system to driver version 525.85.12 as well, and it made no difference.

So I’m thinking this has to be a Linux driver issue. I booted the 4080 system, with no hardware changes, into Windows 11 (CUDA 12.0, MSVC 2022, driver version 528.02), and the test works fine there. Is there somewhere I can or should report this?

The method to report a bug is linked in a sticky post in this forum. It always appears at the top of the topics list and is entitled “how to report a bug”.

Thanks! After some more searching, it appears that this is already a known issue.

Running on Windows is at least a workaround for now, until it can be resolved on Linux.

A recent post from that thread: Standard nVidia CUDA tests fail with dual RTX 4090 Linux box - #16 by abchauhan