I’ve been debugging a unified memory issue in my application that I’ve been able to reproduce with the following simple test code:
#include <cstdio>
#include <cstdlib>
#include <iostream>
// Abort the program if a CUDA runtime call reported an error.
//
// err: status returned by a CUDA runtime API call.
//
// On failure, prints the human-readable error string to stderr and
// exits with a non-zero status. The original called exit(0), which
// signals *success* to the shell and any calling scripts, and printed
// to stdout, where the message interleaves with the test output.
void checkError(cudaError_t err)
{
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA error: '%s'\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
// Print each of the first `size` elements of `ptr`, tagged with the index
// of the device the kernel was launched on. Expects a 1-D launch with at
// least `size` total threads; surplus threads return without printing.
// Device printf is for debugging only — output is serialized and slow.
__global__ void printVals(int *ptr, int device, int size) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= size) {
        return;  // guard the grid tail — blocks rarely divide `size` evenly
    }
    printf("device %d, ptr[%d] = %d\n", device, idx, ptr[idx]);
}
// Allocate a managed buffer, initialize it from the CPU, then read it
// back from every GPU in the system via the printVals kernel.
int main(int argc, char const *argv[])
{
    int count;
    checkError(cudaGetDeviceCount(&count));

    // Report the unified-memory capabilities of every device.
    for (int device = 0; device < count; ++device) {
        checkError(cudaSetDevice(device));
        cudaDeviceProp prop;
        // BUG FIX: the original passed 0 here, so it reported device 0's
        // properties on every iteration. Query the current loop device so
        // the unifiedAddressing / concurrentManagedAccess values printed
        // actually describe each GPU.
        checkError(cudaGetDeviceProperties(&prop, device));
        std::cout << "unifiedAddressing = " << prop.unifiedAddressing << "\n";
        std::cout << "concurrentManagedAccess = " << prop.concurrentManagedAccess << "\n";
    }

    // Allocate managed memory and initialize it from the host.
    checkError(cudaSetDevice(0));
    constexpr int kNumVals = 10;  // element count, used for alloc, init, and launch
    int *ptr = nullptr;
    checkError(cudaMallocManaged(&ptr, kNumVals * sizeof(int)));
    for (int i = 0; i < kNumVals; ++i) {
        ptr[i] = kNumVals - 1 - i;  // descending values 9..0
    }

    // Launch the print kernel on every device.
    for (int device = 0; device < count; ++device) {
        checkError(cudaSetDevice(device));
        printVals<<<1, 32>>>(ptr, device, kNumVals);
        // Kernel launches don't return an error directly; fetch any
        // launch-configuration error immediately.
        checkError(cudaGetLastError());
    }

    // Synchronize each device; in-kernel faults surface here.
    for (int device = 0; device < count; ++device) {
        checkError(cudaSetDevice(device));
        checkError(cudaDeviceSynchronize());
    }

    checkError(cudaFree(ptr));  // release the managed allocation
    return 0;
}
All this is trying to do is initialize managed memory on the CPU, then read it from all the GPUs in the system. I tested it on two systems. The first has three 2080 Tis, and on it the test works perfectly, producing exactly the output I would expect:
unifiedAddressing = 1
concurrentManagedAccess = 1
unifiedAddressing = 1
concurrentManagedAccess = 1
unifiedAddressing = 1
concurrentManagedAccess = 1
device 0, ptr[0] = 9
device 0, ptr[1] = 8
device 0, ptr[2] = 7
device 0, ptr[3] = 6
device 0, ptr[4] = 5
device 0, ptr[5] = 4
device 0, ptr[6] = 3
device 0, ptr[7] = 2
device 0, ptr[8] = 1
device 0, ptr[9] = 0
device 1, ptr[0] = 9
device 1, ptr[1] = 8
device 1, ptr[2] = 7
device 1, ptr[3] = 6
device 1, ptr[4] = 5
device 1, ptr[5] = 4
device 1, ptr[6] = 3
device 1, ptr[7] = 2
device 1, ptr[8] = 1
device 1, ptr[9] = 0
device 2, ptr[0] = 9
device 2, ptr[1] = 8
device 2, ptr[2] = 7
device 2, ptr[3] = 6
device 2, ptr[4] = 5
device 2, ptr[5] = 4
device 2, ptr[6] = 3
device 2, ptr[7] = 2
device 2, ptr[8] = 1
device 2, ptr[9] = 0
The other system has two 4080s, but on that system only one of the GPUs ever reads the correct values:
unifiedAddressing = 1
concurrentManagedAccess = 1
unifiedAddressing = 1
concurrentManagedAccess = 1
device 0, ptr[0] = 9
device 0, ptr[1] = 8
device 0, ptr[2] = 7
device 0, ptr[3] = 6
device 0, ptr[4] = 5
device 0, ptr[5] = 4
device 0, ptr[6] = 3
device 0, ptr[7] = 2
device 0, ptr[8] = 1
device 0, ptr[9] = 0
device 1, ptr[0] = 0
device 1, ptr[1] = 0
device 1, ptr[2] = 0
device 1, ptr[3] = 0
device 1, ptr[4] = 0
device 1, ptr[5] = 0
device 1, ptr[6] = 0
device 1, ptr[7] = 0
device 1, ptr[8] = 0
device 1, ptr[9] = 0
It also seems that whichever device I launch the printVals kernel on first is getting the correct values. So if I change the order of the loop so that it launches the kernel on device 1 in the first iteration & device 0 in the second, then device 1 gets the correct values and device 0 gets all zeros.
Both systems are running Ubuntu 22.04, CUDA 12.0. The 2080Ti system is running driver version 525.85.12, and the 4080 system version number 525.60.13.
Is there something I’m missing, or is this unexpected behavior?