P2P access gets all 0xfffffff on PXB and PIX gpus

I have a demo P2P program as below:

#include <stdio.h>
#include <iostream>

__global__ void array_add(int *a, int *b, int *c, int n) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < n) {
        c[index] = a[index] + b[index];
        printf("index = %x, a = %x, b = %x, c = %x\n", index, a[index], b[index], c[index]);
    }

}

int main() {
    int n = 16; 
    int device1 = 0;
    int device2 = 1;
    int *a = new int[n];
    int *b = new int[n];
    int *c = new int[n];

    for (int i = 0; i < n; ++i) {
        a[i] = i;
        b[i] = i * 2;
    }
    int *dev_a, *dev_b, *dev_c;
    cudaSetDevice(device1);
    cudaMalloc(&dev_a, n * sizeof(int));
    cudaSetDevice(device2);
    cudaMalloc(&dev_b, n * sizeof(int));
    cudaMalloc(&dev_c, n * sizeof(int));

    cudaSetDevice(device1);
    cudaMemcpy(dev_a, a, n * sizeof(int), cudaMemcpyHostToDevice);

    cudaSetDevice(device2);
    cudaMemcpy(dev_b, b, n * sizeof(int), cudaMemcpyHostToDevice);

    if (device1 != device2) {
    cudaSetDevice(device1);
    if (cudaDeviceEnablePeerAccess(device2, 0)) {
        printf("enable p2p failed to dev %d from dev %d\n",
            device2, device1);
        return 1;
    }
    printf("enable p2p success to dev %d from dev %d\n",
            device1, device2);
    cudaSetDevice(device2);
    if (cudaDeviceEnablePeerAccess(device1, 0)) {
        printf("enable p2p failed to dev %d from dev %d\n",
            device2, device1);
        return 1;
    }
    printf("enable p2p success to dev %d from dev %d\n",
            device2, device1);
    int p2p;
    if (cudaDeviceCanAccessPeer(&p2p, device1, device2)) {
        printf("peer query failed between dev %d and dev %d\n",
            device1, device2);
        return 1;
    }
    printf("\np2p = %d\n", p2p);
    }

    int threadsPerBlock = 16;
    int gridSize = (n + threadsPerBlock - 1) / threadsPerBlock;
    void* args[] = { &dev_a, &dev_b, &dev_c, &n };

    cudaSetDevice(device1);

    array_add<<<gridSize,16>>>(dev_a,dev_b,dev_c,n);
    if (cudaDeviceSynchronize()) {
        printf("cudaDeviceSynchronize failed!\n");
        return 1;
    }
    printf("cudaDeviceSynchronize success!\n");

    cudaSetDevice(device2);
    if (cudaMemcpy(c, dev_c, n * sizeof(int), cudaMemcpyDeviceToHost)) {
        printf("cudaMemcpy failed for dev_c\n");
    }

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    for (int i = 0; i < n; ++i) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    for (int i = 0; i < n; ++i) {
        if (a[i] + b [i] != c[i]) {
            printf("Result verification failed at element %d, a = %d, b = %d, c = %d!\n",
                i, a[i], b[i], c[i]);
            return 1;
        }
    }
    printf("Result verification success!\n");

    delete[] a;
    delete[] b;
    delete[] c;
    return 0;
}

The kernel function tries to do a vector add using the data from two devices. The output of nvidia-smi topo -m is:

        GPU0    GPU1    GPU2    GPU3    GPU4    GPU5    GPU6    CPU Affinity    NUMA Affinity   GPU NUMA ID
GPU0     X      PXB     PXB     SYS     SYS     SYS     SYS     0-23,48-71      0               N/A
GPU1    PXB      X      PIX     SYS     SYS     SYS     SYS     0-23,48-71      0               N/A
GPU2    PXB     PIX      X      SYS     SYS     SYS     SYS     0-23,48-71      0               N/A
GPU3    SYS     SYS     SYS      X      PXB     PXB     PXB     24-47,72-95     1               N/A
GPU4    SYS     SYS     SYS     PXB      X      PXB     PXB     24-47,72-95     1               N/A
GPU5    SYS     SYS     SYS     PXB     PXB      X      PIX     24-47,72-95     1               N/A
GPU6    SYS     SYS     SYS     PXB     PXB     PIX      X      24-47,72-95     1               N/A

When I set device1 and device2 to (0,1), (1,2), or (0,2), whose topology is PXB or PIX, the result is incorrect, and all the data read from the other device is 0xffff inside the kernel function. And when I choose device pair (1,3), (0,5), whose topology is SYS, the result is correct.

I have the following questions:

  1. Did the P2P access fail? If so, why the cudaDeviceCanAccessPeer call didn’t return any error?
  2. Why 0xffff?
  3. Why p2p can be done over SYS topo, while not with PXB and PIX?

Incorrect results:

enable p2p success to dev 3 from dev 4
enable p2p success to dev 4 from dev 3

p2p = 1

index = 0, a = 0, b = ffffffff, c = ffffffff
index = 1, a = 1, b = ffffffff, c = 0
index = 2, a = 2, b = ffffffff, c = 1
index = 3, a = 3, b = ffffffff, c = 2
index = 4, a = 4, b = ffffffff, c = 3
index = 5, a = 5, b = ffffffff, c = 4
index = 6, a = 6, b = ffffffff, c = 5
index = 7, a = 7, b = ffffffff, c = 6
index = 8, a = 8, b = ffffffff, c = 7
index = 9, a = 9, b = ffffffff, c = 8
index = a, a = a, b = ffffffff, c = 9
index = b, a = b, b = ffffffff, c = a
index = c, a = c, b = ffffffff, c = b
index = d, a = d, b = ffffffff, c = c
index = e, a = e, b = ffffffff, c = d
index = f, a = f, b = ffffffff, c = e
cudaDeviceSynchronize success!
0 + 0 = 0
1 + 2 = 0
2 + 4 = 0
3 + 6 = 0
4 + 8 = 0
5 + 10 = 0
6 + 12 = 0
7 + 14 = 0
8 + 16 = 0
9 + 18 = 0
10 + 20 = 0
11 + 22 = 0
12 + 24 = 0
13 + 26 = 0
14 + 28 = 0
15 + 30 = 0
Result verification failed at element 1, a = 1, b = 2, c = 0!

Correct result:

enable p2p success to dev 3 from dev 0
enable p2p success to dev 0 from dev 3

p2p = 1

index = 0, a = 0, b = 0, c = 0
index = 1, a = 1, b = 2, c = 3
index = 2, a = 2, b = 4, c = 6
index = 3, a = 3, b = 6, c = 9
index = 4, a = 4, b = 8, c = c
index = 5, a = 5, b = a, c = f
index = 6, a = 6, b = c, c = 12
index = 7, a = 7, b = e, c = 15
index = 8, a = 8, b = 10, c = 18
index = 9, a = 9, b = 12, c = 1b
index = a, a = a, b = 14, c = 1e
index = b, a = b, b = 16, c = 21
index = c, a = c, b = 18, c = 24
index = d, a = d, b = 1a, c = 27
index = e, a = e, b = 1c, c = 2a
index = f, a = f, b = 1e, c = 2d
cudaDeviceSynchronize success!
0 + 0 = 0
1 + 2 = 3
2 + 4 = 6
3 + 6 = 9
4 + 8 = 12
5 + 10 = 15
6 + 12 = 18
7 + 14 = 21
8 + 16 = 24
9 + 18 = 27
10 + 20 = 30
11 + 22 = 33
12 + 24 = 36
13 + 26 = 39
14 + 28 = 42
15 + 30 = 45
Result verification success!

Your system has 7 GPUs? Seems unusual to me. I’m interested in the system manufacturer and model number, as well as the GPU complement (what type of GPUs are installed). However I don’t know that I would do anything with that info; its mainly for my personal edification.

I didn’t spot any errors in your code, and it seems to run correctly for me on a known-good system (DGX-H100) albeit all of the GPU-GPU connections there are of type NV18.

It certainly appears that P2P is failing on your system. When the result of cudaDeviceCanAccessPeer is true/no error, and P2P failures occur, it generally indicates a problem in the system design.

This forum is not intended to be a place for help troubleshooting your system design. Instead you should contact the system manufacturer. You may also wish to update all relevant system-level firmware, such as the system BIOS, and try things again.

My comments here mostly assume you purchased this system, in this exact config, from a reputable vendor, particularly including my advice to contact the system vendor for help. If you assembled it yourself or modified it in a significant way, then the only thing I can say is it does not work properly/is broken, and no, cudaDeviceCanAccessPeer is not guaranteed to discover any such trouble “ahead of time”. You’re welcome to ask the community for help in such a case, but there isn’t any further advice I can provide.

aside/FWIW, if you study a canonical peer access code such as the simpleP2P cuda sample code, you will discover that canonical usage of the cudaDeviceCanAccessPeer is to use it before you attempt to enable peer access. It is not doing a verification of peer access. It is reporting a piece of information like “as far as is known by the GPU driver, the connection specified should support P2P, if enabled”. It is not reporting something like “now that you have enabled Peer access, is everything working correctly”