I have a demo P2P program as below:
#include <stdio.h>
#include <iostream>
__global__ void array_add(int *a, int *b, int *c, int n) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < n) {
c[index] = a[index] + b[index];
printf("index = %x, a = %x, b = %x, c = %x\n", index, a[index], b[index], c[index]);
}
}
int main() {
int n = 16;
int device1 = 0;
int device2 = 1;
int *a = new int[n];
int *b = new int[n];
int *c = new int[n];
for (int i = 0; i < n; ++i) {
a[i] = i;
b[i] = i * 2;
}
int *dev_a, *dev_b, *dev_c;
cudaSetDevice(device1);
cudaMalloc(&dev_a, n * sizeof(int));
cudaSetDevice(device2);
cudaMalloc(&dev_b, n * sizeof(int));
cudaMalloc(&dev_c, n * sizeof(int));
cudaSetDevice(device1);
cudaMemcpy(dev_a, a, n * sizeof(int), cudaMemcpyHostToDevice);
cudaSetDevice(device2);
cudaMemcpy(dev_b, b, n * sizeof(int), cudaMemcpyHostToDevice);
if (device1 != device2) {
cudaSetDevice(device1);
if (cudaDeviceEnablePeerAccess(device2, 0)) {
printf("enable p2p failed to dev %d from dev %d\n",
device2, device1);
return 1;
}
printf("enable p2p success to dev %d from dev %d\n",
device1, device2);
cudaSetDevice(device2);
if (cudaDeviceEnablePeerAccess(device1, 0)) {
printf("enable p2p failed to dev %d from dev %d\n",
device2, device1);
return 1;
}
printf("enable p2p success to dev %d from dev %d\n",
device2, device1);
int p2p;
if (cudaDeviceCanAccessPeer(&p2p, device1, device2)) {
printf("peer query failed between dev %d and dev %d\n",
device1, device2);
return 1;
}
printf("\np2p = %d\n", p2p);
}
int threadsPerBlock = 16;
int gridSize = (n + threadsPerBlock - 1) / threadsPerBlock;
void* args[] = { &dev_a, &dev_b, &dev_c, &n };
cudaSetDevice(device1);
array_add<<<gridSize,16>>>(dev_a,dev_b,dev_c,n);
if (cudaDeviceSynchronize()) {
printf("cudaDeviceSynchronize failed!\n");
return 1;
}
printf("cudaDeviceSynchronize success!\n");
cudaSetDevice(device2);
if (cudaMemcpy(c, dev_c, n * sizeof(int), cudaMemcpyDeviceToHost)) {
printf("cudaMemcpy failed for dev_c\n");
}
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
for (int i = 0; i < n; ++i) {
printf("%d + %d = %d\n", a[i], b[i], c[i]);
}
for (int i = 0; i < n; ++i) {
if (a[i] + b [i] != c[i]) {
printf("Result verification failed at element %d, a = %d, b = %d, c = %d!\n",
i, a[i], b[i], c[i]);
return 1;
}
}
printf("Result verification success!\n");
delete[] a;
delete[] b;
delete[] c;
return 0;
}
The kernel function tries to do a vector add using the data from two devices. The output of nvidia-smi topo -m
is:
GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 CPU Affinity NUMA Affinity GPU NUMA ID
GPU0 X PXB PXB SYS SYS SYS SYS 0-23,48-71 0 N/A
GPU1 PXB X PIX SYS SYS SYS SYS 0-23,48-71 0 N/A
GPU2 PXB PIX X SYS SYS SYS SYS 0-23,48-71 0 N/A
GPU3 SYS SYS SYS X PXB PXB PXB 24-47,72-95 1 N/A
GPU4 SYS SYS SYS PXB X PXB PXB 24-47,72-95 1 N/A
GPU5 SYS SYS SYS PXB PXB X PIX 24-47,72-95 1 N/A
GPU6 SYS SYS SYS PXB PXB PIX X 24-47,72-95 1 N/A
When I set device1 and device2 to (0,1), (1,2), or (0,2), whose topology is PXB or PIX, the result is incorrect, and all the data read from the other device is 0xffff inside the kernel function. And when I choose device pair (1,3), (0,5), whose topology is SYS, the result is correct.
I have the following questions:
- Did the P2P access fail? If so, why the cudaDeviceCanAccessPeer call didn’t return any error?
- Why 0xffff?
- Why p2p can be done over SYS topo, while not with PXB and PIX?
Incorrect results:
enable p2p success to dev 3 from dev 4
enable p2p success to dev 4 from dev 3
p2p = 1
index = 0, a = 0, b = ffffffff, c = ffffffff
index = 1, a = 1, b = ffffffff, c = 0
index = 2, a = 2, b = ffffffff, c = 1
index = 3, a = 3, b = ffffffff, c = 2
index = 4, a = 4, b = ffffffff, c = 3
index = 5, a = 5, b = ffffffff, c = 4
index = 6, a = 6, b = ffffffff, c = 5
index = 7, a = 7, b = ffffffff, c = 6
index = 8, a = 8, b = ffffffff, c = 7
index = 9, a = 9, b = ffffffff, c = 8
index = a, a = a, b = ffffffff, c = 9
index = b, a = b, b = ffffffff, c = a
index = c, a = c, b = ffffffff, c = b
index = d, a = d, b = ffffffff, c = c
index = e, a = e, b = ffffffff, c = d
index = f, a = f, b = ffffffff, c = e
cudaDeviceSynchronize success!
0 + 0 = 0
1 + 2 = 0
2 + 4 = 0
3 + 6 = 0
4 + 8 = 0
5 + 10 = 0
6 + 12 = 0
7 + 14 = 0
8 + 16 = 0
9 + 18 = 0
10 + 20 = 0
11 + 22 = 0
12 + 24 = 0
13 + 26 = 0
14 + 28 = 0
15 + 30 = 0
Result verification failed at element 1, a = 1, b = 2, c = 0!
Correct result:
enable p2p success to dev 3 from dev 0
enable p2p success to dev 0 from dev 3
p2p = 1
index = 0, a = 0, b = 0, c = 0
index = 1, a = 1, b = 2, c = 3
index = 2, a = 2, b = 4, c = 6
index = 3, a = 3, b = 6, c = 9
index = 4, a = 4, b = 8, c = c
index = 5, a = 5, b = a, c = f
index = 6, a = 6, b = c, c = 12
index = 7, a = 7, b = e, c = 15
index = 8, a = 8, b = 10, c = 18
index = 9, a = 9, b = 12, c = 1b
index = a, a = a, b = 14, c = 1e
index = b, a = b, b = 16, c = 21
index = c, a = c, b = 18, c = 24
index = d, a = d, b = 1a, c = 27
index = e, a = e, b = 1c, c = 2a
index = f, a = f, b = 1e, c = 2d
cudaDeviceSynchronize success!
0 + 0 = 0
1 + 2 = 3
2 + 4 = 6
3 + 6 = 9
4 + 8 = 12
5 + 10 = 15
6 + 12 = 18
7 + 14 = 21
8 + 16 = 24
9 + 18 = 27
10 + 20 = 30
11 + 22 = 33
12 + 24 = 36
13 + 26 = 39
14 + 28 = 42
15 + 30 = 45
Result verification success!