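I can’t use NVSHMEM on a node with 16 GPUs. It is limited by `cudaDeviceEnablePeerAccess`, which starts failing once a device has enabled peer access to several other GPUs (see the reproducer and its output below). Is there a way to make NVSHMEM work on a node with 16 GPUs?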
#include <cuda_runtime_api.h>
#include <stdio.h>
int main() {
for (int i = 0; i < 16; i++) {
if (i == 0)
continue;
auto err = cudaDeviceEnablePeerAccess(i, 0);
if (err != cudaSuccess) {
fprintf(
stderr,
"cudaDeviceEnablePeerAccess(%d, 0) failed with error code %d: %s\n",
i, err, cudaGetErrorString(err));
}
}
return 0;
}
$ ./a.out
cudaDeviceEnablePeerAccess(9, 0) failed with error code 711: peer mapping resources exhausted
cudaDeviceEnablePeerAccess(10, 0) failed with error code 711: peer mapping resources exhausted
cudaDeviceEnablePeerAccess(11, 0) failed with error code 711: peer mapping resources exhausted
cudaDeviceEnablePeerAccess(12, 0) failed with error code 711: peer mapping resources exhausted
cudaDeviceEnablePeerAccess(13, 0) failed with error code 711: peer mapping resources exhausted
cudaDeviceEnablePeerAccess(14, 0) failed with error code 711: peer mapping resources exhausted
cudaDeviceEnablePeerAccess(15, 0) failed with error code 711: peer mapping resources exhausted
This is the NVIDIA driver version: