I am having a problem when running MPI codes using the NVIDIA MPS service on multi-GPU nodes.
The system that I am using has 2 K80 boards (4 GPUs in total).
Basically, I first set the GPU mode to exclusive_process:
nvidia-smi -c 3
Then I start the MPS Service:
nvidia-cuda-mps-control -d
When I increase the number of processes and run my code I get the following error:
all CUDA-capable devices are busy or unavailable
Here is an example:
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "mpi.h"
#define __SIZE__ 1024
/*
 * Minimal MPI + CUDA reproducer.
 *
 * Each MPI rank queries the number of visible CUDA devices, selects one
 * round-robin (rank % device count), allocates __SIZE__ bytes on it and
 * prints whether the allocation succeeded.
 *
 * Returns EXIT_SUCCESS when every CUDA call on this rank succeeded,
 * EXIT_FAILURE otherwise. MPI is always finalized before returning.
 */
int main(int argc, char **argv)
{
    cudaError_t cuda_err = cudaSuccess;
    void *dev_buf = NULL;
    int my_rank = -1;
    int dev_cnt = 0;
    int dev_id = -1;
    int exit_code = EXIT_SUCCESS;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    cuda_err = cudaGetDeviceCount(&dev_cnt);
    if (cuda_err != cudaSuccess) {
        printf("cudaGET Error--on rank %d %s\n", my_rank, cudaGetErrorString(cuda_err));
        MPI_Finalize();
        return EXIT_FAILURE;
    }
    if (dev_cnt <= 0) {
        /* Guard the modulo below: a zero device count would divide by zero. */
        printf("cudaGET Error--on rank %d no CUDA devices found\n", my_rank);
        MPI_Finalize();
        return EXIT_FAILURE;
    }

    /* Spread the ranks over the visible devices round-robin. */
    dev_id = my_rank % dev_cnt;
    printf("myrank=%d dev_cnt=%d, dev_id=%d\n", my_rank, dev_cnt, dev_id);

    cuda_err = cudaSetDevice(dev_id);
    if (cuda_err != cudaSuccess) {
        printf("cudaSet Error--on rank %d %s\n", my_rank, cudaGetErrorString(cuda_err));
        exit_code = EXIT_FAILURE;
    } else {
        cuda_err = cudaMalloc((void **) &dev_buf, __SIZE__);
        if (cuda_err != cudaSuccess) {
            printf("cudaMalloc Error--on rank %d %s\n", my_rank, cudaGetErrorString(cuda_err));
            exit_code = EXIT_FAILURE;
        } else {
            printf("cudaMalloc Success++, %d \n", my_rank);
            /* Release the test buffer; the allocation itself was the point. */
            cudaFree(dev_buf);
            dev_buf = NULL;
        }
    }

    MPI_Finalize();
    return exit_code;
}
Here is the output for 12 processes:
#mpirun -n 12 -hostfile hosts ./hq_test
myrank=0 dev_cnt=4, dev_id=0
myrank=1 dev_cnt=4, dev_id=1
myrank=2 dev_cnt=4, dev_id=2
myrank=3 dev_cnt=4, dev_id=3
myrank=4 dev_cnt=4, dev_id=0
myrank=5 dev_cnt=4, dev_id=1
myrank=6 dev_cnt=4, dev_id=2
myrank=7 dev_cnt=4, dev_id=3
myrank=8 dev_cnt=4, dev_id=0
myrank=9 dev_cnt=4, dev_id=1
myrank=10 dev_cnt=4, dev_id=2
myrank=11 dev_cnt=4, dev_id=3
cudaMalloc Success++, 8
cudaMalloc Success++, 10
cudaMalloc Success++, 0
cudaMalloc Success++, 1
cudaMalloc Success++, 3
cudaMalloc Success++, 7
cudaMalloc Success++, 9
cudaMalloc Success++, 6
cudaMalloc Success++, 4
cudaMalloc Success++, 2
cudaMalloc Success++, 5
cudaMalloc Success++, 11
Here is the output for 14 processes:
#mpirun -n 14 -hostfile hosts ./hq_test
myrank=0 dev_cnt=4, dev_id=0
myrank=1 dev_cnt=4, dev_id=1
myrank=2 dev_cnt=4, dev_id=2
myrank=3 dev_cnt=4, dev_id=3
myrank=4 dev_cnt=4, dev_id=0
myrank=5 dev_cnt=4, dev_id=1
myrank=6 dev_cnt=4, dev_id=2
myrank=7 dev_cnt=4, dev_id=3
myrank=8 dev_cnt=4, dev_id=0
myrank=9 dev_cnt=4, dev_id=1
myrank=10 dev_cnt=4, dev_id=2
myrank=11 dev_cnt=4, dev_id=3
myrank=12 dev_cnt=4, dev_id=0
myrank=13 dev_cnt=4, dev_id=1
cudaMalloc Success++, 11
cudaMalloc Success++, 3
cudaMalloc Success++, 7
cudaMalloc Success++, 2
cudaMalloc Success++, 10
cudaMalloc Success++, 6
cudaMalloc Success++, 1
cudaMalloc Success++, 8
cudaMalloc Error--on rank 13 all CUDA-capable devices are busy or unavailable
cudaMalloc Error--on rank 5 all CUDA-capable devices are busy or unavailable
cudaMalloc Error--on rank 9 all CUDA-capable devices are busy or unavailable
cudaMalloc Error--on rank 4 all CUDA-capable devices are busy or unavailable
cudaMalloc Error--on rank 0 all CUDA-capable devices are busy or unavailable
cudaMalloc Error--on rank 12 all CUDA-capable devices are busy or unavailable
Note: I have already tried changing CUDA_DEVICE_MAX_CONNECTIONS value, but it didn’t help.
I’d appreciate it if you could share your thoughts on this with me.