Hi, I have the following program multi_node_multi_gpu.cpp:
```
#include <mpi.h>
#include <cuda_runtime.h>
#include <iostream>

// Abort the whole MPI job if a CUDA runtime call fails.
#define CUDA_RT_CALL(call)                                              \
    do {                                                                \
        cudaError_t err = (call);                                       \
        if (err != cudaSuccess) {                                       \
            std::cerr << "CUDA Runtime error at " << __FILE__ << ":"    \
                      << __LINE__ << ": " << cudaGetErrorString(err)    \
                      << std::endl;                                     \
            MPI_Abort(MPI_COMM_WORLD, 1);                               \
        }                                                               \
    } while (0)

int main(int argc, char *argv[]) {
    int rank, size;
    int numDevices;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    CUDA_RT_CALL(cudaGetDeviceCount(&numDevices));

    // Ensure there are enough GPUs for all MPI processes
    if (numDevices < size) {
        if (rank == 0) {
            std::cerr << "Error: Number of CUDA devices (" << numDevices
                      << ") is less than MPI processes (" << size << ")"
                      << std::endl;
        }
        MPI_Finalize();
        return 1;
    }

    // Determine which GPU to use based on MPI rank
    int device = rank % numDevices;
    CUDA_RT_CALL(cudaSetDevice(device));

    cudaDeviceProp prop;
    CUDA_RT_CALL(cudaGetDeviceProperties(&prop, device));

    // Get the name of the host this rank is running on
    char node_name[MPI_MAX_PROCESSOR_NAME];
    int node_name_len;
    MPI_Get_processor_name(node_name, &node_name_len);

    // Print host and GPU information from each MPI rank
    std::cout << "Rank " << rank << " (on " << node_name << "), GPU " << device
              << ": " << prop.name << std::endl;
    std::cout << "  Compute capability: " << prop.major << "." << prop.minor << std::endl;
    std::cout << "  Total global memory: " << prop.totalGlobalMem << " bytes" << std::endl;
    std::cout << "  Memory clock rate: " << prop.memoryClockRate << " kHz" << std::endl;
    std::cout << "  Memory bus width: " << prop.memoryBusWidth << " bits" << std::endl;
    std::cout << "  Peak memory bandwidth (GB/s): "
              << 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6
              << std::endl;

    MPI_Finalize();
    return 0;
}
```
I compile and link it, copy the binary into a shared folder, and launch it across both nodes:
```
[vorlket@server cudaprac]$ nvcc -o cuda_code.o -c multi_node_multi_gpu.cpp -gencode arch=compute_35,code=sm_35 -gencode arch=compute_52,code=sm_52
[vorlket@server cudaprac]$ mpic++ -o multi_node_multi_gpu_35 cuda_code.o -I/opt/cuda/targets/x86_64-linux/include -L/opt/cuda/targets/x86_64-linux/lib -l:libcudart.so
[vorlket@server cudaprac]$ cp multi_node_multi_gpu_35 ~/sharedfolder/multi_node_multi_gpu
[vorlket@server cudaprac]$ mpirun -host server:2,midiserver:2 -np 4 /home/vorlket/sharedfolder/multi_node_multi_gpu
```
Running it with mpirun gives me `Error: Number of CUDA devices (2) is less than MPI processes (4)`. I have 2 GPUs on midiserver and another 2 GPUs on server, so 4 in total, and I don't understand why I get this error. Could you please help me understand what's going on?
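In case it is useful, here is a minimal standalone diagnostic I could run to see what each rank reports. It is only a sketch, built from the same calls the program above already uses (`MPI_Get_processor_name` and `cudaGetDeviceCount`):
```
#include <mpi.h>
#include <cuda_runtime.h>
#include <iostream>

// Minimal diagnostic: print how many CUDA devices each MPI rank can see.
int main(int argc, char *argv[]) {
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    char node_name[MPI_MAX_PROCESSOR_NAME];
    int node_name_len;
    MPI_Get_processor_name(node_name, &node_name_len);

    int numDevices = 0;
    cudaGetDeviceCount(&numDevices);

    // If device enumeration is per-node, each rank should print 2 here,
    // no matter how many GPUs the cluster has in total.
    std::cout << "Rank " << rank << " on " << node_name
              << " sees " << numDevices << " CUDA device(s)" << std::endl;

    MPI_Finalize();
    return 0;
}
```
My hunch is that every rank would print 2 here, which would suggest `cudaGetDeviceCount()` only counts the GPUs on the node the calling process runs on, not across the whole cluster, so comparing it against the global `size` from `MPI_Comm_size` would be the wrong check. Is that what is happening?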
Thanks.