I have some problems with setting up multigpu code. The setup is the following:
two threads (communicating with MPI), each driving one GPU (runtime API). I am
trying to allocate a mapped and portable memory on each thread, so that both
threads can work with both buffers. The allocation works, and I can send the
address of the allocated buffer via MPI to the other thread. When I am trying
to get the buffer address mapped to the device, I get an error in case of the
buffer of the other thread.
Anyone has idea, if this setup can work at all, and if yes, how to fix it?
Most probably I am not correctly interpreting the statement "portable memory
can be used by all threads". What is the proper way to ‘broadcast’ a portable
memory address among threads? I havent found any example for that.
I attach a simple test case:
#include <mpi.h>
#include <cuda_runtime.h>
#define C_CALL(call) \
do {\
cudaError err = call;\
if( cudaSuccess != err)\
{\
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) );\
exit(EXIT_FAILURE);\
}\
} while (0)
//compile: nvcc -lmpi cudacomm.cu
//run: mpirun -np 2 ./a.out
int main(int argc, char *argv[])
{
//set up MPI
int node;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &node);
int othernode= 1-node;
//set up CUDA
C_CALL(cudaSetDevice(node));
C_CALL(cudaSetDeviceFlags(cudaDeviceMapHost));
//alloc sendbuf = portable + mapped memory on host
float2 *sendbuf;
C_CALL(cudaHostAlloc((void **) &sendbuf, 1000*sizeof(float2),
cudaHostAllocPortable|cudaHostAllocMapped));
//recvbuf= other thread's sendbuf
float2 *recvbuf;
MPI_Sendrecv(&sendbuf, sizeof(float2 *), MPI_CHAR, othernode, 0,
&recvbuf, sizeof(float2 *), MPI_CHAR, othernode, 0,
MPI_COMM_WORLD, &status);
printf("node= %d send= %Ld recv= %Ld\n", node, (long)sendbuf, (long)recvbuf);
//get buffer addresses mapped to device mem
float2 *d_sendbuf, *d_recvbuf;
C_CALL(cudaHostGetDevicePointer((void **) &d_sendbuf, sendbuf, 0));
//recv FAILS with "invalid argument"
C_CALL(cudaHostGetDevicePointer((void **) &d_recvbuf, recvbuf, 0));
MPI_Finalize();
return 0;
}