MPI + Peer2Peer combine MPI and Peer2Peer

Hello,

I can run a multithreaded code with cudaMemcpyPeerAsync with amazing performance (pthreads commands for synchronization).
Now I want to use MPI instead of pthreads, because I don"t want to reprogram a larger existing code in mixed pthreads/MPI style.
I want to use MPI for synchronization and cudaMemcpyPeerAsync to communicate. It is like a one sided communication setup.
I can enable Peer2Peer between the processes (each precess holds one GPU).
I transfer the device pointers with MPI to the other processes, but cudaMemcpyPeerAsync fails.
Do I need a unified address space to do that ???

Thanks,
Jonas.

Are you using the cudaIpc interface available in CUDA 4.1?
It is the only way to have P2P between the processes.

Thanks a lot, that looks like the solution but I can not make it work, it is

simple_text.cu(125) : cudaSafeCall() Runtime API error 11: invalid argument.

I post the code here, perhaps somebody can help me:

#include <mpi.h>

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include

#include <cuda_runtime.h>

//#include <cutil.h>

#include <cutil_inline.h>

#define BUFFSIZE 10000

define MPI_SAFE_CALL( call) { \

int errloc = call;							\

if( errloc != MPI_SUCCESS) {					\

  fprintf(stderr, "MPI error in file '%s' in line %i\n",	\

      __FILE__, __LINE__);		\

  MPI_Finalize();							\

  exit(-1);								\

} }

void write2stdout(int myid, char* messg, MPI_Comm comm_cart, int numprocs){

MPI_Status status;

MPI_Request request;

int i;

char mess1[BUFFSIZE];

MPI_SAFE_CALL( MPI_Isend( messg, BUFFSIZE, MPI_CHARACTER, 0, 0, comm_cart, &request) );

if(myid==0) {

for(i=0 ; i<numprocs ; i++){

  MPI_SAFE_CALL( MPI_Recv( mess1  , BUFFSIZE, MPI_CHARACTER, i, 0, comm_cart, &status));

  printf("%s",mess1);

}

}

MPI_SAFE_CALL( MPI_Wait(&request,&status) );

}

int main( int argc, char* argv){

char buff[BUFFSIZE];

int i;

int pid=-1, np=-1;

MPI_SAFE_CALL( MPI_Init(&argc, &argv) );

MPI_SAFE_CALL( MPI_Comm_rank(MPI_COMM_WORLD, &pid));

MPI_SAFE_CALL( MPI_Comm_size(MPI_COMM_WORLD, &np));

if( np!=2) {

printf("exactly 2 processes\n",argv[0]);

MPI_Abort( MPI_COMM_WORLD, 1 );

return 1;

}

if( argc != np+1) {

printf("usage: %s device0 device1\n",argv[0]);

MPI_Abort( MPI_COMM_WORLD, 1 );

return 1;

}

int gpuID[2];

for(i=0;i<2;i++){

gpuID[i] = atoi(argv[i+1]);

}

int* memD[2];

int mem=pid;

int mem_size = sizeof(int);

cudaDeviceProp deviceProp;

cudaGetDeviceProperties(&deviceProp, gpuID[pid]);

if(deviceProp.computeMode==cudaComputeModeDefault) cutilSafeCall(cudaSetDevice(gpuID[pid]));

else{

sprintf(buff,"pid %d, gpuId: %d:  Exclusive Mode, reassign gpuId\n",pid, gpuID[pid]);

write2stdout(pid,buff,MPI_COMM_WORLD,np);

}

cutilSafeCall( cudaMalloc((void**) &memD[pid], mem_size) );

cutilSafeCall( cudaMemcpy( memD[pid], &mem, mem_size, cudaMemcpyHostToDevice) );

cutilSafeCall(cudaGetDevice(&gpuID[pid]));

cudaGetDeviceProperties(&deviceProp, gpuID[pid]);

sprintf(buff,“pid %d, gpuId: %d: %s, %d multiprocessors, %d warp size\n”,pid, gpuID[pid], deviceProp.name, deviceProp.multiProcessorCount, deviceProp.warpSize);

write2stdout(pid,buff,MPI_COMM_WORLD,np);

cudaIpcMemHandle_t memHandle[2];

cudaIpcGetMemHandle ( &memHandle[pid], memD[pid]);

for (i=0 ; i<2 ; i++){

MPI_Bcast( &gpuID[i], 1, MPI_INT, i , MPI_COMM_WORLD); 

MPI_Bcast( &memHandle[i], sizeof(memHandle[i]), MPI_CHAR, i , MPI_COMM_WORLD); 

}

for (i=0 ; i<2 ; i++){

//if(pid!=i){

cudaIpcOpenMemHandle ((void**) &memD[i], memHandle[i], cudaIpcMemLazyEnablePeerAccess);

//}

sprintf(buff,"pid %d: gpuId[i=%d]=%d: dat1[i=%d]=%p\n",pid, i, gpuID[i], i, memD[i]);

write2stdout(pid,buff,MPI_COMM_WORLD,np);

}

MPI_Barrier (MPI_COMM_WORLD);

int peerI=0;

if(pid == 0) {

cutilSafeCall( cudaDeviceCanAccessPeer(&peerI,gpuID[0],gpuID[1]));

if(peerI==1){

  printf("pid %d: Enable peer to %d\n",pid,pid+1);

  cutilSafeCall( cudaDeviceEnablePeerAccess(gpuID[1],0) );

}

else{

  printf("pid %d: *** Can not Enable peer to %d\n",gpuID[0],gpuID[1]);

}

}

MPI_Barrier (MPI_COMM_WORLD);

if(pid==0) {

cutilSafeCall( cudaMemcpyPeer( memD[0], gpuID[0],  memD[1],  gpuID[1], mem_size ) );

}

cutilSafeCall( cudaMemcpy(&mem, memD[pid], mem_size,cudaMemcpyDeviceToHost) );

sprintf(buff, “pid %d: mem=%d\n”, pid, mem);

write2stdout(pid,buff,MPI_COMM_WORLD,np);

MPI_Barrier (MPI_COMM_WORLD);

MPI_Finalize();

return 0;

}

it works with
cutilSafeCall( cudaMemcpy( memD[0], memD[1], mem_size, cudaMemcpyDefault ) ); !!!

why not with
cudaMemcpyPeer( memD[0], gpuID[0], memD[1], gpuID[1], mem_size ) ???

By the way, it works but only if they are on the same socket.

ok here is more information:

http://forums.nvidia.com/index.php?showtopic=215373

I had a similar program to yours.

But, in my case, cudaIpcOpenMemHandle(…) will return cudaErrorMapBufferObjectFailed, if

cutilSafeCall( cudaDeviceCanAccessPeer(&peerI,gpuID[0],gpuID[1])) returns false.

Therefore I have to put cudaDeviceCanAccessPeer test before.

Has anyone seen this problem as I did?