Hello, I have some questions about CUDA-aware MPI and am not sure how/why it works or does not work in my case. Thanks.
I illustrate my issue with a simplified example. I define a struct f with two pointers to double, f.a and f.b, each allocated as an array of 6 doubles on the host.
I run on an MPI communicator of size 2: the first 3 entries of f.a and f.b on rank 0
are sent to the last 3 entries of f.a and f.b on rank 1, and vice versa.
First I wrote a program with MPI communication on the host only, with no device variables involved:
I constructed an MPI derived datatype and did the MPI_Isend/MPI_Recv/MPI_Wait communications (a sketch of this host-only version follows the output below).
I got the expected output:
I am rank 1, of size 2
Before comm, rank 1, f.a[0]=2.000000, f.b[0]=20.000000
Before comm, rank 1, f.a[1]=2.000000, f.b[1]=20.000000
Before comm, rank 1, f.a[2]=2.000000, f.b[2]=20.000000
Before comm, rank 1, f.a[3]=0.000000, f.b[3]=0.000000
Before comm, rank 1, f.a[4]=0.000000, f.b[4]=0.000000
Before comm, rank 1, f.a[5]=0.000000, f.b[5]=0.000000
I am rank 1, still ok.
After comm, rank 1, f.a[0]=2.000000, f.b[0]=20.000000
After comm, rank 1, f.a[1]=2.000000, f.b[1]=20.000000
After comm, rank 1, f.a[2]=2.000000, f.b[2]=20.000000
After comm, rank 1, f.a[3]=1.000000, f.b[3]=10.000000
After comm, rank 1, f.a[4]=1.000000, f.b[4]=10.000000
After comm, rank 1, f.a[5]=1.000000, f.b[5]=10.000000
I am rank 0, of size 2
Before comm, rank 0, f.a[0]=1.000000, f.b[0]=10.000000
Before comm, rank 0, f.a[1]=1.000000, f.b[1]=10.000000
Before comm, rank 0, f.a[2]=1.000000, f.b[2]=10.000000
Before comm, rank 0, f.a[3]=0.000000, f.b[3]=0.000000
Before comm, rank 0, f.a[4]=0.000000, f.b[4]=0.000000
Before comm, rank 0, f.a[5]=0.000000, f.b[5]=0.000000
I am rank 0, still ok.
After comm, rank 0, f.a[0]=1.000000, f.b[0]=10.000000
After comm, rank 0, f.a[1]=1.000000, f.b[1]=10.000000
After comm, rank 0, f.a[2]=1.000000, f.b[2]=10.000000
After comm, rank 0, f.a[3]=2.000000, f.b[3]=20.000000
After comm, rank 0, f.a[4]=2.000000, f.b[4]=20.000000
After comm, rank 0, f.a[5]=2.000000, f.b[5]=20.000000
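For reference, the host-only version builds its send and receive datatypes and does the exchange the same way as the device code shown further down, just with the host pointers; roughly (a sketch):
/* sketch of the host-only exchange: datatypes built from the host pointers f.a and f.b */
MPI_Aint base, disp[2];
int blocklen[2] = { n, n };
MPI_Datatype types[2] = { MPI_DOUBLE, MPI_DOUBLE }, stype, rtype;
MPI_Get_address(&f, &base);
/* send type: first n entries of f.a and f.b */
MPI_Get_address(f.a, &disp[0]);
MPI_Get_address(f.b, &disp[1]);
disp[0] = MPI_Aint_diff(disp[0], base);
disp[1] = MPI_Aint_diff(disp[1], base);
MPI_Type_create_struct(2, blocklen, disp, types, &stype);
MPI_Type_commit(&stype);
/* recv type: last n entries of f.a and f.b */
MPI_Get_address(f.a + n, &disp[0]);
MPI_Get_address(f.b + n, &disp[1]);
disp[0] = MPI_Aint_diff(disp[0], base);
disp[1] = MPI_Aint_diff(disp[1], base);
MPI_Type_create_struct(2, blocklen, disp, types, &rtype);
MPI_Type_commit(&rtype);
MPI_Isend(&f, 1, stype, dest, tag1, comm1, &request);
MPI_Recv(&f, 1, rtype, dest, tag1, comm1, &status);
MPI_Wait(&request, &status);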
Next, I modified the program: I declared a device struct f_d (and cudaMalloc'ed f_d.a and f_d.b),
constructed the MPI datatypes from f_d, and copied f.a and f.b to f_d.a and f_d.b respectively.
Then I performed the MPI communication on the device variables,
copied back from device to host, and expected the same output.
However, the program fails as soon as it reaches the MPI_Isend.
The error I get is
Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x7f8a66400000)
followed by a backtrace pointing into the libraries of the NVIDIA OpenMPI installation.
Here is my code:
Header (teststructptr.h):
/* two dynamically allocated arrays of doubles */
struct teststruc {
    double *a;
    double *b;
};
#ifndef EXTERN
#define EXTERN extern
#endif
EXTERN int n;                      /* each array holds 2*n doubles */
EXTERN struct teststruc f, f_d;    /* host copy f, device copy f_d */
Main function:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <mpi.h>
#include <cuda_runtime.h>
#define EXTERN
#include "teststructptr.h"

int main(int argc, char *argv[]) {
    MPI_Comm comm1 = MPI_COMM_WORLD;
    int nproc, rank;
    MPI_Request request;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(comm1, &nproc);
    MPI_Comm_rank(comm1, &rank);
    int iroot = 0;
    printf("I am rank %d, of size %d\n", rank, nproc);

    // pick a device based on the node-local rank
    int dev_id = -1;
    {
        MPI_Comm local_comm;
        MPI_Info info;
        MPI_Info_create(&info);
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, info, &local_comm);
        MPI_Comm_rank(local_comm, &dev_id);
        MPI_Comm_free(&local_comm);
        MPI_Info_free(&info);
    }
    {
        int num_devs = 0;
        cudaGetDeviceCount(&num_devs);
        dev_id = dev_id % num_devs;
    }
    cudaSetDevice(dev_id);

    // demonstrate sending f.a/b[0..2] of rank 0 to f.a/b[3..5] of rank 1
    // and vice versa.
    // allocate memory and initialize
    n = 3;
    f.a = (double*) malloc(2*n*sizeof(double));
    f.b = (double*) malloc(2*n*sizeof(double));
    memset(f.a, 0, 2*n*sizeof(double));
    memset(f.b, 0, 2*n*sizeof(double));
    for (int i = 0; i < n; ++i) {
        f.a[i] = rank+1;
        f.b[i] = (rank+1)*10;
    }
    cudaMalloc((void**)&f_d.a, 2*n*sizeof(double));
    cudaMalloc((void**)&f_d.b, 2*n*sizeof(double));
    // copy h2d
    cudaMemcpy(f_d.a, f.a, 2*n*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(f_d.b, f.b, 2*n*sizeof(double), cudaMemcpyHostToDevice);
    for (int i = 0; i < 2*n; ++i) {
        printf("Before comm, rank %d, f.a[%d]=%lf, f.b[%d]=%lf\n", rank, i, f.a[i], i, f.b[i]);
    }

    int sour = rank;
    int dest = (rank+1) % nproc;

    // construct derived data types from the device pointers
    int const ntype = 2;
    int array_of_blocklengths[ntype];
    MPI_Datatype array_of_types[ntype], stype, rtype;
    MPI_Aint base, array_of_displacements[ntype];
    MPI_Get_address(&f_d, &base);
    array_of_blocklengths[0] = n; array_of_blocklengths[1] = n;
    array_of_types[0] = MPI_DOUBLE; array_of_types[1] = MPI_DOUBLE;
    // send type: first n entries of f_d.a and f_d.b
    MPI_Get_address(f_d.a, &array_of_displacements[0]);
    MPI_Get_address(f_d.b, &array_of_displacements[1]);
    array_of_displacements[0] = MPI_Aint_diff(array_of_displacements[0], base);
    array_of_displacements[1] = MPI_Aint_diff(array_of_displacements[1], base);
    MPI_Type_create_struct(ntype, array_of_blocklengths, array_of_displacements, array_of_types, &stype);
    MPI_Type_commit(&stype);
    // recv type: last n entries of f_d.a and f_d.b
    MPI_Get_address(f_d.a+n, &array_of_displacements[0]);
    MPI_Get_address(f_d.b+n, &array_of_displacements[1]);
    array_of_displacements[0] = MPI_Aint_diff(array_of_displacements[0], base);
    array_of_displacements[1] = MPI_Aint_diff(array_of_displacements[1], base);
    MPI_Type_create_struct(ntype, array_of_blocklengths, array_of_displacements, array_of_types, &rtype);
    MPI_Type_commit(&rtype);

    int tag1 = 1;
    printf("I am rank %d, still ok.\n", rank);

    // communication
    MPI_Isend(&f_d, 1, stype, dest, tag1, comm1, &request);
    MPI_Recv(&f_d, 1, rtype, dest, tag1, comm1, &status);
    MPI_Wait(&request, &status);

    // copy d2h
    cudaMemcpy(f.a, f_d.a, 2*n*sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(f.b, f_d.b, 2*n*sizeof(double), cudaMemcpyDeviceToHost);

    // output
    for (int i = 0; i < 2*n; ++i) {
        printf("After comm, rank %d, f.a[%d]=%lf, f.b[%d]=%lf\n", rank, i, f.a[i], i, f.b[i]);
    }

    MPI_Type_free(&stype);
    MPI_Type_free(&rtype);
    cudaFree(f_d.a);
    cudaFree(f_d.b);
    free(f.a);
    free(f.b);
    MPI_Finalize();
    return 0;
}
I found that if I do not construct the derived datatype and instead use two MPI send/recv calls on f_d.a and f_d.b separately,
it works as expected, but the number of communications doubles.
Is there anything I have missed?
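For clarity, that working variant (no derived datatype, device pointers passed directly to MPI) looks roughly like this; the request array and the tag values 1 and 2 are just illustrative:
/* sketch: exchange f_d.a and f_d.b with two separate messages each way */
MPI_Request req[2];
MPI_Isend(f_d.a,    n, MPI_DOUBLE, dest, 1, comm1, &req[0]);
MPI_Isend(f_d.b,    n, MPI_DOUBLE, dest, 2, comm1, &req[1]);
MPI_Recv(f_d.a + n, n, MPI_DOUBLE, dest, 1, comm1, MPI_STATUS_IGNORE);
MPI_Recv(f_d.b + n, n, MPI_DOUBLE, dest, 2, comm1, MPI_STATUS_IGNORE);
MPI_Waitall(2, req, MPI_STATUSES_IGNORE);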
I then tried declaring a single array g (and g_d) large enough to hold both f.a and f.b,
and doing the MPI send and recv on g_d with a datatype constructed by MPI_Type_vector with 2 blocks.
This works. However, when I profiled this test I saw a pair of device-to-host and a pair of host-to-device
copies, each of 24 bytes x 2 corresponding to the 2 blocks, although I believe the communication
should stay on the device and the data should not be copied to and from the host.
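Roughly, the core of that single-array version is the following (a sketch; g_d holds the a part followed by the b part, 4*n doubles in total):
/* sketch: one device array g_d of 4*n doubles, exchanged via an MPI_Type_vector */
MPI_Datatype vtype;
MPI_Type_vector(2, n, 2*n, MPI_DOUBLE, &vtype);   /* 2 blocks of n doubles, stride 2*n */
MPI_Type_commit(&vtype);
MPI_Isend(g_d,     1, vtype, dest, tag1, comm1, &request);   /* first halves */
MPI_Recv(g_d + n,  1, vtype, dest, tag1, comm1, &status);    /* last halves  */
MPI_Wait(&request, &status);
MPI_Type_free(&vtype);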
When I do not use a derived datatype and send/recv f_d.a and f_d.b separately, I do not see such host-device copies.
I have read some discussions on this copying issue with CUDA and MPI, but I am not sure whether they describe exactly
the same problem I encountered; excuse me if this part has already been asked elsewhere.
I attached this code as well.
I also tried similar things in CUDA Fortran and noticed the same issue.
Thanks in advance for any help.
sendrecv_struct2.txt (2.6 KB)