There's no opportunity to use the GPU in this code as written, so I modified it to populate an array rather than use scalars.
I've also included conditional compilation depending on whether your MPI installation supports CUDA-aware MPI (not all do). With CUDA-aware MPI, the data is transferred directly from device to device; without it, you need to copy the data between the host and the device around each MPI call.
% cat testmpi.cpp
#include "mpi.h"
#include <iostream>
#ifdef _OPENACC
#include <openacc.h>
#endif
#ifndef NUM
#define NUM 1024
#endif
using namespace std;
int main(int argc, char *argv[])
{
    MPI_Init(NULL, NULL);
    int rank;
    int *x;
    MPI_Status status;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
#ifdef _OPENACC
    acc_device_t my_device_type;
    int num_devices;
    int gpuId;
    MPI_Comm shmcomm;
    int local_rank;
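    // Split COMM_WORLD by shared-memory node so each rank learns its node-local index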
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &shmcomm);
    MPI_Comm_rank(shmcomm, &local_rank);
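    // Round-robin the node-local ranks across the GPUs visible on this node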
    my_device_type = acc_get_device_type();
    num_devices = acc_get_num_devices(my_device_type);
    gpuId = local_rank % num_devices;
    acc_set_device_num(gpuId, my_device_type);
    printf("Rank %d is using Device %d of %d\n", rank, gpuId, num_devices);
#endif
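    // Allocate x on the host, then create a matching (uninitialized) device copy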
    x = new int[NUM];
#pragma acc enter data create(x[:NUM])
    if (rank == 0)
    {
#pragma acc parallel loop present(x)
        for (int i = 0; i < NUM; ++i) {
            x[i] = i;
        }
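        // With CUDA-aware MPI, host_data hands MPI_Send the device address of x;
        // otherwise, stage the data back to the host before sending.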
#ifdef USE_CUDA_AWARE_MPI
#pragma acc host_data use_device(x)
        {
#else
#pragma acc update self(x[:NUM])
#endif
        MPI_Send(x, NUM, MPI_INT, 1, 1, MPI_COMM_WORLD);
#ifdef USE_CUDA_AWARE_MPI
        }
#endif
    }
    else
    {
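        // Mirror of the send path: receive straight into device memory when
        // CUDA-aware, else receive on the host and update the device copy.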
#ifdef USE_CUDA_AWARE_MPI
#pragma acc host_data use_device(x)
        {
#endif
        MPI_Recv(x, NUM, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
#ifdef USE_CUDA_AWARE_MPI
        }
#else
#pragma acc update device(x[:NUM])
#endif
#pragma acc parallel loop present(x)
        for (int i = 0; i < 10; ++i) {
            printf(" the received data is: %d\n", x[i]);
        }
    }
#pragma acc exit data delete(x)
    delete [] x;
    MPI_Finalize();
    return 0;
}
% mpicxx testmpi.cpp -acc -Minfo=accel
main:
40, Generating enter data create(x[:1024])
41, Generating present(x[:1])
Generating Tesla code
43, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
52, Generating update self(x[:1024])
63, Generating update device(x[:1024])
Generating present(x[:1])
Generating Tesla code
70, #pragma acc loop gang, vector(10) /* blockIdx.x threadIdx.x */
75, Generating exit data delete(x[:1])
% mpirun -np 2 a.out
Rank 0 is using Device 0 of 4
Rank 1 is using Device 1 of 4
 the received data is: 0
 the received data is: 1
 the received data is: 2
 the received data is: 3
 the received data is: 4
 the received data is: 5
 the received data is: 6
 the received data is: 7
 the received data is: 8
 the received data is: 9
% mpicxx testmpi.cpp -acc -Minfo=accel -DUSE_CUDA_AWARE_MPI
main:
40, Generating enter data create(x[:1024])
41, Generating present(x[:1])
Generating Tesla code
43, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
65, Generating present(x[:1])
Generating Tesla code
70, #pragma acc loop gang, vector(10) /* blockIdx.x threadIdx.x */
75, Generating exit data delete(x[:1])
% mpirun -np 2 a.out
Rank 1 is using Device 1 of 4
Rank 0 is using Device 0 of 4
 the received data is: 0
 the received data is: 1
 the received data is: 2
 the received data is: 3
 the received data is: 4
 the received data is: 5
 the received data is: 6
 the received data is: 7
 the received data is: 8
 the received data is: 9
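If you're not sure whether your MPI is CUDA-aware, Open MPI exposes an extension you can query. Here's a minimal sketch, assuming Open MPI built with its CUDA extension; the MPIX_* names are Open MPI-specific extensions declared in mpi-ext.h, not part of the MPI standard:

#include "mpi.h"
#if defined(OPEN_MPI)
#include "mpi-ext.h"   // Open MPI extensions; declares MPIX_CUDA_AWARE_SUPPORT
#endif
#include <cstdio>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    // Compile-time support can still be disabled at run time, so query both.
    printf("Compile-time CUDA-aware support: yes; run time: %s\n",
           MPIX_Query_cuda_support() ? "yes" : "no");
#else
    printf("This MPI was not built with CUDA-aware support.\n");
#endif
    MPI_Finalize();
    return 0;
}

Compile it with mpicxx and run it on a single rank to see what your installation reports.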