Hi fablup,
“acc_malloc” returns a device pointer, so the buffer can’t be accessed on the host; that’s why you’re getting a segv. Instead, only use “buffer” on the device. For example:
% cat test_acc_malloc.c
#include <stdio.h>
#include <mpi.h>
#include "openacc.h"
#include "mpi-ext.h" /* Needed for CUDA-aware check */

int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);
    if (1 == MPIX_Query_cuda_support()) {
        printf("This MPI library has CUDA-aware support.\n");
    } else {
        printf("This MPI library does not have CUDA-aware support.\n");
    }
    int rank = -1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Assign each rank to a GPU, round-robin */
    int ngpus = acc_get_num_devices(acc_device_nvidia);
    int devicenum = rank % ngpus;
    acc_set_device_num(devicenum, acc_device_nvidia);
    acc_init(acc_device_nvidia);

    /* Allocate the buffer directly in device memory; it is not accessible from the host */
    int *buffer = (int *) acc_malloc((size_t)10*sizeof(int));
    if (rank == 0) {
        /* Initialize the buffer on the device */
        #pragma acc parallel loop deviceptr(buffer)
        for (int i=0; i<10; i++) buffer[i] = i;
    }
    if (rank == 0) {
        /* Hand the device address directly to MPI (requires CUDA-aware MPI) */
        MPI_Send(buffer, 10, MPI_INT, 1, 0, MPI_COMM_WORLD);
    }
    else {
        MPI_Recv(buffer, 10, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        /* Print from the device, since the host can't dereference buffer */
        #pragma acc serial deviceptr(buffer)
        {
            printf("rank=1, %d\n", buffer[2]);
        }
    }
    acc_free(buffer);
    MPI_Finalize();
}
% mpicc -ta=tesla -Minfo=accel test_acc_malloc.c
main:
30, Generating Tesla code
31, #pragma acc loop gang, vector(10) /* blockIdx.x threadIdx.x */
40, Accelerator serial kernel generated
Generating Tesla code
% mpirun -np 2 a.out
This MPI library has CUDA-aware support.
This MPI library has CUDA-aware support.
rank=1, 2
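As an aside, if you ever do need to look at the contents of an acc_malloc’d buffer from the host, the OpenACC runtime API provides acc_memcpy_from_device (and acc_memcpy_to_device) to copy between device and host memory. For instance, rank 1 could replace the “acc serial” region above with something like the following sketch (the “host_copy” array is just a name I’m using for illustration):

    int host_copy[10];
    /* Copy 10 ints from the device buffer into host memory */
    acc_memcpy_from_device(host_copy, buffer, (size_t)10*sizeof(int));
    printf("rank=1, %d\n", host_copy[2]);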
The better solution, though, is to use the “host_data” directive to pass the device pointer to the MPI calls. For example:
% cat test_host_data.c
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include "openacc.h"
#include "mpi-ext.h" /* Needed for CUDA-aware check */

int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);
    if (1 == MPIX_Query_cuda_support()) {
        printf("This MPI library has CUDA-aware support.\n");
    } else {
        printf("This MPI library does not have CUDA-aware support.\n");
    }
    int rank = -1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Assign each rank to a GPU, round-robin */
    int ngpus = acc_get_num_devices(acc_device_nvidia);
    int devicenum = rank % ngpus;
    acc_set_device_num(devicenum, acc_device_nvidia);
    acc_init(acc_device_nvidia);

    /* Ordinary host allocation, mirrored on the device via data directives */
    int *buffer = (int *) malloc((size_t)10*sizeof(int));
    if (rank == 0) {
        for (int i=0; i<10; i++) buffer[i] = i;
    }
    #pragma acc enter data copyin(buffer[:10])
    if (rank == 0) {
        /* host_data hands MPI the device address of buffer */
        #pragma acc host_data use_device(buffer)
        {
            MPI_Send(buffer, 10, MPI_INT, 1, 0, MPI_COMM_WORLD);
        }
    }
    else {
        #pragma acc host_data use_device(buffer)
        {
            MPI_Recv(buffer, 10, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        /* Bring the received data back to the host copy before printing */
        #pragma acc update self(buffer[:10])
        printf("rank=1, %d\n", buffer[2]);
    }
    #pragma acc exit data delete(buffer)
    free(buffer);
    MPI_Finalize();
}
% mpicc -ta=tesla -Minfo=accel test_host_data.c
main:
33, Generating enter data copyin(buffer[:10])
45, Generating update self(buffer[:10])
49, Generating exit data delete(buffer[:1])
% mpirun -np 2 a.out
This MPI library has CUDA-aware support.
This MPI library has CUDA-aware support.
rank=1, 2
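One note on the “host_data” version: since MPI_Send/MPI_Recv are handed raw device addresses, it relies on the MPI library actually being CUDA-aware. If MPIX_Query_cuda_support() returns 0, you’d instead keep the MPI calls on the host pointer and move the data yourself with “update” directives, roughly along these lines (untested sketch of the communication part only):

    if (rank == 0) {
        /* Bring the device data back to the host copy before sending */
        #pragma acc update self(buffer[:10])
        MPI_Send(buffer, 10, MPI_INT, 1, 0, MPI_COMM_WORLD);
    }
    else {
        MPI_Recv(buffer, 10, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        /* Push the received data to the device copy */
        #pragma acc update device(buffer[:10])
        printf("rank=1, %d\n", buffer[2]);
    }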
Hope this helps,
Mat