Segfault with MPI_Send + acc_malloc

Hi fablup,

“acc_malloc” returns a device pointer, so the memory it points to can’t be accessed on the host, which is why you’re getting a segv. Instead, only use “buffer” on the device. For example:

% cat test_acc_malloc.c

#include <stdio.h>
#include <mpi.h>
#include "openacc.h"
#include "mpi-ext.h" /* Needed for CUDA-aware check */

/*
 * Sends BUF_LEN ints from rank 0 to rank 1 through a buffer obtained from
 * acc_malloc.  acc_malloc returns a device pointer, so the buffer is only
 * dereferenced inside OpenACC compute regions (deviceptr clause) and is
 * otherwise just handed to the MPI calls.
 *
 * Needs at least two MPI ranks and at least one NVIDIA device; aborts the
 * job otherwise.  Returns 0 on success.
 */
int main(int argc, char* argv[])
{
  enum { BUF_LEN = 10 };

  MPI_Init(&argc, &argv);

  if (1 == MPIX_Query_cuda_support()) {
      printf("This MPI library has CUDA-aware support.\n");
  } else {
      printf("This MPI library does not have CUDA-aware support.\n");
  }

  int rank = -1;
  int nranks = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  //printf("rank=%d\n", rank);

  /* Rank 0 sends to rank 1, so a single-rank run could never complete. */
  if (nranks < 2) {
    fprintf(stderr, "This example needs at least 2 MPI ranks.\n");
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  int ngpus = acc_get_num_devices(acc_device_nvidia);
  /* Guard the modulo below: rank % 0 is undefined behavior. */
  if (ngpus <= 0) {
    fprintf(stderr, "rank %d: no NVIDIA device found.\n", rank);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  /* Spread the ranks round-robin over the visible devices. */
  int devicenum = (rank)%(ngpus);
  //printf("devicenum=%d\n", devicenum);
  acc_set_device_num(devicenum, acc_device_nvidia);

  /* Device pointer: must not be dereferenced in host code. */
  int *buffer = acc_malloc((size_t)BUF_LEN * sizeof *buffer);
  if (buffer == NULL) {
    fprintf(stderr, "rank %d: acc_malloc failed.\n", rank);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  if (rank == 0) {
    /* Fill the buffer on the device; the directive must sit directly on the loop. */
    #pragma acc parallel loop deviceptr(buffer)
    for (int i=0; i<BUF_LEN; i++) buffer[i] = i;
  }

  if (rank == 0) {
    MPI_Send(buffer, BUF_LEN, MPI_INT, 1, 0, MPI_COMM_WORLD);
  } else if (rank == 1) {
    /* Only rank 1 has a matching send; other ranks would block forever. */
    MPI_Recv(buffer, BUF_LEN, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    /* Read the received value inside a device region, not on the host. */
#pragma acc serial deviceptr(buffer)
    {
      printf("rank=1, %d\n", buffer[2]);
    }
  }

  acc_free(buffer);
  MPI_Finalize();
  return 0;
}


% mpicc -ta=tesla -Minfo=accel test_acc_malloc.c
     30, Generating Tesla code
         31, #pragma acc loop gang, vector(10) /* blockIdx.x threadIdx.x */
     40, Accelerator serial kernel generated
         Generating Tesla code
% mpirun -np 2 a.out
This MPI library has CUDA-aware support.
This MPI library has CUDA-aware support.
rank=1, 2

A better solution, though, is to use the “host_data” directive to pass the device pointer to the MPI calls. For example:

% cat test_host_data.c
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include "openacc.h"
#include "mpi-ext.h" /* Needed for CUDA-aware check */

/*
 * Sends BUF_LEN ints from rank 0 to rank 1 using a host-allocated buffer
 * that is mirrored on the device with "enter data".  The MPI calls are
 * wrapped in "host_data use_device(buffer)" so that they are passed the
 * device pointer; the receiver then copies the data back to the host with
 * "update self" before printing it.
 *
 * Needs at least two MPI ranks and at least one NVIDIA device; aborts the
 * job otherwise.  Returns 0 on success.
 */
int main(int argc, char* argv[])
{
  enum { BUF_LEN = 10 };

  MPI_Init(&argc, &argv);

  if (1 == MPIX_Query_cuda_support()) {
      printf("This MPI library has CUDA-aware support.\n");
  } else {
      printf("This MPI library does not have CUDA-aware support.\n");
  }

  int rank = -1;
  int nranks = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  //printf("rank=%d\n", rank);

  /* Rank 0 sends to rank 1, so a single-rank run could never complete. */
  if (nranks < 2) {
    fprintf(stderr, "This example needs at least 2 MPI ranks.\n");
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  int ngpus = acc_get_num_devices(acc_device_nvidia);
  /* Guard the modulo below: rank % 0 is undefined behavior. */
  if (ngpus <= 0) {
    fprintf(stderr, "rank %d: no NVIDIA device found.\n", rank);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  /* Spread the ranks round-robin over the visible devices. */
  int devicenum = (rank)%(ngpus);
  //printf("devicenum=%d\n", devicenum);
  acc_set_device_num(devicenum, acc_device_nvidia);

  /* calloc so that non-sending ranks do not copy indeterminate values to the device. */
  int *buffer = calloc(BUF_LEN, sizeof *buffer);
  if (buffer == NULL) {
    fprintf(stderr, "rank %d: host allocation failed.\n", rank);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  if (rank == 0) {
     for (int i=0; i<BUF_LEN; i++) buffer[i] = i;
  }
  /* Create the device copy on every rank; rank 0's copy carries the payload. */
#pragma acc enter data copyin(buffer[:BUF_LEN])

  if (rank == 0) {
    /* Inside host_data, "buffer" refers to the device address. */
#pragma acc host_data use_device(buffer)
    {
      MPI_Send(buffer, BUF_LEN, MPI_INT, 1, 0, MPI_COMM_WORLD);
    }
  } else if (rank == 1) {
    /* Only rank 1 has a matching send; other ranks would block forever. */
#pragma acc host_data use_device(buffer)
    {
      MPI_Recv(buffer, BUF_LEN, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    /* The data arrived in device memory; refresh the host copy before reading it. */
#pragma acc update self(buffer[:BUF_LEN])
    printf("rank=1, %d\n", buffer[2]);
  }

#pragma acc exit data delete(buffer[:BUF_LEN])
  free(buffer);
  MPI_Finalize();
  return 0;
}


% mpicc -ta=tesla -Minfo=accel test_host_data.c
     33, Generating enter data copyin(buffer[:10])
     45, Generating update self(buffer[:10])
     49, Generating exit data delete(buffer[:1])
% mpirun -np 2 a.out
This MPI library has CUDA-aware support.
This MPI library has CUDA-aware support.
rank=1, 2

Hope this helps,