Dear all,
I am trying to use CUDA-aware MPI communications (NVFortran 24.7-0, on a dual-GPU node) together with OpenACC, more specifically with device buffers allocated via acc_malloc. Unfortunately, the test fails: the run aborts with a segmentation fault complaining that the address of the send/receive buffer is not mapped to an object.
Below I report the complete minimal test; it is simple:
- initialize MPI and the devices environment;
- allocate buffer_dev on the devices using acc_malloc (through an iso_c_binding interface);
- populate buffer_dev with an OpenACC loop, using different values on each device;
- do an MPI send/receive;
- check the result of the communication.
Unfortunately, the send instruction raises an error: Caught signal 11 (Segmentation fault: address not mapped to object at address…).
This test tries to mimic the C example posted in this old thread:
Here is my Fortran test:
program test_deviceptr_mpi
  use :: iso_c_binding
  use :: mpi
  use :: openacc
  implicit none
  integer                   :: sizes(3)=[1,2,3]  ! arrays sizes
  real, pointer             :: buffer_dev(:,:,:) ! device work array
  real, allocatable, target :: buffer_hos(:,:,:) ! host work array
  type(c_ptr)               :: cptr              ! c-pointer
  integer(c_size_t)         :: bytes             ! number of bytes of arrays
  integer                   :: ierr              ! error status
  integer                   :: procs_number      ! MPI processes number
  integer                   :: myrank            ! MPI current ID
  character(:), allocatable :: myrankstr         ! MPI ID stringified
  integer                   :: local_comm        ! MPI local communicator
  integer                   :: local_rank        ! local MPI split ID
  integer                   :: devices_number    ! devices number
  integer                   :: mydev             ! device current ID
  integer                   :: i, j, k           ! counters

  interface
    function acc_malloc_f(total_byte_dim) bind(c, name="acc_malloc")
      use iso_c_binding, only : c_ptr, c_size_t
      implicit none
      type(c_ptr)                          :: acc_malloc_f
      integer(c_size_t), value, intent(in) :: total_byte_dim
    endfunction acc_malloc_f
    subroutine acc_memcpy_from_device_f(host_ptr, dev_ptr, total_byte_dim) bind(c, name="acc_memcpy_from_device")
      use iso_c_binding, only : c_ptr, c_size_t
      implicit none
      type(c_ptr),       value :: host_ptr
      type(c_ptr),       value :: dev_ptr
      integer(c_size_t), value :: total_byte_dim
    endsubroutine acc_memcpy_from_device_f
  endinterface

  ! initialize MPI and devices env
  call MPI_INIT(ierr)
  call MPI_COMM_SIZE(MPI_COMM_WORLD, procs_number, ierr)
  call MPI_COMM_RANK(MPI_COMM_WORLD, myrank, ierr)
  call MPI_COMM_SPLIT_TYPE(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, local_comm, ierr)
  call MPI_COMM_RANK(local_comm, local_rank, ierr)
  myrankstr = repeat(' ',5)
  write(myrankstr, '(I5.5)') myrank
  myrankstr = 'proc'//trim(adjustl(myrankstr))//':'
  devices_number = acc_get_num_devices(acc_device_nvidia)
  mydev = mod(local_rank, devices_number)
  call acc_set_device_num(mydev, acc_device_nvidia)
  call acc_init(acc_device_nvidia)
  print '(A,2I2)', myrankstr//' devices number, mydev', devices_number, mydev
  call MPI_BARRIER(MPI_COMM_WORLD, ierr)

  ! allocate work arrays on host and devices
  bytes = int(storage_size(buffer_dev)/8, c_size_t) * int(product(sizes), c_size_t)
  cptr = acc_malloc_f(bytes)
  if (c_associated(cptr)) call c_f_pointer(cptr, buffer_dev, shape=sizes)
  allocate(buffer_hos(sizes(1),sizes(2),sizes(3))) ; buffer_hos = -1.0

  ! prepare buffer_dev array
  !$acc parallel loop collapse(3) deviceptr(buffer_dev)
  do k=1, sizes(3)
    do j=1, sizes(2)
      do i=1, sizes(1)
        if (myrank == 0) then
          buffer_dev(i,j,k) = 0.0
        else
          buffer_dev(i,j,k) = 1.0
        endif
      enddo
    enddo
  enddo

  ! check buffer status
  call acc_memcpy_from_device_f(c_loc(buffer_hos), c_loc(buffer_dev), bytes)
  print '(A)', myrankstr//' buffer_dev array'
  do k=1, sizes(3)
    do j=1, sizes(2)
      do i=1, sizes(1)
        print '(A,3I3,F5.1)', myrankstr//' i j k a:', i,j,k,buffer_hos(i,j,k)
      enddo
    enddo
  enddo
  call MPI_BARRIER(MPI_COMM_WORLD, ierr)

  ! MPI send from dev 1 to dev 0
  !!$acc data deviceptr(buffer_dev)
  !!$acc host_data use_device(buffer_dev)
  if (myrank == 1) call MPI_SEND(buffer_dev, 6, MPI_REAL, 0, 101, MPI_COMM_WORLD, ierr) ! buffer_dev is default real, hence MPI_REAL
  if (myrank == 0) call MPI_RECV(buffer_dev, 6, MPI_REAL, 1, 101, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
  !!$acc end host_data
  !!$acc end data
  call MPI_BARRIER(MPI_COMM_WORLD, ierr)
  if (myrank == 0) then
    print '(A)', myrankstr//' check communication result'
    call acc_memcpy_from_device_f(c_loc(buffer_hos), c_loc(buffer_dev), bytes)
    print '(A)', myrankstr//' buffer_dev array'
    do k=1, sizes(3)
      do j=1, sizes(2)
        do i=1, sizes(1)
          print '(A,3I3,F5.1)', myrankstr//' i j k a:', i,j,k,buffer_hos(i,j,k)
        enddo
      enddo
    enddo
    if (any(int(buffer_hos) /= 1)) then
      print '(A)', myrankstr//' communication failed'
    else
      print '(A)', myrankstr//' communication done'
    endif
  endif
  call MPI_FINALIZE(ierr)
endprogram test_deviceptr_mpi
Note that “decorating” the send/recv calls with OpenACC directives (the data deviceptr / host_data use_device pairs shown commented out above) has no effect: I obtain the segfault both with and without them.
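For reference, the pattern I would normally use, and which I am deliberately avoiding here because I want to manage the device memory myself with acc_malloc, lets the OpenACC runtime create the device copy of an ordinary host array and hands its device address to MPI through host_data use_device. The fragment below is only a sketch, assuming the same MPI/device initialization as in the test above; buffer is an illustrative runtime-managed array, not a variable of my test.

  ! sketch only: runtime-managed device copy instead of acc_malloc
  real, allocatable :: buffer(:,:,:)

  allocate(buffer(sizes(1),sizes(2),sizes(3)))
  !$acc enter data create(buffer)

  ! populate the device copy (buffer is already present on the device)
  !$acc parallel loop collapse(3) present(buffer)
  do k=1, sizes(3)
    do j=1, sizes(2)
      do i=1, sizes(1)
        buffer(i,j,k) = merge(0.0, 1.0, myrank == 0)
      enddo
    enddo
  enddo

  ! host_data makes the (CUDA-aware) MPI calls see the device address of buffer
  !$acc host_data use_device(buffer)
  if (myrank == 1) call MPI_SEND(buffer, product(sizes), MPI_REAL, 0, 101, MPI_COMM_WORLD, ierr)
  if (myrank == 0) call MPI_RECV(buffer, product(sizes), MPI_REAL, 1, 101, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
  !$acc end host_data

  !$acc exit data delete(buffer)

My question, however, is specifically about buffers that live only in device memory, allocated with acc_malloc as in the C example.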
The complete output I get from the acc_malloc test above is the following.
proc00001: devices number, mydev 2 1
proc00000: devices number, mydev 2 0
proc00001: buffer_dev array
proc00001: i j k a: 1 1 1 1.0
proc00001: i j k a: 1 2 1 1.0
proc00001: i j k a: 1 1 2 1.0
proc00001: i j k a: 1 2 2 1.0
proc00001: i j k a: 1 1 3 1.0
proc00001: i j k a: 1 2 3 1.0
proc00000: buffer_dev array
proc00000: i j k a: 1 1 1 0.0
proc00000: i j k a: 1 2 1 0.0
proc00000: i j k a: 1 1 2 0.0
proc00000: i j k a: 1 2 2 0.0
proc00000: i j k a: 1 1 3 0.0
proc00000: i j k a: 1 2 3 0.0
[adam:120514:0:120514] Caught signal 11 (Segmentation fault: address not mapped to object at address 0xec7358)
==== backtrace (tid: 120514) ====
0 0x0000000000042520 __sigaction() ???:0
1 0x0000000000013528 ucc_event_manager_init() /build-result/src/hpcx-v2.19-gcc-mlnx_ofed-redhat7-cuda12-x86_64/ucc-0b4a0780918900fa497b1e6a65485247fecec4a2/src/schedule/ucc_schedule.c:38
2 0x0000000000013528 ucc_coll_task_init() /build-result/src/hpcx-v2.19-gcc-mlnx_ofed-redhat7-cuda12-x86_64/ucc-0b4a0780918900fa497b1e6a65485247fecec4a2/src/schedule/ucc_schedule.c:126
3 0x0000000000011592 ucc_tl_shm_get_task() /build-result/src/hpcx-v2.19-gcc-mlnx_ofed-redhat7-cuda12-x86_64/ucc-0b4a0780918900fa497b1e6a65485247fecec4a2/src/components/tl/shm/barrier/../tl_shm_coll.h:62
4 0x0000000000011592 ucc_tl_shm_barrier_init() /build-result/src/hpcx-v2.19-gcc-mlnx_ofed-redhat7-cuda12-x86_64/ucc-0b4a0780918900fa497b1e6a65485247fecec4a2/src/components/tl/shm/barrier/barrier.c:107
5 0x0000000000017ab0 ucc_coll_init() /build-result/src/hpcx-v2.19-gcc-mlnx_ofed-redhat7-cuda12-x86_64/ucc-0b4a0780918900fa497b1e6a65485247fecec4a2/src/coll_score/ucc_coll_score_map.c:132
6 0x0000000000010066 ucc_collective_init() /build-result/src/hpcx-v2.19-gcc-mlnx_ofed-redhat7-cuda12-x86_64/ucc-0b4a0780918900fa497b1e6a65485247fecec4a2/src/core/ucc_coll.c:234
7 0x0000000000004ce3 mca_coll_ucc_barrier_init() /var/jenkins/workspace/rel_nv_lib_hpcx_cuda12_x86_64/work/rebuild_ompi/ompi/build/ompi/mca/coll/ucc/../../../../../ompi/mca/coll/ucc/coll_ucc_barrier.c:19
8 0x0000000000004ce3 mca_coll_ucc_barrier() /var/jenkins/workspace/rel_nv_lib_hpcx_cuda12_x86_64/work/rebuild_ompi/ompi/build/ompi/mca/coll/ucc/../../../../../ompi/mca/coll/ucc/coll_ucc_barrier.c:32
9 0x00000000000618f8 PMPI_Barrier() /var/jenkins/workspace/rel_nv_lib_hpcx_cuda12_x86_64/work/rebuild_ompi/ompi/build/ompi/mpi/c/profile/pbarrier.c:74
10 0x0000000000044e73 ompi_barrier_f() /var/jenkins/workspace/rel_nv_lib_hpcx_cuda12_x86_64/work/rebuild_ompi/ompi/build/ompi/mpi/fortran/mpif-h/profile/pbarrier_f.c:76
11 0x0000000000403676 MAIN_() /home/stefano/fortran/FUNDAL/compilers_proofs/oac/test_deviceptr_mpi.f90:95
12 0x00000000004024f1 main() ???:0
13 0x0000000000029d90 __libc_init_first() ???:0
14 0x0000000000029e40 __libc_start_main() ???:0
15 0x00000000004023e5 _start() ???:0
=================================
[adam:120514] *** Process received signal ***
[adam:120514] Signal: Segmentation fault (11)
[adam:120514] Signal code: (-6)
[adam:120514] Failing at address: 0x3e80001d6c2
[adam:120514] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x42520)[0x7f99c1219520]
[adam:120514] [ 1] /opt/nvidia/hpc_sdk/Linux_x86_64/24.7/comm_libs/12.5/hpcx/hpcx-2.19/ucc/lib/libucc.so.1(ucc_coll_task_init+0xf8)[0x7f99a0c13528]
[adam:120514] [ 2] /opt/nvidia/hpc_sdk/Linux_x86_64/24.7/comm_libs/12.5/hpcx/hpcx-2.19/ucc/lib/ucc/libucc_tl_shm.so(ucc_tl_shm_barrier_init+0x92)[0x7f99ae411592]
[adam:120514] [ 3] /opt/nvidia/hpc_sdk/Linux_x86_64/24.7/comm_libs/12.5/hpcx/hpcx-2.19/ucc/lib/libucc.so.1(ucc_coll_init+0x110)[0x7f99a0c17ab0]
[adam:120514] [ 4] /opt/nvidia/hpc_sdk/Linux_x86_64/24.7/comm_libs/12.5/hpcx/hpcx-2.19/ucc/lib/libucc.so.1(ucc_collective_init+0x1c6)[0x7f99a0c10066]
[adam:120514] [ 5] /opt/nvidia/hpc_sdk/Linux_x86_64/24.7/comm_libs/12.5/hpcx/hpcx-2.19/ompi/lib/openmpi/mca_coll_ucc.so(mca_coll_ucc_barrier+0x73)[0x7f99a1004ce3]
[adam:120514] [ 6] /opt/nvidia/hpc_sdk/Linux_x86_64/24.7/comm_libs/12.5/hpcx/hpcx-2.19/ompi/lib/libmpi.so.40(MPI_Barrier+0x38)[0x7f99c48618f8]
[adam:120514] [ 7] /opt/nvidia/hpc_sdk/Linux_x86_64/24.7/comm_libs/12.5/hpcx/hpcx-2.19/ompi/lib/libmpi_mpifh.so.40(MPI_Barrier_f08+0x13)[0x7f99c4c44e73]
[adam:120514] [ 8] a.out[0x403676]
[adam:120514] [ 9] a.out[0x4024f1]
[adam:120514] [10] /lib/x86_64-linux-gnu/libc.so.6(+0x29d90)[0x7f99c1200d90]
[adam:120514] [11] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x80)[0x7f99c1200e40]
[adam:120514] [12] a.out[0x4023e5]
[adam:120514] *** End of error message ***
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node adam exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------
I am wondering why I cannot mimic the C example in Fortran: is my test wrong, or are there differences between the compiler's C and Fortran implementations of OpenACC?
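One possible workaround I am considering, but have not tried yet, is to register the host array as the host counterpart of the acc_malloc'd device memory with acc_map_data (wrapped with a bind(c) interface in the same style as acc_malloc_f above; the wrapper name acc_map_data_f is mine), so that the runtime knows the host/device association and host_data use_device can translate the address for MPI:

  ! untested sketch: map buffer_hos onto the device memory returned by acc_malloc
  interface
    subroutine acc_map_data_f(host_ptr, dev_ptr, total_byte_dim) bind(c, name="acc_map_data")
      use iso_c_binding, only : c_ptr, c_size_t
      implicit none
      type(c_ptr),       value :: host_ptr
      type(c_ptr),       value :: dev_ptr
      integer(c_size_t), value :: total_byte_dim
    endsubroutine acc_map_data_f
  endinterface

  ! after cptr = acc_malloc_f(bytes) and allocate(buffer_hos(...)):
  call acc_map_data_f(c_loc(buffer_hos), cptr, bytes)

  ! now host_data can resolve buffer_hos to the device address for the MPI calls
  !$acc host_data use_device(buffer_hos)
  if (myrank == 1) call MPI_SEND(buffer_hos, 6, MPI_REAL, 0, 101, MPI_COMM_WORLD, ierr)
  if (myrank == 0) call MPI_RECV(buffer_hos, 6, MPI_REAL, 1, 101, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
  !$acc end host_data

Even if that workaround were viable, I would still like to understand why passing the deviceptr-associated Fortran pointer directly to MPI_SEND/MPI_RECV fails.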
Any suggestions are more than welcome.
Kind regards,
Stefano