Compiling my program with
mpif90 -g -ta=nvidia,cuda4.0,maxregcount:32 -Minfo=accel,time -c main_MPI.f90
I get a pgfortran-Fatal error (using PGI 11.8):
pgfortran-Fatal-/caspur/shared/sw/devel/pgi/linux86-64/11.8/bin/pgf902 TERMINATED by signal 11
Arguments to /caspur/shared/sw/devel/pgi/linux86-64/11.8/bin/pgf902
/caspur/shared/sw/devel/pgi/linux86-64/11.8/bin/pgf902 /tmp/pgfortranestfm9JlPDKl.ilm -fn main_MPI.f90 -debug -x 120 0x200 -opt 2 -terse 1 -inform warn -x 51 0x20 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -x 59 4 -x 59 4 -tp nehalem -x 124 0x1400 -y 15 2 -x 57 0x3b0000 -x 58 0x48000000 -x 49 0x100 -x 120 0x200 -astype 0 -x 124 1 -x 163 0x10001 -x 186 1 -accel nvidia -x 175 32 -x 176 0x140000 -x 177 0x0202007f -x 0 1 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -cmdline '+pgfortran main_MPI.f90 -g -ta=nvidia,cuda4.0,maxregcount:32 -Minfo=accel,time -c -I/work/ady/fsalvado/OPENMPI/BUILD_1.4.4_PGI11.8_LOUIS/include -I/work/ady/fsalvado/OPENMPI/BUILD_1.4.4_PGI11.8_LOUIS/lib' -asm /tmp/pgfortranKstfS9JIoAK8.s
Can anybody help me? I have simplified the code (though it could probably be reduced further):
module storage
  ! Global problem description and field arrays shared by the whole program.
  implicit none

  ! Path prefix (presumably for data files; not used in the code shown here).
  character(len=128), parameter :: path_dati = './'

  ! Real kinds; myk selects the working precision of the field arrays.
  integer, parameter :: dp_kind = kind(1.d0)
  integer, parameter :: sp_kind = kind(1.)
  integer, parameter :: myk = dp_kind

  ! First/last index owned by this process in each direction
  ! (assigned in start_mpi).
  integer :: istart_x, iend_x
  integer :: istart_y, iend_y
  integer :: istart_z, iend_z

  ! Number of extra layers added on each side of the var_* arrays
  ! (see allocate_storage) -- presumably ghost/halo cells.
  integer, parameter :: ng_x=3, ng_y=3, ng_z=3

  ! Total number of points per direction; start_mpi splits these
  ! among the processes.
  integer, parameter :: nt_x=60, nt_y=680, nt_z=60

  ! Field variables (allocated with the extra ng_* layers).
  real(myk), dimension(:,:,:), allocatable :: var_rho, var_rhu, var_rhv
  real(myk), dimension(:,:,:), allocatable :: var_rhw, var_rhe

  ! Right-hand sides (allocated on the owned index range only).
  real(myk), dimension(:,:,:), allocatable :: rhs_rho, rhs_rhu, rhs_rhv
  real(myk), dimension(:,:,:), allocatable :: rhs_rhw, rhs_rhe

  ! PGI Accelerator directives: keep a device copy of each array.
!$acc mirror (var_rho,var_rhu,var_rhv,var_rhw,var_rhe)
!$acc mirror (rhs_rho,rhs_rhu,rhs_rhv,rhs_rhw,rhs_rhe)
end module storage
module mpi_storage
  ! State describing the MPI Cartesian decomposition (filled in by start_mpi).
  use storage
  use mpi
  implicit none

  ! Number of decomposed directions.
  integer, parameter :: n_decomp=3

  ! Arguments for mpi_cart_create; reorder is set to .false. in start_mpi.
  logical :: reorder
  logical, dimension(n_decomp) :: periodicbc=(/.true. ,.false. ,.true. /)

  ! MPI status buffer and error code.
  integer :: istatus(MPI_STATUS_SIZE)
  integer :: ierr

  ! Rank of this process and total number of processes.
  integer :: n_rank, n_proc

  ! Neighbour ranks returned by mpi_cart_shift along directions 0, 1 and 2.
  integer :: n_left, n_right
  integer :: n_bottom, n_top
  integer :: n_back, n_front

  ! n_block: processes per direction; n_cart: Cartesian coordinates of this
  ! process. n_truebc_start/n_truebc_end are not used in the code shown here.
  integer, dimension(n_decomp) :: n_block, n_cart
  integer, dimension(n_decomp) :: n_truebc_start, n_truebc_end

  ! Work arrays handed to MapDataToProc (start, end and size of each block).
  integer, dimension(:), allocatable :: all_st, all_en, all_sz

  ! Index range owned by this process in each direction.
  integer :: mpi_istart_x, mpi_iend_x
  integer :: mpi_istart_y, mpi_iend_y
  integer :: mpi_istart_z, mpi_iend_z

  integer :: i_dim

  ! Cartesian communicator created by mpi_cart_create.
  integer :: mpi_comm_cart

  ! MPI datatype (presumably the one matching real(myk) -- TODO confirm).
  integer :: mpi_prec = MPI_REAL8
end module mpi_storage
program cubeflu
  ! Driver: set up MPI, allocate the fields, initialise them to one,
  ! run one evaluation step on the accelerator and shut MPI down.
  use storage
  use mpi_storage
  use timing
  use mpi
  implicit none

  call start_mpi()
  call allocate_storage()

  ! Host-side initialisation of every field and right-hand side.
  var_rho = 1.0_myk
  var_rhu = 1.0_myk
  var_rhv = 1.0_myk
  var_rhw = 1.0_myk
  var_rhe = 1.0_myk

  rhs_rho = 1.0_myk
  rhs_rhu = 1.0_myk
  rhs_rhv = 1.0_myk
  rhs_rhw = 1.0_myk
  rhs_rhe = 1.0_myk

  ! Push the initial values to the device copies.
!$acc update device( var_rho, var_rhu, var_rhv, var_rhw, var_rhe)
!$acc update device( rhs_rho, rhs_rhu, rhs_rhv, rhs_rhw, rhs_rhe)

  call var_eval()

  ! Bring the updated fields back to the host.
!$acc update host( var_rho, var_rhu, var_rhv, var_rhw, var_rhe)

  call end_mpi()
end program cubeflu
subroutine allocate_storage
  ! Allocate the field arrays of module storage for the index range owned
  ! by this process. The var_* arrays get ng_* extra layers on each side;
  ! the rhs_* arrays cover the owned range only.
  use storage
  implicit none
  integer :: lo_x, hi_x, lo_y, hi_y, lo_z, hi_z

  ! Bounds of the padded var_* arrays.
  lo_x = istart_x - ng_x
  hi_x = iend_x + ng_x
  lo_y = istart_y - ng_y
  hi_y = iend_y + ng_y
  lo_z = istart_z - ng_z
  hi_z = iend_z + ng_z

  allocate(var_rho(lo_x:hi_x, lo_y:hi_y, lo_z:hi_z))
  allocate(var_rhu(lo_x:hi_x, lo_y:hi_y, lo_z:hi_z))
  allocate(var_rhv(lo_x:hi_x, lo_y:hi_y, lo_z:hi_z))
  allocate(var_rhw(lo_x:hi_x, lo_y:hi_y, lo_z:hi_z))
  allocate(var_rhe(lo_x:hi_x, lo_y:hi_y, lo_z:hi_z))

  allocate(rhs_rho(istart_x:iend_x, istart_y:iend_y, istart_z:iend_z))
  allocate(rhs_rhu(istart_x:iend_x, istart_y:iend_y, istart_z:iend_z))
  allocate(rhs_rhv(istart_x:iend_x, istart_y:iend_y, istart_z:iend_z))
  allocate(rhs_rhw(istart_x:iend_x, istart_y:iend_y, istart_z:iend_z))
  allocate(rhs_rhe(istart_x:iend_x, istart_y:iend_y, istart_z:iend_z))
end subroutine allocate_storage
subroutine var_eval()
  ! Add each right-hand side to its field over the owned index range,
  ! inside a PGI Accelerator compute region.
  use storage
  implicit none
  integer :: ix, iy, iz

  ! Leftmost index runs in the innermost loop (column-major storage).
!$acc region
  do iz = istart_z, iend_z
    do iy = istart_y, iend_y
      do ix = istart_x, iend_x
        var_rho(ix,iy,iz) = var_rho(ix,iy,iz) + rhs_rho(ix,iy,iz)
        var_rhu(ix,iy,iz) = var_rhu(ix,iy,iz) + rhs_rhu(ix,iy,iz)
        var_rhv(ix,iy,iz) = var_rhv(ix,iy,iz) + rhs_rhv(ix,iy,iz)
        var_rhw(ix,iy,iz) = var_rhw(ix,iy,iz) + rhs_rhw(ix,iy,iz)
        var_rhe(ix,iy,iz) = var_rhe(ix,iy,iz) + rhs_rhe(ix,iy,iz)
      end do
    end do
  end do
!$acc end region
end subroutine var_eval
subroutine start_mpi
  ! Initialise MPI, build a 3-direction Cartesian process grid, and work out
  ! which index range of the nt_x x nt_y x nt_z grid this process owns.
  ! Results are stored in modules mpi_storage (mpi_istart_*/mpi_iend_*,
  ! neighbour ranks) and storage (istart_*/iend_*).
  use mpi
  use storage
  use mpi_storage
  implicit none
  integer :: idir
  integer, dimension(3) :: n_points, first_idx, last_idx

  reorder = .false.

  call mpi_init(ierr)
  call mpi_comm_rank(mpi_comm_world, n_rank, ierr)
  call mpi_comm_size(mpi_comm_world, n_proc, ierr)
  print*,'MPI size,rank: ',n_proc,n_rank

  ! Let MPI choose the number of processes per direction
  ! (zero entries in n_block are free for mpi_dims_create to set).
  n_block = 0
  call mpi_dims_create(n_proc, n_decomp, n_block, ierr)
  call mpi_cart_create(mpi_comm_world, n_decomp, n_block, periodicbc, &
                       reorder, mpi_comm_cart, ierr)
  print*,'MPI grid: ',n_block(1:n_decomp)

  ! Coordinates of this process within the Cartesian grid.
  call mpi_cart_coords(mpi_comm_cart, n_rank, n_decomp, n_cart, ierr)
  ! call mpi_cart_rank(mpi_comm_cart, ncoords, newrank, ierr)

  ! For each direction, split the points among the process blocks and keep
  ! the range belonging to this process's coordinate.
  ! call MapDataToProc(nt_x,n_block(1),all_st,all_en,all_sz,ratio)
  n_points = (/ nt_x, nt_y, nt_z /)
  do idir = 1, 3
    allocate(all_st(0:n_block(idir)-1))
    allocate(all_en(0:n_block(idir)-1))
    allocate(all_sz(0:n_block(idir)-1))
    call MapDataToProc(n_points(idir), n_block(idir), all_st, all_en, all_sz)
    first_idx(idir) = all_st(n_cart(idir))
    last_idx(idir)  = all_en(n_cart(idir))
    deallocate(all_st, all_en, all_sz)
  end do

  mpi_istart_x = first_idx(1)
  mpi_iend_x   = last_idx(1)
  mpi_istart_y = first_idx(2)
  mpi_iend_y   = last_idx(2)
  mpi_istart_z = first_idx(3)
  mpi_iend_z   = last_idx(3)

  ! Neighbour ranks for a shift of +1 along each direction
  ! (first result is the source rank, second the destination rank).
  call mpi_cart_shift(mpi_comm_cart, 0, 1, n_left, n_right, ierr)
  call mpi_cart_shift(mpi_comm_cart, 1, 1, n_bottom, n_top, ierr)
  call mpi_cart_shift(mpi_comm_cart, 2, 1, n_back, n_front, ierr)

  print*,'Rank: ',n_rank,'; n_cart: ',n_cart(1:n_decomp)
  print*,'Rank: ',n_rank,'; neigh: ',n_front,n_back,n_top,n_bottom,n_left,n_right
  print*,'mpi_istart_x,mpi_iend_x: ',mpi_istart_x,mpi_iend_x
  print*,'mpi_istart_y,mpi_iend_y: ',mpi_istart_y,mpi_iend_y
  print*,'mpi_istart_z,mpi_iend_z: ',mpi_istart_z,mpi_iend_z

  ! Publish the owned range to module storage.
  istart_x = mpi_istart_x
  iend_x   = mpi_iend_x
  istart_y = mpi_istart_y
  iend_y   = mpi_iend_y
  istart_z = mpi_istart_z
  iend_z   = mpi_iend_z
end subroutine start_mpi
subroutine MapDataToProc(n_tot,n_proc_dir,st,en,sz)
  ! Split the index range 1..n_tot into n_proc_dir consecutive blocks.
  !
  !   n_tot      : total number of points
  !   n_proc_dir : number of blocks
  !   st, en, sz : on return, first index, last index and size of each
  !                block, indexed 0..n_proc_dir-1
  !
  ! Every block gets n_tot/n_proc_dir points; the remainder is handed out
  ! one extra point each to the last blocks.
  implicit none
  integer :: n_tot, n_proc_dir
  integer :: st(0:n_proc_dir-1), en(0:n_proc_dir-1), sz(0:n_proc_dir-1)
  integer :: rank, base, n_large, n_small, last

  base    = n_tot/n_proc_dir
  n_large = n_tot - base*n_proc_dir   ! blocks that receive one extra point
  n_small = n_proc_dir - n_large      ! blocks of the base size
  last    = n_proc_dir - 1

  ! Block sizes: the first n_small blocks are small, the rest large.
  do rank = 0, last
    if (rank < n_small) then
      sz(rank) = base
    else
      sz(rank) = base + 1
    end if
  end do

  ! Running start/end indices; each block begins right after the previous.
  st(0) = 1
  en(0) = sz(0)
  do rank = 1, last
    st(rank) = en(rank-1) + 1
    en(rank) = en(rank-1) + sz(rank)
  end do

  ! Pin the last block to the end of the range.
  en(last) = n_tot
  sz(last) = n_tot - st(last) + 1
end subroutine MapDataToProc
subroutine end_mpi
  ! Shut down MPI; the returned error code is not inspected.
  use mpi
  implicit none
  integer :: finalize_err

  call mpi_finalize(finalize_err)
end subroutine end_mpi