pgfortran-Fatal

Compiling my program with

mpif90  -g -ta=nvidia,cuda4.0,maxregcount:32 -Minfo=accel,time   -c main_MPI.f90

I get a pgfortran-Fatal error (using PGI 11.8):

pgfortran-Fatal-/caspur/shared/sw/devel/pgi/linux86-64/11.8/bin/pgf902 TERMINATED by signal 11
Arguments to /caspur/shared/sw/devel/pgi/linux86-64/11.8/bin/pgf902
/caspur/shared/sw/devel/pgi/linux86-64/11.8/bin/pgf902 /tmp/pgfortranestfm9JlPDKl.ilm -fn main_MPI.f90 -debug -x 120 0x200 -opt 2 -terse 1 -inform warn -x 51 0x20 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -x 59 4 -x 59 4 -tp nehalem -x 124 0x1400 -y 15 2 -x 57 0x3b0000 -x 58 0x48000000 -x 49 0x100 -x 120 0x200 -astype 0 -x 124 1 -x 163 0x10001 -x 186 1 -accel nvidia -x 175 32 -x 176 0x140000 -x 177 0x0202007f -x 0 1 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -cmdline '+pgfortran main_MPI.f90 -g -ta=nvidia,cuda4.0,maxregcount:32 -Minfo=accel,time -c -I/work/ady/fsalvado/OPENMPI/BUILD_1.4.4_PGI11.8_LOUIS/include -I/work/ady/fsalvado/OPENMPI/BUILD_1.4.4_PGI11.8_LOUIS/lib' -asm /tmp/pgfortranKstfS9JIoAK8.s

Can anybody help me? I have simplified the code (it could probably be reduced further):

module storage

character*128, parameter :: path_dati = './'

integer, parameter :: dp_kind = kind(1.d0)
integer, parameter :: sp_kind = kind(1.)
integer, parameter :: myk = dp_kind

integer :: istart_x
integer :: iend_x
integer, parameter :: ng_x=3
integer :: istart_y
integer :: iend_y
integer, parameter :: ng_y=3
integer :: istart_z
integer :: iend_z
integer, parameter :: ng_z=3
integer, parameter :: nt_x=60
integer, parameter :: nt_y=680
integer, parameter :: nt_z=60

real(myk), dimension(:,:,:), allocatable :: var_rho
real(myk), dimension(:,:,:), allocatable :: var_rhu
real(myk), dimension(:,:,:), allocatable :: var_rhv
real(myk), dimension(:,:,:), allocatable :: var_rhw
real(myk), dimension(:,:,:), allocatable :: var_rhe
real(myk), dimension(:,:,:), allocatable :: rhs_rho
real(myk), dimension(:,:,:), allocatable :: rhs_rhu
real(myk), dimension(:,:,:), allocatable :: rhs_rhv
real(myk), dimension(:,:,:), allocatable :: rhs_rhw
real(myk), dimension(:,:,:), allocatable :: rhs_rhe
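! Mirror each allocatable on the device: the device copy is allocated and
! deallocated together with the host array (PGI Accelerator data directive).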
!$acc mirror (var_rho,var_rhu,var_rhv,var_rhw,var_rhe)
!$acc mirror (rhs_rho,rhs_rhu,rhs_rhv,rhs_rhw,rhs_rhe)

end module storage

module mpi_storage
use storage
use mpi
implicit none
integer, parameter :: n_decomp=3
logical ::  reorder !=.false.
logical,dimension(n_decomp)::periodicbc=(/.true.  ,.false. ,.true. /)
integer :: istatus(MPI_STATUS_SIZE),ierr
integer :: n_rank,n_proc,n_left,n_right,n_top,n_bottom
integer :: n_front,n_back
integer, dimension(n_decomp) :: n_block,n_cart,n_truebc_start,n_truebc_end
integer, allocatable, dimension(:) :: all_st,all_en,all_sz
integer :: mpi_istart_x,mpi_iend_x
integer :: mpi_istart_y,mpi_iend_y
integer :: mpi_istart_z,mpi_iend_z
integer :: i_dim
integer :: mpi_comm_cart
integer :: mpi_prec = MPI_REAL8
end module mpi_storage

program cubeflu

use storage
use mpi_storage
use timing        ! timing module not included in this simplified listing
use mpi

call start_mpi()
call allocate_storage()
var_rho = 1. ; var_rhu = 1. ; var_rhv = 1. ; var_rhw = 1. ; var_rhe = 1.
rhs_rho = 1. ; rhs_rhu = 1. ; rhs_rhv = 1. ; rhs_rhw = 1. ; rhs_rhe = 1.
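! Copy the freshly initialized host arrays to their device mirrors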
!$acc update device( var_rho, var_rhu, var_rhv, var_rhw, var_rhe)
!$acc update device( rhs_rho, rhs_rhu, rhs_rhv, rhs_rhw, rhs_rhe)
call var_eval()     
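! Copy the updated var_* arrays back from the device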
!$acc update host( var_rho, var_rhu, var_rhv, var_rhw, var_rhe)
call end_mpi()

end program cubeflu

subroutine allocate_storage 

use storage

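! var_* arrays carry ng_* ghost layers in every direction; rhs_* arrays cover only the interior points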
allocate(var_rho(istart_x-ng_x:iend_x+ng_x,istart_y-ng_y:iend_y+ng_y,istart_z-ng_z:iend_z+ng_z))
allocate(var_rhu(istart_x-ng_x:iend_x+ng_x,istart_y-ng_y:iend_y+ng_y,istart_z-ng_z:iend_z+ng_z))
allocate(var_rhv(istart_x-ng_x:iend_x+ng_x,istart_y-ng_y:iend_y+ng_y,istart_z-ng_z:iend_z+ng_z))
allocate(var_rhw(istart_x-ng_x:iend_x+ng_x,istart_y-ng_y:iend_y+ng_y,istart_z-ng_z:iend_z+ng_z))
allocate(var_rhe(istart_x-ng_x:iend_x+ng_x,istart_y-ng_y:iend_y+ng_y,istart_z-ng_z:iend_z+ng_z))
allocate(rhs_rho(istart_x:iend_x,istart_y:iend_y,istart_z:iend_z))
allocate(rhs_rhu(istart_x:iend_x,istart_y:iend_y,istart_z:iend_z))
allocate(rhs_rhv(istart_x:iend_x,istart_y:iend_y,istart_z:iend_z))
allocate(rhs_rhw(istart_x:iend_x,istart_y:iend_y,istart_z:iend_z))
allocate(rhs_rhe(istart_x:iend_x,istart_y:iend_y,istart_z:iend_z))

end subroutine allocate_storage

subroutine var_eval()
use storage
implicit none 
integer i_x,i_y,i_z
! Evaluation loop: accumulate the rhs arrays into the var arrays on the accelerator.
!$acc region
do i_z=istart_z,iend_z
do i_y=istart_y,iend_y
do i_x=istart_x,iend_x
   var_rho(i_x,i_y,i_z) = var_rho(i_x,i_y,i_z) + rhs_rho(i_x,i_y,i_z)
   var_rhu(i_x,i_y,i_z) = var_rhu(i_x,i_y,i_z) + rhs_rhu(i_x,i_y,i_z)
   var_rhv(i_x,i_y,i_z) = var_rhv(i_x,i_y,i_z) + rhs_rhv(i_x,i_y,i_z)
   var_rhw(i_x,i_y,i_z) = var_rhw(i_x,i_y,i_z) + rhs_rhw(i_x,i_y,i_z)
   var_rhe(i_x,i_y,i_z) = var_rhe(i_x,i_y,i_z) + rhs_rhe(i_x,i_y,i_z)
enddo
enddo
enddo
!$acc end region
end subroutine var_eval

subroutine start_mpi

use mpi
use storage
use mpi_storage

implicit none

integer i_x,i_y,i_z
reorder = .false.
call mpi_init(ierr)
call mpi_comm_rank(mpi_comm_world,n_rank,ierr)
call mpi_comm_size(mpi_comm_world,n_proc,ierr)
print*,'MPI size,rank: ',n_proc,n_rank
! Decompose the processes into an n_decomp-dimensional block grid (n_block=0 lets MPI choose all dimensions)
n_block=0
call mpi_dims_create(n_proc,n_decomp,n_block,ierr)

call mpi_cart_create(mpi_comm_world,n_decomp,n_block,periodicbc,  &
                     reorder,mpi_comm_cart,ierr)
print*,'MPI grid: ',n_block(1:n_decomp)
! Obtain this process's coordinates within the Cartesian grid
call mpi_cart_coords(mpi_comm_cart,n_rank,n_decomp,n_cart,ierr)
!     call mpi_cart_rank(mpi_comm_cart, ncoords, newrank, ierr)
! For each of the n_decomp directions, distribute the nodes among the blocks and set the true_bc arrays
allocate(all_st(0:n_block(1)-1),all_en(0:n_block(1)-1),all_sz(0:n_block(1)-1))
  call MapDataToProc(nt_x,n_block(1),all_st,all_en,all_sz)
!  call MapDataToProc(nt_x,n_block(1),all_st,all_en,all_sz,ratio)
  mpi_istart_x = all_st(n_cart(1))   ;   mpi_iend_x = all_en(n_cart(1))
deallocate(all_st,all_en,all_sz)
allocate(all_st(0:n_block(2)-1),all_en(0:n_block(2)-1),all_sz(0:n_block(2)-1))
  call MapDataToProc(nt_y,n_block(2),all_st,all_en,all_sz)
  mpi_istart_y = all_st(n_cart(2))   ;   mpi_iend_y = all_en(n_cart(2))
deallocate(all_st,all_en,all_sz)
allocate(all_st(0:n_block(3)-1),all_en(0:n_block(3)-1),all_sz(0:n_block(3)-1))
  call MapDataToProc(nt_z,n_block(3),all_st,all_en,all_sz)
  mpi_istart_z = all_st(n_cart(3))   ;   mpi_iend_z = all_en(n_cart(3))
deallocate(all_st,all_en,all_sz)

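! Find the neighbouring ranks along each dimension of the Cartesian grid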
call mpi_cart_shift(mpi_comm_cart, 0, 1, n_left   , n_right  , ierr)
call mpi_cart_shift(mpi_comm_cart, 1, 1, n_bottom , n_top    , ierr)
call mpi_cart_shift(mpi_comm_cart, 2, 1, n_back   , n_front  , ierr)

print*,'Rank: ',n_rank,'; n_cart: ',n_cart(1:n_decomp)
print*,'Rank: ',n_rank,'; neigh: ',n_front,n_back,n_top,n_bottom,n_left,n_right
print*,'mpi_istart_x,mpi_iend_x: ',mpi_istart_x,mpi_iend_x
print*,'mpi_istart_y,mpi_iend_y: ',mpi_istart_y,mpi_iend_y
print*,'mpi_istart_z,mpi_iend_z: ',mpi_istart_z,mpi_iend_z
istart_x = mpi_istart_x
iend_x   = mpi_iend_x
istart_y = mpi_istart_y
iend_y   = mpi_iend_y
istart_z = mpi_istart_z
iend_z   = mpi_iend_z

end subroutine start_mpi

subroutine MapDataToProc(n_tot,n_proc_dir,st,en,sz)
! Distribute n_tot points over n_proc_dir processes along one direction:
! the first nl processes get n_size = n_tot/n_proc_dir points each,
! the remaining nu = mod(n_tot,n_proc_dir) processes get n_size+1.
implicit none
integer n_tot,n_proc_dir,st(0:n_proc_dir-1),en(0:n_proc_dir-1),sz(0:n_proc_dir-1)
integer i,n_size,nl,nu

n_size=n_tot/n_proc_dir
nu = n_tot - n_size * n_proc_dir
nl = n_proc_dir - nu
st(0) = 1
sz(0) = n_size
en(0) = n_size
do i=1,nl-1
  st(i) = st(i-1) + n_size
  sz(i) = n_size
  en(i) = en(i-1) + n_size
enddo
n_size = n_size + 1
do i=nl,n_proc_dir-1
   st(i) = en(i-1) + 1
   sz(i) = n_size
   en(i) = en(i-1) + n_size
enddo
en(n_proc_dir-1)= n_tot
sz(n_proc_dir-1)= n_tot-st(n_proc_dir-1)+1

end subroutine MapDataToProc

subroutine end_mpi

use mpi

integer :: ierr

call mpi_finalize(ierr)

end subroutine end_mpi

Hi franzisko,

This is a known compiler issue (TPR#18220) that was fixed in the 11.10 release of the compilers. Please update your compiler to avoid this error; with 11.10 your example compiles cleanly (output below).

Thanks!
Mat

% pgf90 main_MPI.f90  -Mmpi=mpich2 -g -ta=nvidia,cuda4.0,maxregcount:32 -Minfo=accel,time -V11.10
cubeflu:
     68, Generating !$acc update device(var_rhe(:,:,:))
         Generating !$acc update device(var_rhw(:,:,:))
         Generating !$acc update device(var_rhv(:,:,:))
         Generating !$acc update device(var_rhu(:,:,:))
         Generating !$acc update device(var_rho(:,:,:))
     69, Generating !$acc update device(rhs_rhe(:,:,:))
         Generating !$acc update device(rhs_rhw(:,:,:))
         Generating !$acc update device(rhs_rhv(:,:,:))
         Generating !$acc update device(rhs_rhu(:,:,:))
         Generating !$acc update device(rhs_rho(:,:,:))
     71, Generating !$acc update host(var_rhe(:,:,:))
         Generating !$acc update host(var_rhw(:,:,:))
         Generating !$acc update host(var_rhv(:,:,:))
         Generating !$acc update host(var_rhu(:,:,:))
         Generating !$acc update host(var_rho(:,:,:))
var_eval:
     98, Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
     99, Loop is parallelizable
    100, Loop is parallelizable
    101, Loop is parallelizable
         Accelerator kernel generated
         99, !$acc do parallel, vector(4) ! blockidx%y threadidx%z
        100, !$acc do parallel, vector(4) ! blockidx%x threadidx%y
        101, !$acc do vector(16) ! threadidx%x
             CC 1.3 : 15 registers; 24 shared, 440 constant, 0 local memory bytes; 100% occupancy
             CC 2.0 : 32 registers; 8 shared, 464 constant, 0 local memory bytes; 66% occupancy
  Timing stats:
    import                  33 millisecs    28%
    expand                  16 millisecs    13%
    optimize                17 millisecs    14%
    schedule                51 millisecs    43%
    Total time             117 millisecs