Hi,
I am working on a program compiled with PGI Fortran 14.6 that I want to accelerate with OpenACC. I would appreciate some advice on the following problem:
First, I read some input data and allocate the corresponding arrays (topography, initial water depth and initial Manning coefficient).
PROGRAM IBER_MATRIX
use dflib
use dfport
use accel_lib
use openacc
! Implicit typing: names starting with A-H or O-Z are REAL*4;
! names starting with I-N keep the default INTEGER type.
IMPLICIT REAL*4 (A-H,O-Z)
! Character work variables; "variable" carries the output label that is
! handed to write_grid.
CHARACTER*25 A25,B25
character*40 variable
! 2-D work grids, allocated as (NFIL,NCOL).
real*4,allocatable::z_bed(:,:),h1(:,:),aman(:,:),auxiliar(:,:)
! 2-D input grids as read from file, released once mapped onto the
! work grids.
real*4,allocatable::z_ini(:,:),aman_ini(:,:)
! Domain mask and (row,column) -> element-number lookup.
integer,allocatable::idomini(:,:),numel(:,:)
! 1-D per-element state used by the time loop.
real*4,allocatable::h(:),qx(:),qy(:),z0(:),amanning(:)
! 1-D per-element neighbour tables.
integer,allocatable::IN(:),IS(:),IE(:),IO(:)
! Grid origins passed to asigna_a_matriu / write_grid (topography,
! initial-condition and Manning grids respectively).
real*8 xmin,ymin,xmin_ini,ymin_ini,xmin_man,ymin_man
! Sentinel used to mark "no data" entries in grids and index tables.
aNODATA=-9999.
! Values that do not come from the parameter file.
G=9.81D0
Dt=1 !initial time increment
! Start values for HMIN and COURANT; both are replaced by the reads
! below.
HMIN=1.D-2
COURANT=0.45D0
!reading of parameters
! parametres.dat is read list-directed, one value per record, in the
! order HMIN, Courant, Time_end, DT_results, i_cond_inicial.
open(unit=11,file='parametres.dat',status='old')
read(unit=11,fmt=*)HMIN
read(unit=11,fmt=*)Courant
read(unit=11,fmt=*)Time_end
read(unit=11,fmt=*)DT_results
read(unit=11,fmt=*)i_cond_inicial
close(unit=11)
!end reading of parameters
!reading of topography
! NOTE(review): the elided code presumably reads the grid header and
! defines NFIL and NCOL before the allocations below -- confirm.
[...]
! Bed elevation grid, pre-filled with the no-data sentinel.
allocate(z_bed(NFIL,NCOL))
z_bed=aNODATA
! Initial-condition values mapped onto the work grid (filled later by
! asigna_a_matriu).
allocate(h1(NFIL,NCOL))
h1=0.
! Manning coefficient on the work grid (filled later by
! asigna_a_matriu).
allocate(aman(NFIL,NCOL))
aman=0.
! Domain mask: cells equal to 1 are the ones turned into elements.
allocate(idomini(NFIL,NCOL))
idomini=0 !0 is out of the computation domain
allocate(auxiliar(NFIL,NCOL)) !auxiliary variable for results
auxiliar=anodata
! NOTE(review): the elided code presumably fills z_bed(i,j) and sets
! idomini -- confirm.
[...]
z_bed(i,j)
!end reading topography
!reading of initial conditions
! NOTE(review): elided; presumably allocates and fills z_ini(i,j) and
! defines nfil_ini, ncol_ini, xmin_ini, ymin_ini, dx_ini -- confirm.
[...]
z_ini(i,j)
!end reading initial conditions
!reading of Manning
! NOTE(review): elided; presumably allocates and fills aman_ini(i,j)
! and defines nfil_man, ncol_man, xmin_man, ymin_man, dx_man -- confirm.
[...]
aman_ini(i,j)
!end reading Manning
!change z_ini(i,j) to h1(i,j)
! Transfer the initial-condition grid z_ini (its own size, origin and
! cell size) onto the work grid h1.
call asigna_a_matriu(nfil,ncol,xmin-dx,ymin-dx,h1,dx,nfil_ini,
. ncol_ini,xmin_ini,ymin_ini,z_ini,dx_ini,anodata) !assign initial conditions
! When i_cond_inicial is 1 the bed elevation is subtracted from the
! values just assigned (presumably the input is then a water-surface
! elevation and h1 becomes a depth -- confirm).
! Written as a proper block IF: "if (...) then <statement>" on a single
! line followed by a separate endif is not a valid construct.
if (i_cond_inicial.eq.1) then
  h1(:,:)=h1(:,:)-z_bed(:,:)
endif
deallocate(z_ini)
!change aman_ini(i,j) to aman(i,j)
! Same transfer for the Manning coefficient grid.
call asigna_a_matriu(nfil,ncol,xmin-dx,ymin-dx,aman,dx,nfil_man,
. ncol_man,xmin_man,ymin_man,aman_ini,dx_man,anodata) !assign manning
deallocate(aman_ini)
! Clock starts at zero.
TIME=0.
Time_results=-Dt_results !in order to write time 0
! Cell area and cell size in y, both derived from Dx.
DA=Dx*Dx
DY=Dx
ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
! Per-element 1-D arrays of length numels (numels is set in code not
! shown here; presumably the number of in-domain cells -- confirm).
! State and coefficient vectors start at zero.
allocate(h(numels))
h=0
allocate(amanning(numels))
amanning=0
allocate(z0(numels))
! Fixed: this line used to repeat "amanning=0", which left z0
! uninitialised after its allocation.
z0=0
allocate(qx(numels))
qx=0
allocate(qy(numels))
qy=0
! Neighbour tables and the (row,column) -> element lookup start at the
! no-data sentinel; the REAL value -9999. is converted to INTEGER -9999
! on assignment.
allocate(IN(numels))
IN=anodata
allocate(IS(numels))
IS=anodata
allocate(IO(numels))
IO=anodata
allocate(IE(numels))
IE=anodata
allocate(numel(nfil,ncol))
numel=anodata
!create vectors (h, amanning, z0, IN, IO, IS, IE)
! Pass 1: number every in-domain cell (idomini == 1) in row-by-row scan
! order, copy its grid values into the per-element vectors and record
! the element number in numel(i,j).
ij=0
DO I=1,nfil
  DO J=1,NCOL
    IF (idomini(i,j).eq.1) THEN
      ij=ij+1
      h(ij)=h1(i,j)
      amanning(ij)=aman(i,j)
      z0(ij)=z_bed(i,j)
      numel(i,j)=ij
    ENDIF
  ENDDO
ENDDO
! Pass 2: fill the neighbour tables. This needs the complete numel
! table (it looks at row i+1 and column j+1), hence the second scan in
! the same order so that ij matches pass 1.
!   IN/IS : element at row i-1 / i+1, same column
!   IO/IE : element at column j-1 / j+1, same row
! Entries stay at the no-data sentinel when that neighbour is outside
! the domain. Cells on the outer ring of the grid are skipped so that
! i-1, i+1, j-1 and j+1 are always valid indices.
ij=0
DO I=1,nfil
  DO J=1,NCOL
    IF (idomini(i,j).eq.1) THEN
      IJ=IJ+1
! Fixed: the lower bound on j was "j.gt.2", inconsistent with the other
! three bounds; it left in-domain cells of column 2 without neighbours.
      if (i.gt.1.AND.i.lt.nfil.AND.j.gt.1.AND.j.lt.ncol)then
        if (idomini(i-1,j).eq.1) IN(ij)=numel(i-1,j)
        if (idomini(i,j-1).eq.1) IO(ij)=numel(i,j-1)
        if (idomini(i+1,j).eq.1) IS(ij)=numel(i+1,j)
        if (idomini(i,j+1).eq.1) IE(ij)=numel(i,j+1)
      endif
    ENDIF
  ENDDO
ENDDO
! The 2-D input grids are no longer needed; numel and auxiliar are kept
! for writing results.
deallocate (h1,z_bed,aman,idomini)
Second, I do some explicit calculations with two large loops (one for the time evolution and one over each element for which I need results). Before entering the time loop I send some variables to the device. I then do some calculations with them in the second loop, which I accelerate on the GPU with the “!$acc parallel loop” directive. Finally, before the time loop ends, I need to transfer my results (h, qx, qy) to the host in order to write them to “.txt” files with the “write_grid” subroutine.
!$acc enter data copyin(h, qx, qy, amanning, z0, IN, IO, IS, IE)
! The arrays listed above are copied to the device once and stay there
! for the whole time loop; the matching exit data directive is after
! the loop.
DO WHILE (TIME.LT.Time_end) !begins time calculation
  !writing results (which are h, qx and qy)
  if (TIME-time_results.ge.Dt_results) then !write results
    TIME_results=time
! Refresh the host copies of the results from the device before they
! are read below. "update host" only transfers data; the device copies
! stay allocated and valid, so no new copyin is needed afterwards.
! Without it the host would keep writing the initial values.
!$acc update host(h, qx, qy)
    !writing of results in grid format
    ! Each section below scatters one per-element quantity back onto
    ! the 2-D grid "auxiliar" (no-data where the depth is not above
    ! hmin) and writes it with write_grid. Cells with numel <= 0 are
    ! outside the domain and are left untouched.
    ! NOTE(review): "temps" is not assigned anywhere in the code shown;
    ! with implicit typing it is an undefined REAL*4 unless an elided
    ! section sets it. If it is meant to be the current simulation
    ! time, pass TIME instead -- confirm.
    variable='Depth_'
    do i=1,nfil
      do j=1,ncol
        if (numel(i,j).gt.0) then
          ij=numel(i,j)
          if (h(ij).gt.hmin) then
            auxiliar(i,j)=h(ij)
          else
            auxiliar(i,j)=anodata
          endif
        endif
      enddo
    enddo
    call write_grid(variable,temps,nfil,ncol,xmin,ymin,
. dx,auxiliar(1:nfil,1:ncol))
    variable='qx_'
    do i=1,nfil
      do j=1,ncol
        if (numel(i,j).gt.0) then
          ij=numel(i,j)
          if (h(ij).gt.hmin) then
            auxiliar(i,j)=qx(ij)
          else
            auxiliar(i,j)=anodata
          endif
        endif
      enddo
    enddo
    call write_grid(variable,temps,nfil,ncol,xmin,ymin,
. dx,auxiliar(1:nfil,1:ncol))
    variable='qy_'
    do i=1,nfil
      do j=1,ncol
        if (numel(i,j).gt.0) then
          ij=numel(i,j)
          if (h(ij).gt.hmin) then
            auxiliar(i,j)=qy(ij)
          else
            auxiliar(i,j)=anodata
          endif
        endif
      enddo
    enddo
    call write_grid(variable,temps,nfil,ncol,xmin,ymin,
. dx,auxiliar(1:nfil,1:ncol))
    ! Velocities are discharge divided by depth; the hmin test also
    ! keeps the division away from zero depth.
    variable='Vx_'
    do i=1,nfil
      do j=1,ncol
        if (numel(i,j).gt.0) then
          ij=numel(i,j)
          if (h(ij).gt.hmin) then
            auxiliar(i,j)=qx(ij)/h(ij)
          else
            auxiliar(i,j)=anodata
          endif
        endif
      enddo
    enddo
    call write_grid(variable,temps,nfil,ncol,xmin,ymin,
. dx,auxiliar(1:nfil,1:ncol))
    variable='Vy_'
    do i=1,nfil
      do j=1,ncol
        if (numel(i,j).gt.0) then
          ij=numel(i,j)
          if (h(ij).gt.hmin) then
            auxiliar(i,j)=qy(ij)/h(ij)
          else
            auxiliar(i,j)=anodata
          endif
        endif
      enddo
    enddo
    call write_grid(variable,temps,nfil,ncol,xmin,ymin,
. dx,auxiliar(1:nfil,1:ncol))
  endif
  !end writing results
  !Courant computation
[...]
DT_local
  !end Courant
  TIME=TIME+DT
! present(...) states explicitly that these arrays are already on the
! device (from the enter data above), so no transfer happens here.
!$acc parallel loop present(h,qx,qy,amanning,z0,IN,IO,IS,IE)
  DO IJ=1,numels !begins the computation for each element of the vector
[...] !quite long but easy independent calculations for each element
  !results of the loop:
h(ij)
qx(ij)
qy(ij)
  ENDDO !numels loop
!$acc end parallel loop
ENDDO !Time loop
! Bring the final state back to the host and release every array that
! was placed on the device by the enter data directive.
!$acc exit data copyout(h,qx,qy) delete(amanning,z0,IN,IO,IS,IE)
END
The problem is that I need to extract not just the last step of the computation but also the intermediate values every, let’s say, 10 seconds, and I want to do just one copyin of the data at the beginning. Directives like “!$acc exit data copyout()” deallocate the data on the device, so they do not work for this.
My question:
Is there any directive to copy data from the device to the host without deallocating the data on the device (something like acc_memcpy_from_device)? Or do you know of any other way to do this (for example, establishing a larger data region in which the data stay available on the device, and doing some copyouts of the intermediate results inside it)?
Thank you very much,
Martí