Data transfer from device to host

Hi,

I am working on a program with PGI Fortran 14.6 which I want to accelerate with OpenACC. I would appreciate some advice on the following problem I am facing:

Firstly I have some input data to read and allocate (topography, initial water depth and initial Manning coefficient).

! Main program (abridged with [...] placeholders further down): reads
! input grids, packs the active cells into 1-D element vectors and
! advances them in a time loop accelerated with OpenACC directives.
PROGRAM IBER_MATRIX

      ! NOTE(review): dflib/dfport look like compiler-specific
      ! portability modules; accel_lib/openacc presumably provide the
      ! accelerator runtime API - confirm which ones are really used.
	  use dflib
	  use dfport
	  use accel_lib
	  use openacc

      ! Undeclared names starting with A-H or O-Z are REAL*4; names
      ! starting with I-N keep the default implicit INTEGER type.
      IMPLICIT REAL*4 (A-H,O-Z)
      CHARACTER*25 A25,B25
      ! Prefix passed to write_grid ('Depth_', 'qx_', ...).
      character*40 variable
      ! 2-D grids dimensioned (nfil,ncol): z_bed is filled by the
      ! topography reading section, h1 receives the initial condition
      ! and aman the Manning coefficient.
      real*4,allocatable::z_bed(:,:)        
      real*4,allocatable::h1(:,:)
      real*4,allocatable::aman(:,:)
      ! Input grids handed to asigna_a_matriu and freed afterwards.
      real*4,allocatable::z_ini(:,:)
      real*4,allocatable::aman_ini(:,:)
      ! Scratch grid used to lay element values out for write_grid.
      real*4,allocatable::auxiliar(:,:)
      ! Per-element vectors (length numels): h, qx and qy are the
      ! results of the element loop; z0 and amanning are the packed
      ! copies of z_bed and aman.
      real*4,allocatable::h(:)
      real*4,allocatable::qx(:)
      real*4,allocatable::qy(:)
      real*4,allocatable::z0(:)
      real*4,allocatable::amanning(:)
      ! idomini: 1 marks a cell inside the computation domain, 0 out.
      ! numel: element index of grid cell (i,j).
      integer,allocatable::idomini(:,:)
      integer,allocatable::numel(:,:)
      ! Element index of the neighbour at (i-1,j), (i+1,j), (i,j+1)
      ! and (i,j-1) respectively.
      integer,allocatable::IN(:)
      integer,allocatable::IS(:)
      integer,allocatable::IE(:)
      integer,allocatable::IO(:)
      ! Grid origins, kept in double precision.
      real*8 xmin,ymin
      real*8 xmin_ini,ymin_ini
      real*8 xmin_man,ymin_man
      
      ! Sentinel stored where a grid cell or neighbour has no value.
      aNODATA=-9999.

      !reading of parameters
      ! HMIN, Courant, Time_end, DT_results and i_cond_inicial are read
      ! in that order, one per record, from 'parametres.dat'.
      ! The two assignments below are only placeholders: both values
      ! are unconditionally overwritten by the reads that follow.
      COURANT=0.45D0
      HMIN=1.D-2
      open(11,file='parametres.dat',status='old')
      read(11,*)HMIN
      read(11,*)Courant
      read(11,*)Time_end
      read(11,*)DT_results
      read(11,*)i_cond_inicial
      !initial time increment
      Dt=1
      G=9.81D0
      close (11)
      ! The closing comment must start in column 7 or later: in
      ! fixed-form source a '!' in column 6 is a continuation mark.
      !end reading of parameters

      ! The three reading sections below are abridged in this listing:
      ! each [...] stands for omitted code and the bare array reference
      ! after it only names the grid that the section fills.
      ! NOTE(review): NFIL, NCOL, dx, xmin/ymin and the contents of
      ! idomini are presumably set in the omitted code - confirm.
      !reading of topography
             [...]
      ! Working grids are allocated (NFIL,NCOL) and preset before use.
	  allocate(z_bed(NFIL,NCOL))
	  z_bed=aNODATA	
	  allocate(h1(NFIL,NCOL))
	  h1=0.
	  allocate(aman(NFIL,NCOL))
	  aman=0.  
	  allocate(idomini(NFIL,NCOL))
	  idomini=0   !0 is out of the computation domain
	  allocate(auxiliar(NFIL,NCOL))  !auxiliary variable for results
	  auxiliar=anodata
             [...]
      z_bed(i,j)
      !end reading topography

      !reading of initial conditions
             [...]
      z_ini(i,j)
      !end reading initial conditions

      !reading of Manning
             [...]
      aman_ini(i,j)
      !end reading Manning

      ! Transfer the input grids onto the computation grid and set the
      ! time-stepping start values.

      !change z_ini(i,j) to h1(i,j)
      !assign initial conditions
      call asigna_a_matriu(nfil,ncol,xmin-dx,ymin-dx,h1,dx,nfil_ini,
     .  ncol_ini,xmin_ini,ymin_ini,z_ini,dx_ini,anodata)
      ! When i_cond_inicial is 1 the bed level is subtracted from h1.
      ! NOTE(review): presumably the input is then a water-surface
      ! elevation rather than a depth - confirm.
      ! A block IF needs its body on separate lines after THEN.
      if (i_cond_inicial.eq.1) then
         h1(:,:)=h1(:,:)-z_bed(:,:)
      endif
      deallocate(z_ini)

      !change aman_ini(i,j) to aman(i,j)
      !assign manning
      call asigna_a_matriu(nfil,ncol,xmin-dx,ymin-dx,aman,dx,nfil_man,
     .  ncol_man,xmin_man,ymin_man,aman_ini,dx_man,anodata)
      deallocate(aman_ini)


      TIME=0.
      !in order to write time 0
      Time_results=-Dt_results
      DA=Dx*Dx
      DY=Dx
      
ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc

      ! Allocate and preset the per-element vectors.
      ! NOTE(review): numels is not set in the visible code; it is
      ! presumably the number of cells with idomini equal to 1.
      allocate(h(numels))
      h=0
      allocate(amanning(numels))
      amanning=0
      allocate(z0(numels))
      ! Preset z0 itself (amanning was being zeroed a second time here,
      ! leaving z0 uninitialised).
      z0=0
      allocate(qx(numels))
      qx=0
      allocate(qy(numels))
      qy=0
      ! Neighbour tables and the cell-to-element map start at the
      ! no-data value (converted to integer on assignment).
      allocate(IN(numels))
      IN=anodata
      allocate(IS(numels))
      IS=anodata
      allocate(IO(numels))
      IO=anodata
      allocate(IE(numels))
      IE=anodata
      allocate(numel(nfil,ncol))
      numel=anodata


      !create vectors (h, amanning, z0, IN, IO, IS, IE)
      ! Pass 1: number the active cells in row-by-row order and pack
      ! their grid values into the element vectors.
        ij=0
        DO I=1,nfil
           DO J=1,NCOL
             IF (idomini(i,j).eq.1) THEN
               ij=ij+1
               h(ij)=h1(i,j)
               amanning(ij)=aman(i,j)
               z0(ij)=z_bed(i,j)
               numel(i,j)=ij
             ENDIF
           ENDDO
        ENDDO
      ! Pass 2: same traversal, so ij matches pass 1. It runs
      ! separately because numel must be complete before the (i+1,j)
      ! and (i,j+1) neighbours can be looked up.
        ij=0
        DO I=1,nfil
           DO J=1,NCOL
             IF (idomini(i,j).eq.1) THEN
               IJ=IJ+1
      ! Only cells away from the grid edge get neighbours; all other
      ! entries keep the no-data value.
      ! NOTE(review): the lower bound on j is 2 while the one on i is
      ! 1, so column 2 never gets neighbours - confirm this asymmetry
      ! is intended.
               if (i.gt.1.AND.i.lt.nfil.AND.
     .             j.gt.2.AND.j.lt.ncol) then
                 if (idomini(i-1,j).eq.1) IN(ij)=numel(i-1,j)
                 if (idomini(i,j-1).eq.1) IO(ij)=numel(i,j-1)
                 if (idomini(i+1,j).eq.1) IS(ij)=numel(i+1,j)
                 if (idomini(i,j+1).eq.1) IE(ij)=numel(i,j+1)
               endif
             ENDIF
           ENDDO
        ENDDO

      ! The 2-D work grids are no longer needed once packed.
      deallocate (h1,z_bed,aman,idomini)

Secondly I do some explicit calculations with two large loops (one for the time evolution and the other for each element where I need the results). Before entering the time loop I send some variables to the device. Then I do some calculations with them in the second loop which I accelerate in the GPU with the “!$acc parallel loop” directive. Finally, and before ending the time loop, I need to transfer my results (h, qx, qy) to the host in order to write them in “.txt” files with the “write_grid” subroutine.

! Time-stepping section. The element vectors are copied to the device
! once, before the loop. h, qx and qy are updated on the device by the
! element loop, so the host copies are refreshed with "update host"
! each time a results snapshot is written.
! In fixed-form source every !$acc sentinel must start in column 1.
!$acc enter data copyin(h, qx, qy, amanning, z0, IN, IO, IS, IE)

      DO WHILE (TIME.LT.Time_end)  !begins time calculation

      !writing results (which are h, qx and qy)

         if (TIME-time_results.ge.Dt_results) then  !write results
         TIME_results=time

! Refresh the host copies from the device before they are read below.
! Unlike "exit data copyout", this leaves the device data in place.
!$acc update host(h,qx,qy)

         !writing of results in grid format
         ! Each section spreads one element quantity over auxiliar.
         ! Elements with h not above hmin are written as no-data, and
         ! cells without an element keep their previous value.
         ! NOTE(review): temps is never assigned in the visible code;
         ! it presumably should be the current TIME - confirm.
         variable='Depth_'
         do i=1,nfil
           do j=1,ncol
             if (numel(i,j).gt.0) then
               ij=numel(i,j)
               if (h(ij).gt.hmin) then
                 auxiliar(i,j)=h(ij)
               else
                 auxiliar(i,j)=anodata
               endif
             endif
           enddo
         enddo
         call write_grid(variable,temps,nfil,ncol,xmin,ymin,
     .  dx,auxiliar(1:nfil,1:ncol))

         variable='qx_'
         do i=1,nfil
           do j=1,ncol
             if (numel(i,j).gt.0) then
               ij=numel(i,j)
               if (h(ij).gt.hmin) then
                 auxiliar(i,j)=qx(ij)
               else
                 auxiliar(i,j)=anodata
               endif
             endif
           enddo
         enddo
         call write_grid(variable,temps,nfil,ncol,xmin,ymin,
     .  dx,auxiliar(1:nfil,1:ncol))

         variable='qy_'
         do i=1,nfil
           do j=1,ncol
             if (numel(i,j).gt.0) then
               ij=numel(i,j)
               if (h(ij).gt.hmin) then
                 auxiliar(i,j)=qy(ij)
               else
                 auxiliar(i,j)=anodata
               endif
             endif
           enddo
         enddo
         call write_grid(variable,temps,nfil,ncol,xmin,ymin,
     .  dx,auxiliar(1:nfil,1:ncol))

         ! Vx and Vy are qx/h and qy/h; the hmin test above the
         ! division keeps h away from zero when hmin is positive.
         variable='Vx_'
         do i=1,nfil
           do j=1,ncol
             if (numel(i,j).gt.0) then
               ij=numel(i,j)
               if (h(ij).gt.hmin) then
                 auxiliar(i,j)=qx(ij)/h(ij)
               else
                 auxiliar(i,j)=anodata
               endif
             endif
           enddo
         enddo
         call write_grid(variable,temps,nfil,ncol,xmin,ymin,
     .  dx,auxiliar(1:nfil,1:ncol))

         variable='Vy_'
         do i=1,nfil
           do j=1,ncol
             if (numel(i,j).gt.0) then
               ij=numel(i,j)
               if (h(ij).gt.hmin) then
                 auxiliar(i,j)=qy(ij)/h(ij)
               else
                 auxiliar(i,j)=anodata
               endif
             endif
           enddo
         enddo
         call write_grid(variable,temps,nfil,ncol,xmin,ymin,
     .  dx,auxiliar(1:nfil,1:ncol))

         endif
      !end writing results

      !Courant computation
             [...]
      DT_local
      !end Courant

      TIME=TIME+DT

! Element loop offloaded to the device; it works on the device copies
! created by the "enter data" directive above.
!$acc parallel loop

        DO IJ=1,numels  !begins the computation for each element of the vector

                [...]    !quite long but easy independent calculations for each element

       !results of the loop:
        h(ij)
        qx(ij)
        qy(ij)


        ENDDO !numels loop
!$acc end parallel loop

      ENDDO !Time loop

! Copy the final state back and release the device copies.
!$acc exit data copyout(h,qx,qy)

      ! END must start in column 7 or later; column 6 is the
      ! continuation column in fixed-form source.
      END

The problem is that I need to extract not just the last step of the computation but also the intermediate values every, let’s say, 10 seconds, and I want to do just one copyin of the data at the beginning. Directives like “!$acc exit data copyout()” deallocate the data on the device, so they do not work for this.

My question:
Is there any instruction to copy data from device to host without deallocating the data in the device? (like the acc_memcpy_from_device?). Or do you know if there is any other way to do that I can use? (like the establishment of a more extensive region in which the data is available by the device and do some copyouts of the intermediate results?)

Thank you very much,

Martí

Hi Marti,

Is there any instruction to copy data from device to host without deallocating the data in the device?

Use the “update” directive just before you write the intermediary data to the file. Something like:

!$acc update host(h,qx,qy)

Hope this helps,
Mat

It works Mat!
Thank you very much for your help,
Martí