Correct manual deep copy of a user-defined data structure

Hi,

I’m trying to use a user defined datatype inside a kernel, but the program fails.
When I compile with the deepcopy flag it works but I would like to know how to manually create a deep copy.

How do I make a correct copy of the comm_data datatype?
I tried to create a small example program; it builds a library and then links it into a test program.

Thank you for your help.

type.f90

!> Derived types used by the deep-copy example: a KSP object that owns three
!> real work vectors plus a nested, two-level allocatable structure
!> (KSP -> comm -> nrk(:) -> offset(:)).
module krylov
  use, intrinsic :: iso_fortran_env, only: real64
  implicit none
  ! Keep the kind parameter an implementation detail of this module.
  private :: real64

  !> Innermost level: a counter and its own allocatable integer array.
  type nrk
     integer :: ndata
     integer, allocatable :: offset(:)
  end type nrk

  !> Middle level: a counter and an allocatable array of nrk elements.
  !> Because every element carries its own allocatable component, a device
  !> copy of nrk_array alone does not bring the offset arrays with it.
  type comm
     integer :: ndata
     type(nrk), allocatable :: nrk_array(:)
  end type comm

  !> Top level object handed to the solver routines.
  type KSP
     type(comm) :: comm_data
     ! 64-bit real vectors (same storage as the former real*8 declarations).
     real(real64), allocatable :: r(:)
     real(real64), allocatable :: x(:)
     real(real64), allocatable :: b(:)
  end type KSP

contains

  !> Placeholder: only prints a message; neither argument is used yet.
  !> @param method  KSP object (currently untouched)
  !> @param size_n  problem size (currently unused)
  subroutine set_KSP_solver(method,size_n)
    type(KSP), intent(inout) :: method
    integer, intent(in) :: size_n
    write(*,*)"Set_ksp_solver"
  end subroutine set_KSP_solver
end module krylov

libmain.f90

    ! Reproducer from the question (kept as posted): builds a KSP object with
    ! init_KSP_CRS and then reads it inside an OpenACC parallel loop.
    !   vec_size : number of loop iterations (copied into n).
    subroutine  solver(vec_size)
        use krylov
        implicit none
        type(KSP) method
        ! NOTE(review): i is declared here but never assigned in this routine.
        integer n,j,i,vec_size
        ! NOTE(review): the comma without "::" is not standard Fortran syntax.
        real*8,bnorm
        n=vec_size
        bnorm=0.0d0

        write(*,*)"Start testprogram"
        ! The size passed here is the literal 10, not n.
        call init_KSP_CRS(method,10)
        ! Only method, comm_data and nrk_array are named in the data clause;
        ! the offset component of each nrk_array element and method%b appear
        ! in no data clause (presumably why this fails without deep copy).
        !$acc data copyin(method, method%comm_data,&
        !$acc method%comm_data%nrk_array)
        !$acc parallel loop
        do j=1,n
           ! NOTE(review): indexes b with the unassigned i, and b is not
           ! allocated by init_KSP_CRS as posted.
           bnorm = bnorm+method%b(i)
           write(*,*)method%comm_data%nrk_array(j)%ndata
           write(*,*)method%comm_data%nrk_array(j)%offset(j)
        enddo
        !$acc end data

        write(*,*)"End testprogram"
      end subroutine solver

      ! Reproducer from the question (kept as posted): allocates the nested
      ! part of a KSP object on the host.
      !   method : KSP object to fill
      !   n      : number of nrk_array elements to allocate
      ! NOTE(review): method%r, method%x and method%b are not allocated here,
      ! and the offset arrays are allocated but never assigned values.
      subroutine init_KSP_CRS(method,n)
        use krylov
        implicit none
        type(KSP) method
        integer n,i
        allocate(method%comm_data%nrk_array(n))
        do i=1,n
          ! Element i stores its own index and owns an offset array of length i.
          method%comm_data%nrk_array(i)%ndata = i
          allocate(method%comm_data%nrk_array(i)%offset(i))
        enddo
end subroutine init_KSP_CRS

main.f90

! Test driver: prints a start banner, runs the library routine solver on a
! problem of fixed size, then prints an end banner.
program test
  implicit none
  ! Problem size handed to solver.
  integer, parameter :: problem_size = 10

  write (*, *) "Start testprogram"
  call solver(problem_size)
  write (*, *) "End testprogram"
end program test

makefile

# Builds the example library (libtest.a) from type.f90 and libmain.f90 and
# links it into the test driver myProgram.
# Every recipe line below starts with a TAB character, as make requires.

# Fortran compiler driver. The variable keeps its original name (CC) so that
# existing command-line overrides continue to work.
CC=pgf90

# type.o comes first: it provides the krylov module used by libmain.f90.
OBJS=type.o libmain.o
OPTS=-ta=tesla:cc70 -acc -Minfo=accel -Minfo

.PHONY: all myProg clean

%.o: %.f90
	${CC} ${OPTS} -c $<

all: myProgram

# libmain.f90 contains "use krylov", so the module file produced while
# compiling type.f90 must exist first (matters for parallel builds).
libmain.o: type.o

myProgram: main.o libtest.a
	${CC} ${OPTS} -o myProgram main.o -L. -ltest

# Legacy target name. It used to re-run "-c" on main.o, which is not a
# source file; it is now an alias for the real executable target.
myProg: myProgram

libtest.a: ${OBJS}
	ar rc libtest.a ${OBJS}

# Also remove compiler-generated module files and the linked executable.
clean:
	rm -f libtest.a *.o *.mod myProgram

Hi Peter,

I typically use unstructured data regions which are inserted as part of the allocation and deallocation of the type. Then use update directives in between to synchronize the data. This way the device copy of the type has the same lifetime and scope as the host copy of the type.

Your example wasn’t runnable (it segfaults on the host), so I fixed a few things to get it to work and also added how I would perform the deep copy.

% cat libmain.f90
    ! Builds a KSP object with init_KSP_CRS and sums method%b in an OpenACC
    ! parallel loop, printing ndata and offset(j) of every nrk_array element
    ! along the way.
    !   vec_size : length of every array in the KSP object and loop trip count.
    ! No data clauses other than present(method) appear here: init_KSP_CRS is
    ! the routine that issues the enter data / update directives.
    ! NOTE(review): no matching "exit data" is issued anywhere in this
    ! example, so the device copies are never explicitly released.
    subroutine  solver(vec_size)
        use, intrinsic :: iso_fortran_env, only: real64
        use krylov
        implicit none
        integer, intent(in) :: vec_size
        type(KSP) :: method
        integer :: n, j
        ! Accumulator for the reduction over method%b.
        real(real64) :: bnorm

        n = vec_size
        bnorm = 0.0_real64

        write(*,*)"Start testprogram"
        call init_KSP_CRS(method,n)

        ! present(method) asserts that the object is already on the device;
        ! bnorm is combined across iterations with a sum reduction.
        !$acc parallel loop present(method) reduction(+:bnorm)
        do j=1,n
           bnorm = bnorm+method%b(j)
           write(*,*)method%comm_data%nrk_array(j)%ndata, method%comm_data%nrk_array(j)%offset(j)
        enddo

        write(*,*)"End testprogram", bnorm
      end subroutine solver

      ! Allocates every array of a KSP object on the host and, right next to
      ! each allocation, creates the matching device copy with an unstructured
      ! "enter data" directive (manual deep copy). Values are pushed to the
      ! device with "update device" after they are assigned on the host.
      !   method : KSP object to allocate and fill
      !   n      : length of b, r, x and nrk_array; element i of nrk_array
      !            gets an offset array of length i
      subroutine init_KSP_CRS(method,n)
        use krylov
        implicit none
        type(KSP), intent(inout) :: method
        integer, intent(in) :: n
        integer :: i

        allocate(method%b(n))
        allocate(method%r(n))
        allocate(method%x(n))
        allocate(method%comm_data%nrk_array(n))
! The parent object is listed first, then its allocatable components.
! All three vectors use the full section (:n); the earlier x(n) named only
! the last element.
!$acc enter data create(method,method%b(:n),method%r(:n),method%x(:n), &
!$acc                   method%comm_data%nrk_array(:n))
        do i=1,n
          method%comm_data%nrk_array(i)%ndata = i
! nrk_array was only created (not copied), so push the scalar explicitly.
!$acc update device(method%comm_data%nrk_array(i)%ndata)
          allocate(method%comm_data%nrk_array(i)%offset(i))
! Second level of the deep copy: one device array per nrk_array element.
!$acc enter data create(method%comm_data%nrk_array(i)%offset(:i))
        enddo

! The assignments below can live elsewhere in the program, e.g. wherever the
! data is actually computed; each host assignment is followed by an update
! of the corresponding device copy.

        method%b=1
!$acc update device(method%b)
        do i=1,n
           method%comm_data%nrk_array(i)%offset = i
!$acc update device(method%comm_data%nrk_array(i)%offset)
        enddo

end subroutine init_KSP_CRS
% pgfortran type.f90 libmain.f90 main.f90 -o cpu.out
type.f90:
libmain.f90:
main.f90:
% pgfortran type.f90 libmain.f90 main.f90 -ta=tesla -o gpu.out
type.f90:
libmain.f90:
main.f90:
% ./cpu.out
 Start testprogram
 Start testprogram
            1            1
            2            2
            3            3
            4            4
            5            5
            6            6
            7            7
            8            8
            9            9
           10           10
 End testprogram    10.00000000000000
 End testprogram
% setenv PGI_ACC_TIME 1
% ./gpu.out
 Start testprogram
 Start testprogram
            1            1
            2            2
            3            3
            4            4
            5            5
            6            6
            7            7
            8            8
            9            9
           10           10
 End testprogram    10.00000000000000
 End testprogram

Accelerator Kernel Timing data
libmain.f90
  solver  NVIDIA  devicenum=0
    time(us): 131
    12: compute region reached 1 time
        12: kernel launched 1 time
            grid: [1]  block: [128]
             device time(us): total=96 max=96 min=96 avg=96
            elapsed time(us): total=1,341 max=1,341 min=1,341 avg=1,341
        12: reduction kernel launched 1 time
            grid: [1]  block: [256]
             device time(us): total=4 max=4 min=4 avg=4
            elapsed time(us): total=29 max=29 min=29 avg=29
    12: data region reached 4 times
        12: data copyin transfers: 1
             device time(us): total=6 max=6 min=6 avg=6
        18: data copyout transfers: 1
             device time(us): total=25 max=25 min=25 avg=25
libmain.f90
  init_ksp_crs  NVIDIA  devicenum=0
    time(us): 217
    30: data region reached 1 time
        30: data copyin transfers: 4
             device time(us): total=40 max=16 min=6 avg=10
    34: update directive reached 10 times
        34: data copyin transfers: 10
             device time(us): total=59 max=9 min=5 avg=5
    36: data region reached 10 times
        36: data copyin transfers: 10
             device time(us): total=62 max=7 min=6 avg=6
    42: update directive reached 1 time
        42: data copyin transfers: 1
             device time(us): total=6 max=6 min=6 avg=6
    45: update directive reached 10 times
        45: data copyin transfers: 10
             device time(us): total=50 max=5 min=5 avg=5

Hope this helps,
Mat

Thank you for your answer! It worked!