Fortran derived type and allocatable data

I’d like to port Fortran code to the GPU using OpenACC, but the code has derived types which have allocatable variables in them. I’ve seen it mentioned elsewhere that it’s possible to use pointers to enable this type of data to be used on the GPU. I’ve created a test program below, which compiles and runs, but I don’t get any sensible data back. What am I doing wrong?



program dtypetest
  ! Test case from the question: fill the allocatable members of a derived
  ! type inside an OpenACC kernels region, reaching them through a pointer
  ! to the whole derived-type variable, then print the members on the host.
  implicit none

  integer, parameter :: dp = selected_real_kind(15, 300)

  ! Derived type whose components are all allocatable arrays.
  type :: e_d
    complex(kind=dp), dimension(:), allocatable :: aa
    complex(kind=dp), dimension(:), allocatable :: bb
    real(kind=dp), dimension(:), allocatable :: r_aa
    real(kind=dp), dimension(:), allocatable :: r_bb
  end type e_d

  type(e_d), target  :: dataa   ! owns the storage
  type(e_d), pointer :: datab   ! alias of dataa used inside the kernels region
  integer :: i

  allocate(dataa%aa(10))
  allocate(dataa%bb(10))

  allocate(dataa%r_aa(10))
  allocate(dataa%r_bb(10))

  datab => dataa

  ! NOTE(review): the copy clause names the derived-type pointer as a whole.
  ! This directive is kept exactly as posted because it is what the question
  ! is about; presumably the allocatable members are not transferred with
  ! it, which would explain the bad results -- confirm against the compiler
  ! documentation.
  !$acc kernels loop copy(datab$p)
  do i = 1, 10
    datab%aa(i) = cmplx(i,i)
    datab%bb(i) = cmplx(i,i)
    datab%r_aa(i) = real(i)
    datab%r_bb(i) = real(i)
  end do
  !$acc end kernels

  ! Print each member from the host side, separated by a blank record.
  write(*,*) dataa%aa
  write(*,*) ' '
  write(*,*) dataa%bb
  write(*,*) ' '
  write(*,*) dataa%r_aa
  write(*,*) ' '
  write(*,*) dataa%r_bb
  nullify(datab)

  deallocate(dataa%aa)
  deallocate(dataa%bb)
  deallocate(dataa%r_aa)
  deallocate(dataa%r_bb)

end program dtypetest



thanks

adrianj

Hi adrianj,

You need to create pointers to the member arrays.

  • Mat
% cat test.f90
program dtypetest
 ! Fills the allocatable members of a derived type from an OpenACC kernels loop via plain array pointers.
 implicit none
 integer, parameter :: dp=selected_real_kind(15,300)
 type :: e_d
   complex(kind=dp), dimension(:), allocatable :: aa
   complex(kind=dp), dimension(:), allocatable :: bb
   real(kind=dp), dimension(:), allocatable ::r_aa
   real(kind=dp), dimension(:), allocatable :: r_bb
 end type e_d
 ! Array pointers that alias the members of dataa; the accelerated loop uses these instead of dataa%...
 complex(kind=dp), dimension(:), pointer :: aaptr
 complex(kind=dp), dimension(:), pointer :: bbptr
 real(kind=dp), dimension(:), pointer ::r_aaptr
 real(kind=dp), dimension(:), pointer :: r_bbptr
 type(e_d),target :: dataa   ! target, so that its members may be pointer-associated
 integer :: i

 allocate(dataa%aa(10))
 allocate(dataa%bb(10))
 allocate(dataa%r_aa(10))
 allocate(dataa%r_bb(10))

 ! Associate one pointer with each member array.
 aaptr=>dataa%aa
 bbptr=>dataa%bb
 r_aaptr=>dataa%r_aa
 r_bbptr=>dataa%r_bb

 !$acc kernels loop independent
 do i=1,10
   ! kind=dp keeps the conversions in double precision instead of going
   ! through a default-kind (single precision) intermediate value.
   aaptr(i) = cmplx(i,i,kind=dp)
   bbptr(i) = cmplx(i,i,kind=dp)
   r_aaptr(i) = real(i,kind=dp)
   r_bbptr(i) = real(i,kind=dp)
 end do
 !$acc end kernels

 ! The results are read back through the derived-type members themselves.
 write(*,*) dataa%aa
 write(*,*) ' '
 write(*,*) dataa%bb
 write(*,*) ' '
 write(*,*) dataa%r_aa
 write(*,*) ' '
 write(*,*) dataa%r_bb

 deallocate(dataa%aa)
 deallocate(dataa%bb)
 deallocate(dataa%r_aa)
 deallocate(dataa%r_bb)

end program dtypetest

% pgf90 test.f90 -acc -Minfo=accel; a.out
dtypetest:
     30, Generating copyout(r_bbptr(1:10))
         Generating copyout(r_aaptr(1:10))
         Generating copyout(bbptr(1:10))
         Generating copyout(aaptr(1:10))
         Generating NVIDIA code
     31, Loop is parallelizable
         Accelerator kernel generated
         31, !$acc loop gang, vector(32) ! blockidx%x threadidx%x
 (1.000000000000000,1.000000000000000)  (2.000000000000000,2.000000000000000)
 (3.000000000000000,3.000000000000000)  (4.000000000000000,4.000000000000000)
 (5.000000000000000,5.000000000000000)  (6.000000000000000,6.000000000000000)
 (7.000000000000000,7.000000000000000)  (8.000000000000000,8.000000000000000)
 (9.000000000000000,9.000000000000000)  (10.00000000000000,10.00000000000000)

 (1.000000000000000,1.000000000000000)  (2.000000000000000,2.000000000000000)
 (3.000000000000000,3.000000000000000)  (4.000000000000000,4.000000000000000)
 (5.000000000000000,5.000000000000000)  (6.000000000000000,6.000000000000000)
 (7.000000000000000,7.000000000000000)  (8.000000000000000,8.000000000000000)
 (9.000000000000000,9.000000000000000)  (10.00000000000000,10.00000000000000)

    1.000000000000000         2.000000000000000         3.000000000000000
    4.000000000000000         5.000000000000000         6.000000000000000
    7.000000000000000         8.000000000000000         9.000000000000000
    10.00000000000000

    1.000000000000000         2.000000000000000         3.000000000000000
    4.000000000000000         5.000000000000000         6.000000000000000
    7.000000000000000         8.000000000000000         9.000000000000000
    10.00000000000000

Hi Mat,

Thanks, works a treat.

cheers

adrianj

Hi Mat,

How does this interact with the device attribute? Is it possible to add the device attribute to the derived type declaration, or will this not work?

thanks

adrianj

You can add device to the members, but I don’t think it’s really what you want. What you really want is the ability to perform a deep copy. The OpenACC committee is in the process of defining how to do this, but it won’t be available for a while. The other possibility is to wait for us to add support for CUDA 6.0’s unified memory (expected some time this summer), in which case the CUDA driver will move the data for you. However, UM is not expected to have good performance, at least at first.

Here’s an example of using the CUDA Fortran device attribute:

% cat test.cuf
program dtypetest
  ! CUDA Fortran variant: aa and r_aa are device members filled by an OpenACC loop, bb and r_bb are host copies.
  use cudafor
  implicit none
  integer, parameter :: dp=selected_real_kind(15,300)
  type :: e_d
    complex(kind=dp), dimension(:), device, allocatable :: aa
    complex(kind=dp), dimension(:), allocatable :: bb
    real(kind=dp), dimension(:), device, allocatable ::r_aa
    real(kind=dp), dimension(:), allocatable :: r_bb
  end type e_d
  ! Device pointers that alias the two device members of dataa.
  complex(kind=dp), dimension(:), pointer, device :: aaptr
  real(kind=dp), dimension(:), pointer, device ::r_aaptr
  type(e_d),target :: dataa
  integer :: i

  allocate(dataa%aa(10))
  allocate(dataa%bb(10))
  allocate(dataa%r_aa(10))
  allocate(dataa%r_bb(10))

  aaptr=>dataa%aa
  r_aaptr=>dataa%r_aa

  !$acc kernels loop independent
  do i=1,10
    ! kind=dp keeps the conversions in double precision instead of going
    ! through a default-kind (single precision) intermediate value.
    aaptr(i) = cmplx(i,i,kind=dp)
    r_aaptr(i) = real(i,kind=dp)
  end do
  !$acc end kernels

  dataa%bb = dataa%aa      ! device to host copy
  dataa%r_bb = dataa%r_aa  ! device to host copy

  ! Only the host members are printed.
  write(*,*) dataa%bb
  write(*,*) ' '
  write(*,*) dataa%r_bb

  deallocate(dataa%aa)
  deallocate(dataa%bb)
  deallocate(dataa%r_aa)
  deallocate(dataa%r_bb)

end program dtypetest

% pgf90 test.cuf -acc -Minfo=accel
dtypetest:
     27, Loop is parallelizable
         Accelerator kernel generated
         27, !$acc loop gang, vector(32) ! blockidx%x threadidx%x
% a.out
 (1.000000000000000,1.000000000000000)  (2.000000000000000,2.000000000000000)
 (3.000000000000000,3.000000000000000)  (4.000000000000000,4.000000000000000)
 (5.000000000000000,5.000000000000000)  (6.000000000000000,6.000000000000000)
 (7.000000000000000,7.000000000000000)  (8.000000000000000,8.000000000000000)
 (9.000000000000000,9.000000000000000)  (10.00000000000000,10.00000000000000)

    1.000000000000000         2.000000000000000         3.000000000000000
    4.000000000000000         5.000000000000000         6.000000000000000
    7.000000000000000         8.000000000000000         9.000000000000000
    10.00000000000000