How to use managed nested derived types in CUDA Fortran

Hi Experts, look at this toy code:

module datastructures

  use cudafor
  !use nvtx_mod

  INTEGER, PARAMETER :: EB = SELECTED_REAL_KIND(12)    !< Precision of "Eight Byte" reals

TYPE BOUNDARY_PROP1_TYPE

        REAL(EB), ALLOCATABLE, DIMENSION(:) :: M_DOT_G_PP_ACTUAL   !< (1:N_TRACKED_SPECIES) Actual mass production rate per unit area
        REAL(EB) :: K_G = 0.1_EB            !< Face area (m2)
END TYPE BOUNDARY_PROP1_TYPE


TYPE BOUNDARY_COORD_TYPE
   INTEGER :: IOR=0          !< Index of orientation of the WALL cell
END TYPE BOUNDARY_COORD_TYPE

TYPE WALL_TYPE
   INTEGER :: BC_INDEX=0              !< Index within the array BOUNDARY_COORD
   INTEGER :: B1_INDEX=0              !< Index within the array BOUNDARY_PROP1
END TYPE WALL_TYPE

  ! define a nested derived data type
  type, public :: MESHES_type
      type(WALL_TYPE), managed, allocatable, dimension(:) :: WALL
      type(BOUNDARY_COORD_TYPE), managed, allocatable,dimension(:) :: BOUNDARY_COORD
      type(BOUNDARY_PROP1_TYPE), managed, allocatable,dimension(:) :: BOUNDARY_PROP1
  end type MESHES_type
! define a nested derived data type variable on the device side
  !type(MESHES_type), managed, allocatable, target  :: MESHES(:)

  type(MESHES_type), allocatable, target, dimension(:)  :: MESHES

  type(WALL_TYPE), managed, pointer, dimension(:) :: P_WALL, tmp_WALL
  type(BOUNDARY_COORD_TYPE), managed, pointer,dimension(:) :: P_BOUNDARY_COORD, tmp_BOUNDARY_COORD
  type(BOUNDARY_PROP1_TYPE), managed, pointer,dimension(:) :: P_BOUNDARY_PROP1, tmp_BOUNDARY_PROP1

  TYPE(WALL_TYPE), managed, POINTER :: WC
  TYPE(BOUNDARY_COORD_TYPE),managed,  POINTER :: BC
  TYPE(BOUNDARY_PROP1_TYPE),managed,  POINTER :: B1

contains

        subroutine test_cpy
                implicit none
                integer :: I, J, K, istat, IW
                !type(BOUNDARY_COORD_TYPE), allocatable,dimension(:) :: BC_DUMMY
                type(BOUNDARY_COORD_TYPE), managed, allocatable,dimension(:) :: BC_DUMMY
                type(MESHES_type), pointer :: M
                integer, parameter :: OLD_DIM = 4608, ADD_DIM = 1000

               allocate(MESHES(2))
               do I=1,2 
                   allocate(MESHES(I)%WALL(OLD_DIM))
                   allocate(MESHES(I)%BOUNDARY_COORD(OLD_DIM))
                   allocate(MESHES(I)%BOUNDARY_PROP1(OLD_DIM))
                   do J=1, OLD_DIM
                       allocate(MESHES(I)%BOUNDARY_PROP1(J)%M_DOT_G_PP_ACTUAL(10))
                   enddo
               enddo

                M => MESHES(1)
                tmp_WALL => M%WALL
                tmp_BOUNDARY_COORD => M%BOUNDARY_COORD
                tmp_BOUNDARY_PROP1 => M%BOUNDARY_PROP1

                !ALLOCATE(BC_DUMMY(1:OLD_DIM + ADD_DIM))
                !BC_DUMMY(1:OLD_DIM) = M%BOUNDARY_COORD(1:OLD_DIM)
                !BC_DUMMY(1:OLD_DIM) = tmp_BOUNDARY_COORD(1:OLD_DIM)
                !CALL MOVE_ALLOC(BC_DUMMY,M%BOUNDARY_COORD)

                !$CUF kernel do (1) <<<*,*, stream=cudaforGetDefaultStream() >>>
                DO IW=1, 300
                    WC => tmp_WALL(IW)
                    WC%BC_INDEX= IW
                    WC%B1_INDEX= IW
                    BC => tmp_BOUNDARY_COORD(WC%BC_INDEX) 
                    BC%IOR= IW
                    B1 => tmp_BOUNDARY_PROP1(WC%B1_INDEX)
                    B1%K_G = IW * 1.0
                enddo

                istat = cudaDeviceSynchronize()

               do I=1, 30
                   print *, "done,  for WALL (", I,") :", tmp_WALL(I)%BC_INDEX 
                   print *, "done,  for BOUNDARY_COORD (", I, ") :", tmp_BOUNDARY_COORD(I)%IOR
                   print *, "done,  for BOUNDARY_PROP1(", I, ") :", tmp_BOUNDARY_PROP1(I)%K_G
               enddo

               do I=1,2
                  deallocate(MESHES(I)%WALL)
                  deallocate(MESHES(I)%BOUNDARY_COORD)
                  do J=1, OLD_DIM
                      deallocate(MESHES(I)%BOUNDARY_PROP1%M_DOT_G_PP_ACTUAL)
                  enddo
                  deallocate(MESHES(I)%BOUNDARY_PROP1)
               enddo
               deallocate(MESHES)

        end subroutine test_cpy
end module datastructures

program main
  !@cuf use cudafor
  use datastructures
  implicit none

  call test_cpy

  print*, "completed style 5"

end program main

The last print, “completed style 5”, is never reached because of a deallocation error. This raises a question: if a derived type is managed, should we always declare its allocatable members managed as well? And what should we do if we do not need its allocatable members to be managed? Thanks.

This code was compiled with
nvfortran -g -cuda -gpu=debug -O0 -Mpreprocess -Mlarge_arrays -m64 -Wall -Werror -gpu=ccall,nomanaged,implicitsections -stdpar -traceback -Minfo=accel -cpp -o test_min driver_min.F90
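
To make the question concrete, these are the two declarations I am choosing between for the nested member (only the first appears in the toy code above; the second, with the managed attribute, is the hypothetical variant I am asking about):

REAL(EB), ALLOCATABLE, DIMENSION(:) :: M_DOT_G_PP_ACTUAL            ! as in the toy code above
REAL(EB), MANAGED, ALLOCATABLE, DIMENSION(:) :: M_DOT_G_PP_ACTUAL   ! hypothetical managed variant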

Sincerely,

Honggang Wang.

Hi Honggang Wang,

The deallocation error is due to an error in your code.

To fix it, add “(J)” after “BOUNDARY_PROP1” in the inner deallocation loop, changing

deallocate(MESHES(I)%BOUNDARY_PROP1%M_DOT_G_PP_ACTUAL)

to

deallocate(MESHES(I)%BOUNDARY_PROP1(J)%M_DOT_G_PP_ACTUAL)
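
For reference, the cleanup loop with only that index added (otherwise unchanged from your code) would be:

               do I=1,2
                  deallocate(MESHES(I)%WALL)
                  deallocate(MESHES(I)%BOUNDARY_COORD)
                  do J=1, OLD_DIM
                     ! free each element's nested allocatable before freeing the parent array
                     deallocate(MESHES(I)%BOUNDARY_PROP1(J)%M_DOT_G_PP_ACTUAL)
                  enddo
                  deallocate(MESHES(I)%BOUNDARY_PROP1)
               enddo
               deallocate(MESHES)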

Hope this helps,
Mat

Thank you Mat, my bad.

Does this mean that there is no difference whether we declare M_DOT_G_PP_ACTUAL as managed or not? What is the best practice in this case?

In my large code, I find that the results differ depending on this choice: with the member declared managed, the results are identical to the CPU code, while without managed they are slightly different. However, declaring it managed lowers the performance.

Thanks.

Sincerely,

I’ve forgotten the details off-hand, but I thought that in some contexts, if the parent type array is managed, its children are managed as well. But I’m not sure about this, so I think the best practice would be to add managed to all allocatable arrays that you’ll use on the device.
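
For example, if you do access M_DOT_G_PP_ACTUAL on the device, the type in your toy code would be declared along these lines (just a sketch following that advice, not something I’ve re-tested here):

TYPE BOUNDARY_PROP1_TYPE
   ! managed on the nested allocatable so it is accessible from device code
   REAL(EB), MANAGED, ALLOCATABLE, DIMENSION(:) :: M_DOT_G_PP_ACTUAL
   REAL(EB) :: K_G = 0.1_EB
END TYPE BOUNDARY_PROP1_TYPE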

Thank you so much Mat.

So, if I do not use the child allocatable arrays on the device, then I do not need to declare them as managed, right?

Thanks.