How to cudaMemcpy and use nested derived type data in CUDA Fortran

Hi Nvidia Experts, I’d like to copy a derived type variable between host and device, but I couldn’t get the correct answers, as shown in my toy code:

module datastructures

  ! NOTE(review): this is the original (broken) device-derived-type attempt.
  ! The cudaMemcpy below copies the host-side array/descriptor storage of
  ! GPUelement, not the device element data it is meant to describe --
  ! TODO confirm against the CUDA Fortran reference before relying on it.

  use cudafor
  !use nvtx_mod

  ! this is data structure you start with
  type, public :: element_type
     ! NOTE(review): "pinned" is normally an attribute of host allocatable
     ! arrays; its meaning on a scalar derived-type component is unclear --
     ! verify that this compiles/behaves as intended.
     integer       ,pinned                    :: Nnodes ! need on gpu
     real(kind=8)                      :: volume ! not needed on gpu
     !integer       , device :: Nnodes ! need on gpu
     !real(kind=8)  , device             :: volume ! not needed on gpu
  end type element_type

  ! define a nested derived data type
  type, public :: GPUelement_type
      type(element_type), device, allocatable :: element(:)
  end type GPUelement_type

  ! define a nested derived data type variable on the device side
  ! (the outer array itself is host-resident; only element(:) is device)
  type(GPUelement_type), allocatable, target  :: GPUelement(:)
  !type(GPUelement_type), device, allocatable, target  :: GPUelement(:)

  ! define a nested derived data on the host side 
  type, public :: Hostelement_type
      type(element_type), allocatable :: element(:)
  end type Hostelement_type
  type(Hostelement_type), allocatable, target  :: Hostelement(:)

   type(element_type), device, pointer :: P_element(:)
contains

        ! Toy experiment: fill a nested host structure, try to mirror it on
        ! the device, then read one field back from a CUF kernel.
        subroutine test_cpy
                implicit none
                integer :: I, J, K, istat

               ! Build and fill the 3x3 host-side nested structure.
               allocate(Hostelement(3))
               do I=1, 3
                   allocate(Hostelement(I)%element(3))
                   do J=1, 3
                        Hostelement(I)%element(J)%Nnodes = I + J
                        Hostelement(I)%element(J)%volume = I * J * 2.22
                   enddo
               enddo

               allocate(GPUelement(3))
               do I=1, 3
                   allocate(GPUelement(I)%element(3))
                   !cudaMalloc(C_DEVLOC(GPUelement(I)%element), 3)
                   !istat =  cudaMemcpy(C_DEVLOC(GPUelement(I)%element), C_LOC(Hostelement(I)%element), size(Hostelement(I)%element), 0 )
                   !istat = cudaDeviceSynchronize()
               enddo

               ! NOTE(review): GPUelement is a host array, so C_DEVLOC of it
               ! is questionable, and size(Hostelement) is an element count
               ! (3), not a byte count -- verify the count/kind arguments
               ! against the cudaMemcpy interface in cudafor.
               istat =  cudaMemcpy(C_DEVLOC(GPUelement), C_LOC(Hostelement), size(Hostelement), 0 )
               istat = cudaDeviceSynchronize()

               !D_GPUelement = GPUelement
               !K = GPUelement(1)%element(1)%Nnodes
               ! NOTE(review): K is printed without ever being assigned (its
               ! assignment above is commented out), so "K=" shows an
               ! indeterminate value (0 here by accident).
               print *, "K=", K

               J = Hostelement(1)%element(1)%Nnodes
               print *, "J=", J

               ! NOTE(review): P_element is a POINTER; "=" is ordinary
               ! assignment through an unassociated pointer.  Pointer
               ! association would be "P_element => GPUelement(1)%element".
               ! This mirrors the WC/BC "=" vs "=>" issue resolved later in
               ! this thread.
               P_element = GPUelement(1)%element
               !$cuf kernel do(1) <<<*,*, stream=cudaforGetDefaultStream() >>>
               do I=1, 3
                   !if( I == 1 ) print *, GPUelement(I)%element(1)%Nnodes
                   if( I == 1 ) print *, P_element(I)%Nnodes
               enddo

               istat = cudaDeviceSynchronize()

               print *, "done" 

               ! NOTE(review): the illegal memory access triggered above
               ! tends to surface here, at the first deallocation after the
               ! kernel launch.
               do I=1,3
                  deallocate(GPUelement(I)%element)
                  deallocate(Hostelement(I)%element)
               enddo
               deallocate(GPUelement, Hostelement)

        end subroutine test_cpy
end module datastructures
!> Driver: run the host/device copy experiment and report completion.
program main
  !@cuf use cudafor
  use datastructures
  implicit none

  ! All of the interesting work happens inside the module subroutine.
  call test_cpy()

  print *, "completed style 5"

end program main

Compiled with : nvfortran -g -cuda -o test driver.F90

When running, I got:

./test

K= 0
J= 2
done
0: DEALLOCATE: an illegal memory access was encountered

  1. I am expecting K and J to be the same. What should I do? (I do not want to use OpenACC in my specific environment.)
  2. Why the CUF kernel cannot print anything?
  3. The error information will disappear if I comment out the CUF kernel. Why?
  4. If I define GPUelement with the device attribute, I get a segmentation fault. Why? A more general question: where should we put the device attribute when defining multilayer nested derived type variables?

Thanks.

Thanks.

Sorry, the line
!K = GPUelement(1)%element(1)%Nnodes
should be activated; the outputs do not change.

Hi honggangwang1979,

Using device derived types is very tricky. You’re much better off using “managed” instead of “device” or use OpenACC data directives where the compiler can update the descriptors as well as the data.

Here’s your example but using managed, which is simpler:

module datastructures

  use cudafor

  !> One mesh element: a node count (read on the GPU) and a volume that is
  !> only ever touched on the host.
  type, public :: element_type
     integer        :: Nnodes ! need on gpu
     real(kind=8)   :: volume ! not needed on gpu
  end type element_type

  !> Wrapper type so a jagged array-of-arrays of elements can be built.
  type, public :: element_type_array
      type(element_type), managed, allocatable :: element(:)
  end type element_type_array

  !> Managed storage: visible to both host and device with no explicit copies.
  type(element_type_array), allocatable, managed :: elements(:)

contains

  !> Populate the nested structure on the host, read it back on both the
  !> host and the device, then release everything.
  subroutine test_cpy
    implicit none
    integer :: i, j, k, istat

    ! Build a 3x3 jagged structure filled with known values.
    allocate(elements(3))
    do i = 1, 3
       allocate(elements(i)%element(3))
       do j = 1, 3
          elements(i)%element(j)%Nnodes = i + j
          elements(i)%element(j)%volume = i * j * 2.22
       end do
    end do

    ! Host-side reads of the same field; both should print 2.
    k = elements(1)%element(1)%Nnodes
    print *, "K=", k

    j = elements(1)%element(1)%Nnodes
    print *, "J=", j

    ! Device-side read of the same managed data.
    !$cuf kernel do(1) <<<*,*, stream=cudaforGetDefaultStream() >>>
    do i = 1, 3
       if( i == 1 ) print *, elements(1)%element(i)%Nnodes
    end do

    istat = cudaDeviceSynchronize()
    print *, "done"

    ! Tear down in reverse order of allocation.
    do i = 1, 3
       deallocate(elements(i)%element)
    end do
    deallocate(elements)
  end subroutine test_cpy
end module datastructures
!> Driver: run the managed-memory variant of the copy experiment.
program main
  !@cuf use cudafor
  use datastructures
  implicit none

  call test_cpy()

  print *, "completed style 5"

end program main
% nvfortran derived_managed.CUF; a.out
 K=            2
 J=            2
            2
 done
 completed style 5

-Mat

Thank you Mat for the quick reply!

I will try this.

Have a good night!

Hi Mat, I modified your code a little bit so that it captures the flavor of my large code, but it failed. Would you please help me out with this? Thanks.

Here are the code:

module datastructures

  ! Managed-memory mesh/wall data structures (FDS-flavored toy example).
  !
  ! Fix relative to the original posting: inside the CUF kernel the code
  ! used ordinary assignment, "WC = tmp_WALL(I)", where pointer assignment
  ! "WC => tmp_WALL(I)" was intended.  Ordinary assignment writes THROUGH
  ! the never-associated pointers WC/BC, which is the illegal memory
  ! access behind the "cuMemFree returns error code 700" messages.
  ! Furthermore, WC and BC are single module-level variables shared by
  ! every device thread, so even retargeting them with "=>" inside a
  ! parallel loop is a data race.  The kernel below therefore indexes the
  ! managed arrays directly; WC/BC stay declared so the module's public
  ! interface is unchanged.

  use cudafor

  INTEGER, PARAMETER :: EB = SELECTED_REAL_KIND(12)    !< Precision of "Eight Byte" reals

  TYPE BOUNDARY_COORD_TYPE

   INTEGER :: II=0             !< Ghost cell x index
   INTEGER :: JJ=0             !< Ghost cell y index
   INTEGER :: KK=0             !< Ghost cell z index
   INTEGER :: IIG=0            !< Gas cell x index
   INTEGER :: JJG=0            !< Gas cell y index
   INTEGER :: KKG=0            !< Gas cell z index
   INTEGER :: IOR=0          !< Index of orientation of the WALL cell

   REAL(EB) :: X             !< x coordinate of boundary cell center
   REAL(EB) :: Y             !< y coordinate of boundary cell center
   REAL(EB) :: Z             !< z coordinate of boundary cell center
   REAL(EB) :: X1            !< Lower x extent of boundary cell (m)
   REAL(EB) :: X2            !< Upper x extent of boundary cell (m)
   REAL(EB) :: Y1            !< Lower y extent of boundary cell (m)
   REAL(EB) :: Y2            !< Upper y extent of boundary cell (m)
   REAL(EB) :: Z1            !< Lower z extent of boundary cell (m)
   REAL(EB) :: Z2            !< Upper z extent of boundary cell (m)

END TYPE BOUNDARY_COORD_TYPE

TYPE WALL_TYPE

   REAL(EB) :: DUNDT=0._EB            !< \f$ \partial u_n / \partial t \f$
   REAL(EB) :: Q_LEAK=0._EB           !< Heat production of leaking gas (W/m3)
   REAL(EB) :: V_DEP=0._EB            !< Deposition velocity (m/s)
   REAL(EB) :: VEL_ERR_NEW=0._EB      !< Velocity mismatch at mesh or solid boundary (m/s)

   INTEGER :: BC_INDEX=0              !< Index within the array BOUNDARY_COORD
   INTEGER :: OD_INDEX=0              !< Index within the array BOUNDARY_ONE_D
   INTEGER :: TD_INDEX=0              !< Index within the array BOUNDARY_THR_D
   INTEGER :: B1_INDEX=0              !< Index within the array BOUNDARY_PROP1
   INTEGER :: B2_INDEX=0              !< Index within the array BOUNDARY_PROP2
   INTEGER :: BR_INDEX=0              !< Index within the array BOUNDARY_RADIA
   INTEGER :: SURF_INDEX=0            !< Index of the SURFace conditions
   INTEGER :: BACK_INDEX=0            !< WALL index of back side of obstruction or exterior wall cell
   INTEGER :: BACK_MESH=0             !< Mesh number on back side of obstruction or exterior wall cell
   INTEGER :: BACK_SURF=0             !< SURF_INDEX on back side of obstruction or exterior wall cell
   INTEGER :: BOUNDARY_TYPE=0         !< Descriptor: SOLID, MIRROR, OPEN, INTERPOLATED, etc
   INTEGER :: SURF_INDEX_ORIG=0       !< Original SURFace index for this cell
   INTEGER :: OBST_INDEX=0            !< Index of the OBSTruction
   INTEGER :: PRESSURE_BC_INDEX       !< Poisson boundary condition, NEUMANN or DIRICHLET
   INTEGER :: VENT_INDEX=0            !< Index of the VENT containing this cell
   INTEGER :: JD11_INDEX=0
   INTEGER :: JD12_INDEX=0
   INTEGER :: JD21_INDEX=0
   INTEGER :: JD22_INDEX=0
   INTEGER :: CUT_FACE_INDEX=0
   INTEGER :: N_REALS=0               !< Number of reals to pack into restart or send/recv buffer
   INTEGER :: N_INTEGERS=0            !< Number of integers to pack into restart or send/recv buffer
   INTEGER :: N_LOGICALS=0            !< Number of logicals to pack into restart or send/recv buffer

END TYPE WALL_TYPE

  !> Per-mesh container of managed wall / boundary-coordinate arrays.
  type, public :: MESHES_type
      type(WALL_TYPE), managed, allocatable, dimension(:) :: WALL
      type(BOUNDARY_COORD_TYPE), managed, allocatable,dimension(:) :: BOUNDARY_COORD
  end type MESHES_type

  !> The outer array is host-resident; the inner arrays are managed.
  type(MESHES_type), allocatable, target, dimension(:)  :: MESHES

  type(WALL_TYPE), managed, pointer, dimension(:) :: P_WALL, tmp_WALL
  type(BOUNDARY_COORD_TYPE), managed, pointer,dimension(:) :: P_BOUNDARY_COORD, tmp_BOUNDARY_COORD

  ! Kept for interface compatibility with the original posting; no longer
  ! used inside the device kernel (see the race note in the module header).
  TYPE(WALL_TYPE), managed, POINTER :: WC
  TYPE(BOUNDARY_COORD_TYPE),managed,  POINTER :: BC

contains

        !> Allocate two meshes, set BC_INDEX / IOR for the first 300 wall
        !> cells of mesh 1 on the device, print two sample values (both
        !> should be 2), then deallocate everything.
        subroutine test_cpy
                implicit none
                integer :: I, istat

               allocate(MESHES(2))
               do I=1,2
                   allocate(MESHES(I)%WALL(5608))
                   allocate(MESHES(I)%BOUNDARY_COORD(5608))
               enddo

                ! Pointer assignment ("=>", NOT "="): alias the managed
                ! arrays of mesh 1 so the kernel can see them.
                tmp_WALL => MESHES(1)%WALL
                tmp_BOUNDARY_COORD => MESHES(1)%BOUNDARY_COORD
 !$CUF kernel do (1) <<<*,*, stream=cudaforGetDefaultStream() >>>
                DO I=1, 300
                    ! Index directly rather than retargeting the shared
                    ! module pointers WC/BC from many threads at once.
                    ! Each iteration touches only its own array elements,
                    ! so the loop is race-free.
                    tmp_WALL(I)%BC_INDEX = I
                    tmp_BOUNDARY_COORD(tmp_WALL(I)%BC_INDEX)%IOR = I
                enddo
                istat = cudaDeviceSynchronize()

               print *, "done,  for WALL (2) :", tmp_WALL(2)%BC_INDEX
               print *, "done,  for BOUNDARY_COORD (2) :", tmp_BOUNDARY_COORD(2)%IOR

               do I=1,2
                  deallocate(MESHES(I)%WALL)
                  deallocate(MESHES(I)%BOUNDARY_COORD)
               enddo
               deallocate(MESHES)

        end subroutine test_cpy
end module datastructures

!> Driver: run the MESHES/WALL managed-memory experiment.
program main
  !@cuf use cudafor
  use datastructures
  implicit none

  call test_cpy()

  print *, "completed style 5"

end program main

Compiled with:

nvfortran -g -cuda -O0 -Mpreprocess -Mlarge_arrays -m64 -Wall -Werror -gpu=ccall,nomanaged,implicitsections -stdpar -traceback -Minfo=accel -cpp -o test_mg driver_managed.F90

when running it, I got errors :

# ./test_mg
 done,  for WALL (2) :            0
 done,  for BOUNDARY_COORD (2) :            0
free: cuMemFree returns error code 700
free: cuMemFree returns error code 700
free: cuMemFree returns error code 700
free: cuMemFree returns error code 700
 completed style 5

If I debug into it, I got

CUDA Exception: Warp Illegal Address
The exception was triggered at PC 0x7fffb53dd2d0 (driver_managed.F90:102)

Thread 1 "test_mg" received signal CUDA_EXCEPTION_14, Warp Illegal Address.
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
0x00007fffb53dd660 in datastructures_test_cpy_96_gpu<<<(3,1,1),(128,1,1)>>> (tmp_wall=0x7fffb2000000, wc=<not associated>, tmp_boundary_coord=0x7fffb20af400, bc=<not associated>) at driver_managed.F90:102
102                     istat = cudaDeviceSynchronize()

Thanks!

Sincerely,

Honggang Wang.

I think the issue is here where WC and BC should use pointer assignment, “=>”, not “=”.

 !$CUF kernel do (1) <<<*,*, stream=cudaforGetDefaultStream() >>>
                DO I=1, 300
                    WC => tmp_WALL(I)
                    WC%BC_INDEX= I
                    BC => tmp_BOUNDARY_COORD(WC%BC_INDEX)
                    BC%IOR= I
                enddo

Thank you so much!

I will check if the large code works in this way.