Hi Nvidia Experts, I’d like to copy derived type variable between host and device, but couldn’t get the correct answers, as shown in my toy code:
module datastructures
use cudafor
!use nvtx_mod
! this is data structure you start with
type, public :: element_type
integer ,pinned :: Nnodes ! need on gpu
real(kind=8) :: volume ! not needed on gpu
!integer , device :: Nnodes ! need on gpu
!real(kind=8) , device :: volume ! not needed on gpu
end type element_type
! define a nested derived data type
type, public :: GPUelement_type
type(element_type), device, allocatable :: element(:)
end type GPUelement_type
! define a nested derived data type variable on the device side
type(GPUelement_type), allocatable, target :: GPUelement(:)
!type(GPUelement_type), device, allocatable, target :: GPUelement(:)
! define a nested derived data on the host side
type, public :: Hostelement_type
type(element_type), allocatable :: element(:)
end type Hostelement_type
type(Hostelement_type), allocatable, target :: Hostelement(:)
type(element_type), device, pointer :: P_element(:)
contains
subroutine test_cpy
implicit none
integer :: I, J, K, istat
allocate(Hostelement(3))
do I=1, 3
allocate(Hostelement(I)%element(3))
do J=1, 3
Hostelement(I)%element(J)%Nnodes = I + J
Hostelement(I)%element(J)%volume = I * J * 2.22
enddo
enddo
allocate(GPUelement(3))
do I=1, 3
allocate(GPUelement(I)%element(3))
!cudaMalloc(C_DEVLOC(GPUelement(I)%element), 3)
!istat = cudaMemcpy(C_DEVLOC(GPUelement(I)%element), C_LOC(Hostelement(I)%element), size(Hostelement(I)%element), 0 )
!istat = cudaDeviceSynchronize()
enddo
istat = cudaMemcpy(C_DEVLOC(GPUelement), C_LOC(Hostelement), size(Hostelement), 0 )
istat = cudaDeviceSynchronize()
!D_GPUelement = GPUelement
!K = GPUelement(1)%element(1)%Nnodes
print *, "K=", K
J = Hostelement(1)%element(1)%Nnodes
print *, "J=", J
P_element = GPUelement(1)%element
!$cuf kernel do(1) <<<*,*, stream=cudaforGetDefaultStream() >>>
do I=1, 3
!if( I == 1 ) print *, GPUelement(I)%element(1)%Nnodes
if( I == 1 ) print *, P_element(I)%Nnodes
enddo
istat = cudaDeviceSynchronize()
print *, "done"
do I=1,3
deallocate(GPUelement(I)%element)
deallocate(Hostelement(I)%element)
enddo
deallocate(GPUelement, Hostelement)
end subroutine test_cpy
end module datastructures
program main
!@cuf use cudafor
use datastructures
implicit none
call test_cpy
print*, "completed style 5"
end program main
Compiled with : nvfortran -g -cuda -o test driver.F90
When running, I got:
./test
K= 0
J= 2
done
0: DEALLOCATE: an illegal memory access was encountered
- I am expecting K and J are same. What should I do ? ( I do not want to use open_acc in my specific environment)
- Why the CUF kernel cannot print anything?
- The error information will disappear if I comment out the CUF kernel. Why?
- If I define GPUelement with device, I will get segmental fault. Why? a more general question is: where should we use device for multilayer nested derived type variables definition?
Thanks.
Thanks.