In an attempt to test the derived types, openacc and CUDA Fortran together, I came upon an error. When I try to copy the derived type to the Device using OpenACC directives, I get the following error.
NVFORTRAN-F-0155-Compiler failed to translate accelerator region (see -Minfo messages): Unexpected Data Type in Deep Copy
Apparently, it is caused by Complex(kind=8) within the module MOLECULE. If I change it to (kind=4) or comment it out, it works fine.
My question here is that Complex(kind=8) works fine when using direct transfer (Given below under the heading INDEPENDENT PROGRAM) via assignment but generates error when using deepcopy option wih openacc. According to the SDK Documentation (CUDA Fortran Programming Guide Version 22.7 for ARM, OpenPower, x86), kind=4 and kind=8 both are supported, So why do I get error in deepcopy with kind=8 in the dervied type?
MODULE # 01
module MOLECULE
type :: molecule_details
real(kind=8) :: try = 0.0
integer, allocatable :: p_num(:)
character(1), allocatable :: name(:)
character(1), allocatable :: tag(:,:)
complex(kind = 8), allocatable :: orientation(:)
end type molecule_details
end module MOLECULE
MODULE # 02
module global
use MOLECULE, only: molecule_details
type (molecule_details), allocatable, target :: atoms(:)
integer :: constituents
end module global
MAIN PROGRAM
module mathOps
use global
contains
attributes(global) subroutine banker(x, a, SIZE)
implicit none
type (molecule_details) :: x(*)
integer, value :: SIZE
integer :: j,k
real, value :: a
integer :: i
i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
if (i <= SIZE) then
x(i)%name(1) = ‘R’
x(i)%name(2) = ‘A’
do j=1,30
do k=1,30
if(x(i)%tag(1,1) == ‘M’ .and. x(i)%tag(3,3) == ‘R’) then
x(i)%tag(1,1) = ‘B’
x(i)%tag(2,2) = ‘U’
x(i)%tag(3,3) = ‘F’
end if
end do
end do
x(i)%try = x(i)%try + a
x(i)%p_num(1) = x(i)%p_num(1) + a
end if
end subroutine banker
end module mathOps
program test_deep_copy
use mathOps; use global; use cudafor; use openacc
implicit none
real :: a
integer :: i
type(dim3) :: grid, tBlock
integer(kind=cuda_stream_kind) istr, isync
isync = 0
constituents = 10000
a = 2.
tBlock = dim3(256,1,1)
grid = dim3(ceiling(real(constituents)/tBlock%x),1,1)
allocate(atoms(1:constituents))
DO i=1,constituents
allocate(atoms(i)%p_num(2))
allocate(atoms(i)%name(30))
allocate(atoms(i)%tag(30, 30))
atoms(i)%tag(1,1) = ‘M’
atoms(i)%tag(3,3) = ‘R’
end do
print*, 'The name of atoms(1) before Kernel call is ', atoms(1)%name
print*, ’ The tag of atoms(1) before Kernel call is ', atoms(1)%tag
write(,) atoms(1)%try
write(,) atoms(10000)%p_num(1)
!$acc enter data copyin(atoms(:), constituents, a)
!$acc host_data use_device(atoms)
istr = acc_get_cuda_stream(isync)
call banker<<<grid, tBlock>>>(atoms, a, constituents)
!$acc end host_data
!$acc wait
!$acc exit data copyout(atoms(:))
print*, 'The name of atoms(1) after Kernel call is ', atoms(1)%name
print*, ’ The tag of atoms(1) after Kernel call is ', atoms(1)%tag
deallocate(atoms)
end program test_deep_copy
INDEPENDENT PROGRAM
module simpleOps_m
contains
attributes(global) subroutine increment(a, b, ca)
implicit none
integer, intent(inout) :: a(:)
integer, value :: b
complex(kind=8), allocatable :: ca
integer :: i
write (*,*) 'Complex number is ', ca(2)
i = threadIdx % x
a(i) = a(i)+b
end subroutine increment
end module simpleOps_m
program incrementTestGPU
use cudafor
use simpleOps_m
implicit none
complex(kind=8), allocatable :: ca
integer, parameter :: n = 256
integer :: a(n), b
integer, device :: a_d(n)
complex(kind=8), device, allocatable :: ca_d
allocate(ca(2))
ca(1) = (1,1)
ca(2) = (2,2)
a = 1
b = 3
write(*,*) ' Complex Number is ', ca
a_d = a
ca_d = ca
call increment<<<1,n>>>(a_d, b, ca_d)
a = a_d
if(any(a/=4)) then
write(*,*) '*** Program Failed ***'
else
write(*,*) 'Program Passed'
endif
end program incrementTestGPU
The HPC SDK version is 22.1 and I am using the deepcopy option while compiling.
Any help would be appreciated.