Hi experts, I have to use derived types in the kernel due to the structure of the current code. Below is a minimum repeatable case of what I am doing but cannot be compiled:
MODULE m_fields
!@cuf use cudafor
integer :: n = 100
type pair
REAL*8, ALLOCATABLE :: qv(:,:,:)
REAL*8, ALLOCATABLE :: t(:,:,:)
end type
type (pair) :: pair_t_qv
type D_pair
REAL*8, device, ALLOCATABLE :: qv(:,:,:)
REAL*8, device, ALLOCATABLE :: t(:,:,:)
end type
type (D_pair), device :: D_pair_t_qv
!type (pair), managed, allocatable, target :: pair_t_qv(:)
!!@cuf attributes(managed) :: pair_t_qv
contains
subroutine kw_2c( pair_t_qv, D_pair_t_qv, n )
!@cuf use cudafor
implicit none
integer, value :: n
type (pair) :: pair_t_qv
type (D_pair), device :: D_pair_t_qv
integer :: I,J,K,istat
integer*4 :: blockDim_x=8, blockDim_y=8, blockDim_z=8
integer*4 :: gridDim_x, gridDim_y, gridDim_z
integer, device :: d_n
integer, device :: d_n
type(dim3) :: grid, block
!istat=cudaMemset(pair_t_qv, 0._EB, size(pair_t_qv) )
gridDim_x = (n+blockDim_x-1)/blockDim_x
gridDim_y = (n+blockDim_y-1)/blockDim_y
gridDim_z = (n+blockDim_z-1)/blockDim_z
grid = dim3(gridDim_x, gridDim_y,gridDim_z)
block= dim3(blockDim_x, blockDim_y,blockDim_z)
d_n = n
!D_pair_t_qv = pair_t_qv
istat = cudaMemcpy(D_pair_t_qv%qv(:,:,:), pair_t_qv%qv(:,:,:), size(pair_t_qv%qv(:,:,:) ) )
istat = cudaMemcpy(D_pair_t_qv%t(:,:,:), pair_t_qv%t(:,:,:), size(pair_t_qv%t(:,:,:) ) )
call kernel_2c<<<grid,block,0,cudaforGetDefaultStream()>>>(D_pair_t_qv,d_n)
istat = cudaDeviceSynchronize()
istat = cudaMemcpy(pair_t_qv%qv(:,:,:), D_pair_t_qv%qv(:,:,:), size(pair_t_qv%qv(:,:,:) ) )
istat = cudaMemcpy(pair_t_qv%t(:,:,:), D_pair_t_qv%t(:,:,:), size(pair_t_qv%t(:,:,:) ) )
!pair_t_qv = D_pair_t_qv
end subroutine kw_2c
attributes(global) subroutine kernel_2c(pair_t_qv, n)
implicit none
integer*4 :: I, J, K, n
type (D_pair) :: pair_t_qv
I = (blockIdx%x-1) * blockDim%x + threadIdx%x
J = (blockIdx%y-1) * blockDim%y + threadIdx%y
K = (blockIdx%z-1) * blockDim%z + threadIdx%z
if(I .ge. 1 .and. I .le. n .and. J .ge. 1 .and. J .le. n .and. K .ge. 1 .and. K .le. n ) then
pair_t_qv%qv(I,J,K) = I+J+K
pair_t_qv%t(I,J,K) = I*J*K
endif
end subroutine kernel_2c
END MODULE m_fields
program test_kernel_device
!@cuf use cudafor
use m_fields
!type(cudaExtent) :: VS
!VS = cudaExtent(n,n,n)
ALLOCATE ( pair_t_qv%t(n,n,n) )
ALLOCATE ( pair_t_qv%qv(n,n,n) )
cudaMalloc( D_pair_t_qv%t, n*n*n*sizeof(real))
cudaMalloc( D_pair_t_qv%qv, n*n*n*sizeof(real))
!cudaMalloc3D( D_pair_t_qv%t, VS)
!cudaMalloc3D( D_pair_t_qv%qv, VS)
!both allocable array or array pointer work
!!$cuf kernel do(3) <<<*,*>>>
call kw_2c( pair_t_qv, D_pair_t_qv, n )
DO j=1, n
DO i=1, n
DO k=1, n
if( pair_t_qv%t(i,j,k) .ne. i+j+k ) then
print *, i,j,k, "error"
exit
endif
ENDDO
ENDDO
ENDDO
print*, pair_t_qv%t(n,n,n)
print*, pair_t_qv%qv(n,n,n)
deallocate(pair_t_qv%t)
deallocate(pair_t_qv%qv)
end program
When trying to compile with
TCUF# nvfortran -g -pg -Mlarge_arrays -m64 -Wall -Werror -gpu=ccall,managed,implicitsections -stdpar -traceback -Minfo=accel -cpp -cuda -o 2c 2c.f90
I got:
NVFORTRAN-S-0034-Syntax error at or near end of line (2c.f90: 87)
NVFORTRAN-S-0034-Syntax error at or near end of line (2c.f90: 88)
0 inform, 0 warnings, 2 severes, 0 fatal for test_kernel_device
Thanks.