How to use derived type in cuda fortran kernel

Hi experts, I have to use derived types in the kernel due to the structure of the current code. Below is a minimum repeatable case of what I am doing but cannot be compiled:

MODULE m_fields
    !@cuf use cudafor
    integer :: n = 100
    type pair
       REAL*8,  ALLOCATABLE :: qv(:,:,:)
       REAL*8,  ALLOCATABLE :: t(:,:,:)
    end type
    type (pair)  :: pair_t_qv

    type D_pair
       REAL*8,  device, ALLOCATABLE :: qv(:,:,:)
       REAL*8,  device, ALLOCATABLE :: t(:,:,:)
    end type
    type (D_pair), device   :: D_pair_t_qv

     !type (pair), managed, allocatable, target  :: pair_t_qv(:)
     !!@cuf attributes(managed) :: pair_t_qv


     subroutine kw_2c( pair_t_qv, D_pair_t_qv,  n )
             !@cuf use cudafor
          implicit none

          integer, value :: n

          type (pair) :: pair_t_qv 
          type (D_pair), device   :: D_pair_t_qv

          integer :: I,J,K,istat

        integer*4 ::  blockDim_x=8, blockDim_y=8, blockDim_z=8
       integer*4 :: gridDim_x, gridDim_y, gridDim_z

       integer, device :: d_n

integer, device :: d_n

       type(dim3) :: grid, block

       !istat=cudaMemset(pair_t_qv, 0._EB, size(pair_t_qv) )

      gridDim_x = (n+blockDim_x-1)/blockDim_x
      gridDim_y = (n+blockDim_y-1)/blockDim_y
      gridDim_z = (n+blockDim_z-1)/blockDim_z
      grid = dim3(gridDim_x, gridDim_y,gridDim_z)
      block= dim3(blockDim_x, blockDim_y,blockDim_z)

      d_n = n
      !D_pair_t_qv = pair_t_qv
      istat = cudaMemcpy(D_pair_t_qv%qv(:,:,:), pair_t_qv%qv(:,:,:), size(pair_t_qv%qv(:,:,:) ) )
      istat = cudaMemcpy(D_pair_t_qv%t(:,:,:), pair_t_qv%t(:,:,:), size(pair_t_qv%t(:,:,:) ) )
      call kernel_2c<<<grid,block,0,cudaforGetDefaultStream()>>>(D_pair_t_qv,d_n)
      istat = cudaDeviceSynchronize()

      istat = cudaMemcpy(pair_t_qv%qv(:,:,:), D_pair_t_qv%qv(:,:,:), size(pair_t_qv%qv(:,:,:) ) )
      istat = cudaMemcpy(pair_t_qv%t(:,:,:), D_pair_t_qv%t(:,:,:), size(pair_t_qv%t(:,:,:) ) )
      !pair_t_qv = D_pair_t_qv

    end subroutine kw_2c

attributes(global) subroutine kernel_2c(pair_t_qv, n)
implicit none
integer*4 :: I, J, K, n
type (D_pair) :: pair_t_qv

         I = (blockIdx%x-1) * blockDim%x + threadIdx%x
         J = (blockIdx%y-1) * blockDim%y + threadIdx%y
         K = (blockIdx%z-1) * blockDim%z + threadIdx%z

         if(I .ge. 1 .and. I .le. n .and.  J .ge. 1 .and. J .le. n .and. K .ge. 1 .and. K .le. n ) then
                pair_t_qv%qv(I,J,K) = I+J+K
                pair_t_qv%t(I,J,K) = I*J*K

  end subroutine kernel_2c

 END MODULE m_fields
 program test_kernel_device
           !@cuf use cudafor
           use m_fields

           !type(cudaExtent) ::  VS
           !VS = cudaExtent(n,n,n)

           ALLOCATE ( pair_t_qv%t(n,n,n) )
           ALLOCATE ( pair_t_qv%qv(n,n,n) )
           cudaMalloc( D_pair_t_qv%t, n*n*n*sizeof(real))
           cudaMalloc( D_pair_t_qv%qv, n*n*n*sizeof(real))

           !cudaMalloc3D( D_pair_t_qv%t, VS)
           !cudaMalloc3D( D_pair_t_qv%qv, VS)

           !both allocable array or array pointer work

           !!$cuf kernel do(3) <<<*,*>>> 
           call kw_2c( pair_t_qv, D_pair_t_qv,  n )

           DO j=1, n
                    DO i=1, n
                            DO k=1, n
                               if( pair_t_qv%t(i,j,k) .ne.  i+j+k ) then
                                  print *, i,j,k, "error"

           print*, pair_t_qv%t(n,n,n)
           print*, pair_t_qv%qv(n,n,n)


   end program

When trying to compile with
TCUF# nvfortran -g -pg -Mlarge_arrays -m64 -Wall -Werror -gpu=ccall,managed,implicitsections -stdpar -traceback -Minfo=accel -cpp -cuda -o 2c 2c.f90

I got:
NVFORTRAN-S-0034-Syntax error at or near end of line (2c.f90: 87)
NVFORTRAN-S-0034-Syntax error at or near end of line (2c.f90: 88)
0 inform, 0 warnings, 2 severes, 0 fatal for test_kernel_device


This error is because cudaMalloc is a function so you need to capture the return value.

Though in general using derived types with dynamic data members is very difficult to offload to the device. Given device memory is allocated using host side called, you’d need to access the device type from the host in order to allocate the members, which of course would cause a host side segfault. There’s ways to do it, but it means manipulating underlying pointers and using temps.

You’re better off making the host type or members “managed” so the CUDA runtime manages the data movement.

Another option is to use OpenACC for the data management and perform a manual deep copy.


Super thanks, I will try managed data type.