How to use derived type in cuda fortran kernel

honggangwang1979 · January 15, 2024, 4:42am

Hi experts, I have to use derived types in the kernel due to the structure of the current code. Below is a minimum repeatable case of what I am doing but cannot be compiled:

MODULE m_fields
    !@cuf use cudafor
    integer :: n = 100
    type pair
       REAL*8,  ALLOCATABLE :: qv(:,:,:)
       REAL*8,  ALLOCATABLE :: t(:,:,:)
    end type
    type (pair)  :: pair_t_qv

    type D_pair
       REAL*8,  device, ALLOCATABLE :: qv(:,:,:)
       REAL*8,  device, ALLOCATABLE :: t(:,:,:)
    end type
    type (D_pair), device   :: D_pair_t_qv

     !type (pair), managed, allocatable, target  :: pair_t_qv(:)
     !!@cuf attributes(managed) :: pair_t_qv

     contains 

     subroutine kw_2c( pair_t_qv, D_pair_t_qv,  n )
             !@cuf use cudafor
          implicit none

          integer, value :: n

          type (pair) :: pair_t_qv 
          type (D_pair), device   :: D_pair_t_qv

          integer :: I,J,K,istat

        integer*4 ::  blockDim_x=8, blockDim_y=8, blockDim_z=8
       integer*4 :: gridDim_x, gridDim_y, gridDim_z

       integer, device :: d_n

integer, device :: d_n

       type(dim3) :: grid, block

       !istat=cudaMemset(pair_t_qv, 0._EB, size(pair_t_qv) )

      gridDim_x = (n+blockDim_x-1)/blockDim_x
      gridDim_y = (n+blockDim_y-1)/blockDim_y
      gridDim_z = (n+blockDim_z-1)/blockDim_z
      grid = dim3(gridDim_x, gridDim_y,gridDim_z)
      block= dim3(blockDim_x, blockDim_y,blockDim_z)

      d_n = n
      !D_pair_t_qv = pair_t_qv
      istat = cudaMemcpy(D_pair_t_qv%qv(:,:,:), pair_t_qv%qv(:,:,:), size(pair_t_qv%qv(:,:,:) ) )
      istat = cudaMemcpy(D_pair_t_qv%t(:,:,:), pair_t_qv%t(:,:,:), size(pair_t_qv%t(:,:,:) ) )
      call kernel_2c<<<grid,block,0,cudaforGetDefaultStream()>>>(D_pair_t_qv,d_n)
      istat = cudaDeviceSynchronize()

      istat = cudaMemcpy(pair_t_qv%qv(:,:,:), D_pair_t_qv%qv(:,:,:), size(pair_t_qv%qv(:,:,:) ) )
      istat = cudaMemcpy(pair_t_qv%t(:,:,:), D_pair_t_qv%t(:,:,:), size(pair_t_qv%t(:,:,:) ) )
      !pair_t_qv = D_pair_t_qv

    end subroutine kw_2c

attributes(global) subroutine kernel_2c(pair_t_qv, n)
implicit none
integer*4 :: I, J, K, n
type (D_pair) :: pair_t_qv

         I = (blockIdx%x-1) * blockDim%x + threadIdx%x
         J = (blockIdx%y-1) * blockDim%y + threadIdx%y
         K = (blockIdx%z-1) * blockDim%z + threadIdx%z

         if(I .ge. 1 .and. I .le. n .and.  J .ge. 1 .and. J .le. n .and. K .ge. 1 .and. K .le. n ) then
                pair_t_qv%qv(I,J,K) = I+J+K
                pair_t_qv%t(I,J,K) = I*J*K
         endif

  end subroutine kernel_2c

 END MODULE m_fields
 program test_kernel_device
           !@cuf use cudafor
           use m_fields

           !type(cudaExtent) ::  VS
           !VS = cudaExtent(n,n,n)

           ALLOCATE ( pair_t_qv%t(n,n,n) )
           ALLOCATE ( pair_t_qv%qv(n,n,n) )
           cudaMalloc( D_pair_t_qv%t, n*n*n*sizeof(real))
           cudaMalloc( D_pair_t_qv%qv, n*n*n*sizeof(real))

           !cudaMalloc3D( D_pair_t_qv%t, VS)
           !cudaMalloc3D( D_pair_t_qv%qv, VS)

           !both allocable array or array pointer work

           !!$cuf kernel do(3) <<<*,*>>> 
           call kw_2c( pair_t_qv, D_pair_t_qv,  n )

           DO j=1, n
                    DO i=1, n
                            DO k=1, n
                               if( pair_t_qv%t(i,j,k) .ne.  i+j+k ) then
                                  print *, i,j,k, "error"
                                  exit
                               endif
                            ENDDO
                    ENDDO
           ENDDO


           print*, pair_t_qv%t(n,n,n)
           print*, pair_t_qv%qv(n,n,n)

           deallocate(pair_t_qv%t)
           deallocate(pair_t_qv%qv)

   end program

When trying to compile with
TCUF# nvfortran -g -pg -Mlarge_arrays -m64 -Wall -Werror -gpu=ccall,managed,implicitsections -stdpar -traceback -Minfo=accel -cpp -cuda -o 2c 2c.f90

I got:
NVFORTRAN-S-0034-Syntax error at or near end of line (2c.f90: 87)
NVFORTRAN-S-0034-Syntax error at or near end of line (2c.f90: 88)
0 inform, 0 warnings, 2 severes, 0 fatal for test_kernel_device

Thanks.

MatColgrove · January 16, 2024, 4:00pm

This error is because cudaMalloc is a function so you need to capture the return value.

Though in general using derived types with dynamic data members is very difficult to offload to the device. Given device memory is allocated using host side called, you’d need to access the device type from the host in order to allocate the members, which of course would cause a host side segfault. There’s ways to do it, but it means manipulating underlying pointers and using temps.

You’re better off making the host type or members “managed” so the CUDA runtime manages the data movement.

Another option is to use OpenACC for the data management and perform a manual deep copy.

-Mat

honggangwang1979 · January 16, 2024, 4:43pm

Super thanks, I will try managed data type.

Topic		Replies	Views
How to cudaMemcpy and use nested drived type data in CUDA fortrann nvc, nvc++ and nvfortran	6	336	February 29, 2024
How to use managed nested drived type in CUDA Fortran nvc, nvc++ and nvfortran	4	204	March 6, 2024
Trying to use device array as a derived type member in a kernel nvc, nvc++ and nvfortran	4	460	November 18, 2020
What is the correct way of working with arrays of derived types with allocatable components? nvc, nvc++ and nvfortran	7	42	June 27, 2025
member of derived type in CUF kernels Legacy PGI Compilers	2	2475	May 24, 2019
how to use the derived type in kernel subroutine Legacy PGI Compilers	5	2036	January 16, 2024
A runtime error during use of cusparseDgtsv2 function in CUDA Fortran nvc, nvc++ and nvfortran cuda	8	433	October 19, 2023
cublasDmatinvBatched typemismatch nvc, nvc++ and nvfortran	6	993	October 28, 2021
Device-to-device transfers involving components of derived types Legacy PGI Compilers	9	2706	June 12, 2020
Calling CUDA C from fortran CUDA Programming and Performance	4	949	December 4, 2021

How to use derived type in cuda fortran kernel

Related topics