try to allocate 2D array of derived type

Hi guys,

I tried to allocate a 2D array of a derived type in device code, but the compiler shows an error as below:
Error: /tmp/pgaccgGagsYMg5wu4.gpu (912, 92): parse use of undefined value ‘@basesmodule$basisfunctiontype$td’.

The device code is

MODULE BasesModule
!
!...Defines the container type for basis-function values at one
!...quadrature point (BasisFunctionType) and the device routine
!...ComputeBases that fills a 2D array of them for one element.
!
  USE ConstantsModule,  ONLY : Complex_j, k_z, xHat, yHat, zHat
  USE ElementModule,    ONLY : ElementType
  USE VectorDeviceModule
!
  IMPLICIT NONE
!
  PRIVATE
  PUBLIC :: BasisFunctionType,ComputeBases
!
!...RealVectorType and ComplexVectorType (components x, y, z) are not
!...declared here; they are expected to come from VectorDeviceModule.
!
!...Values of a single basis function at a single quadrature point:
!...  LambdaDivConform  - div-conforming basis vector
!...  LambdaCurlConform - curl-conforming basis vector
!...  DivLambda         - divergence of the div-conforming basis
!...  CurlLambda        - curl of the curl-conforming basis
!
  TYPE BasisFunctionType
    TYPE(RealVectorType)          :: LambdaDivConform
    TYPE(RealVectorType)          :: LambdaCurlConform
    COMPLEX(8)                    :: DivLambda
    TYPE(ComplexVectorType)       :: CurlLambda
  END TYPE BasisFunctionType
!
CONTAINS
!
!====================================================================
!====================================================================
!
attributes(device) SUBROUTINE ComputeBases(Element,i_kind,NumLocalDoFs,NumberQuadPts,xsi,Basis,HarmonicDerivSign)
!
!...(Re)allocates Basis to shape (NumLocalDoFs,NumberQuadPts) and, for
!...every quadrature point k = 1..NumberQuadPts, fills Basis(1,k) and
!...Basis(2,k) with the curl-conforming basis, its curl, the
!...div-conforming basis and its divergence.
!
!...Only one combination is implemented:
!...  Element%ElementKind                      = "strip" or "wire"
!...  Element%Unknown(i_kind)%BasisOrder       = 0
!...  Element%Unknown(i_kind)%EquationEnforced = 'efie'
!...Any other combination prints a message and executes STOP.
!
!...Arguments
!...  Element           (in)    element; ElementKind, Element_id,
!...                            Unknown(i_kind) and GeomParameters are read
!...  i_kind            (in)    index into Element%Unknown
!...  NumLocalDoFs      (in)    first extent of Basis; rows 1 and 2 are
!...                            written, so it has to be at least 2
!...  NumberQuadPts     (in)    second extent of Basis and loop trip count
!...  xsi               (in)    xsi(1,k) and xsi(2,k) are read for each k
!...  Basis             (inout) deallocated if already allocated, then
!...                            allocated and filled by this routine
!...  HarmonicDerivSign (in)    not referenced by this routine
!
!...NOTE(review): the ALLOCATE/DEALLOCATE below execute in device code,
!...and the WRITE statements inside the loop look like debugging output.
!
  USE VectorDeviceModule

  IMPLICIT NONE
!
  TYPE(ElementType), INTENT(IN)                        ::  Element
  TYPE(BasisFunctionType), ALLOCATABLE, INTENT(INOUT)  ::  Basis(:,:)
  REAL(8), INTENT(IN)                                  ::  xsi(:,:)
  INTEGER, INTENT(IN)                                  ::  NumberQuadPts, NumLocalDoFs &
                                                          ,i_kind
  REAL(8), INTENT(IN)                                  ::  HarmonicDerivSign
  INTEGER                                              ::  k
  TYPE(RealVectorType)                                 ::  v1temp, v2temp
  REAL(8)                                              ::  r1temp
!
!...Start from a fresh allocation of the requested shape
!
  IF(ALLOCATED(Basis)) DEALLOCATE(Basis)
  ALLOCATE(Basis(NumLocalDoFs,NumberQuadPts))
!
!...Loop over the number of quadrature points
!
  DO k = 1, NumberQuadPts
    WRITE(*,*) NumberQuadPts
    WRITE(*,*) 'element id: ', Element%Element_id
!
   SELECT CASE(Element%ElementKind)
!
   CASE("strip","wire")

       SELECT CASE(Element%Unknown(i_kind)%BasisOrder  )
!
         CASE(0)
!
          SELECT CASE(Element%Unknown(i_kind)%EquationEnforced )
!
          CASE ('efie')
!
!...Compute curl-conforming bases and their curl
!
          WRITE(*,*) 'multiply starts: ', xsi(1,k), zHat%z, size(Basis)
          Basis(1,k)%LambdaCurlConform = r_scalar_times_r_vector_device(xsi(1,k) , zHat )
          WRITE(*,*) 'multiply is finished'
          Basis(2,k)%LambdaCurlConform = xsi(2,k) * zHat
          WRITE(*,*) 'element id: ', Basis(2,k)%LambdaCurlConform%x
!
          v1temp = Element%GeomParameters%HeightVect_h(1)
          v2temp = zHat .CROSSD. v1temp
          Basis(1,k)%CurlLambda  =  r_vector_divided_by_r_scalar_device( &
                                       v2temp , Element%GeomParameters%MagHeightVect_h(1)**2)
!
          v1temp = Element%GeomParameters%HeightVect_h(2)
          v2temp = zHat .CROSSD. v1temp
          r1temp = Element%GeomParameters%MagHeightVect_h(2)
          Basis(2,k)%CurlLambda  =  r_vector_divided_by_r_scalar_device( v2temp , (r1temp)**2 )
!
!...Compute div-conforming bases and their divergence
!
          v1temp = Element%GeomParameters%EdgeVect_l(1)
          Basis(1,k)%LambdaDivConform  =  r_vector_divided_by_r_scalar_device( &
                                       xsi(1,k) * v1temp, Element%GeomParameters%MagHeightVect_h(1) )
!
          v1temp = Element%GeomParameters%EdgeVect_l(2)
          Basis(2,k)%LambdaDivConform  =  r_vector_divided_by_r_scalar_device( &
                                       xsi(2,k) * v1temp, Element%GeomParameters%MagHeightVect_h(2) )
!
            Basis(1,k)%DivLambda = 1./Element%GeomParameters%MagHeightVect_h(1)
!
            Basis(2,k)%DivLambda = 1./Element%GeomParameters%MagHeightVect_h(2)

            WRITE(*,*) 'basis divLambda: ', Basis(2,k)%DivLambda

!
          CASE DEFAULT
!
!...Equation type other than 'efie': report and stop
!
               PRINT*, Element%ElementKind
               PRINT*, 'Equation type ' &
               ,Element%Unknown(i_kind)%EquationEnforced,' not supported in ComputeBases.'
               STOP

          END SELECT
!
       CASE DEFAULT
!
!...Basis order other than 0: report and stop
!
          PRINT*, 'Basis order',Element%Unknown(i_kind)%BasisOrder &
                ,' not supported in ComputeBases. '
          STOP

       END SELECT

   CASE DEFAULT
!
!...Element kind other than "strip"/"wire": report and stop
!
          PRINT*, 'Element kind ' &
            ,Element%ElementKind,' not supported in ComputeBases. '
          STOP
   END SELECT
!
  END DO

  END SUBROUTINE  ComputeBases
!
!====================================================================
!====================================================================
!
END MODULE BasesModule

The BasisFunctionType is

! Values of one basis function at one quadrature point.
! RealVectorType and ComplexVectorType are declared in VectorDeviceModule.
TYPE BasisFunctionType
    ! Div-conforming basis vector
    TYPE(RealVectorType)          :: LambdaDivConform
    ! Curl-conforming basis vector
    TYPE(RealVectorType)          :: LambdaCurlConform
    ! Divergence of the div-conforming basis
    COMPLEX(8)                       :: DivLambda
    ! Curl of the curl-conforming basis
    TYPE(ComplexVectorType)       :: CurlLambda
  END TYPE BasisFunctionType

The RealVectorType is defined in VectorDeviceModule

! Three-component real vector: x, y and z are REAL(8) scalars declared
! with the CUDA Fortran "managed" attribute.
! NOTE(review): these are non-allocatable scalar components; the reply in
! this thread suspects that "managed" scalars inside an array allocated in
! device code are what breaks code generation - confirm against the
! CUDA Fortran rules for the managed attribute.
TYPE RealVectorType
         REAL(8), managed :: x, y, z
END TYPE RealVectorType

The ComplexVectorType is defined in VectorDeviceModule

! Three-component complex vector: x, y and z are COMPLEX(8) scalars
! declared with the CUDA Fortran "managed" attribute.
! NOTE(review): same concern as for RealVectorType - "managed" on
! non-allocatable scalar components is the suspected cause of the device
! ALLOCATE error discussed in this thread; confirm before relying on it.
TYPE ComplexVectorType
         COMPLEX(8), managed :: x, y, z
END TYPE ComplexVectorType

The problem occurs in the ALLOCATE statement, where I want to allocate the array on the device:

ALLOCATE(Basis(NumLocalDoFs,NumberQuadPts))

What’s the meaning of the error? How can I fix the error so that I can access and initialize the Basis array?

The compiler I’m using is: pgf90 19.10-0 LLVM 64-bit target on Ubuntu18.04 64bit LTS.

Thank you,

Shubin

Hi Shubin,

The error is a compiler device code generation issue. Without a full reproducing example I can’t tell for sure what’s wrong, but it does look like a problem with the allocate. My best guess is that since “managed” can only be used on host side allocatable variables, having managed scalars in an array allocated on the device would be problematic.

While technically legal, it’s highly discouraged to do device side allocation. Besides severely limiting performance since allocations get serialized, the device has a very small heap (~8MB). Unless you limit the number of threads, even if you were able to compile, you’d most likely overflow the heap and get a runtime error. Also keep in mind that allocation here is on a per thread basis, so every thread will be allocating its own copy of “Basis”. Not sure this is what you’re intending.

If you can get me a small reproducing example, I can then determine the exact problem, but I would encourage you to rethink your algorithm and not use device side allocation.

Hope this helps,
Mat

Hi Mat,

Thank you for your help. It took me a while to implement your suggestions. Anyway, I solved the problem I posted by using managed memory as you suggested. And yes, every thread should have its own “basis”.

Now I have another problem, shown below. I used nvprof to profile the compiled executable. The output is

==418== Profiling application: ./EMPACK_PGI_CUDA_freespace
==418== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   97.48%  400.35ms         1  400.35ms  400.35ms  400.35ms  elementmatrixmodule_fillelementmatrixglobal_
                    1.66%  6.8123ms         1  6.8123ms  6.8123ms  6.8123ms  sysmatrixmodule_fillsystemmatrixglobal_
                    0.85%  3.4939ms         1  3.4939ms  3.4939ms  3.4939ms  quadptsassignmentmodule_baseassignment_
                    0.01%  30.309us        33     918ns     544ns  1.5680us  [CUDA memcpy HtoD]
                    0.00%  6.6570us         8     832ns     640ns  1.1850us  [CUDA memcpy DtoH]
      API calls:   97.04%  23.4286s   1757508  13.330us  4.6920us  20.411ms  cudaMallocManaged
                    1.71%  411.93ms         8  51.491ms  45.236us  400.42ms  cudaFree
                    1.20%  288.76ms         2  144.38ms  1.2640us  288.76ms  cudaEventCreate
                    0.04%  9.4212ms         5  1.8842ms  9.0900us  9.3244ms  cudaMemcpy
                    0.01%  2.9050ms         3  968.32us  20.300us  2.7319ms  cudaLaunchKernel
                    0.00%  493.72us         5  98.743us  19.455us  153.65us  cudaMalloc
                    0.00%  308.95us        28  11.033us  3.7860us  26.392us  cudaMemcpyToSymbol
                    0.00%  229.28us         1  229.28us  229.28us  229.28us  cuDeviceTotalMem
                    0.00%  182.43us        96  1.9000us     238ns  74.036us  cuDeviceGetAttribute
                    0.00%  70.740us         8  8.8420us  7.0990us  13.728us  cudaMemcpyFromSymbol
                    0.00%  45.234us         1  45.234us  45.234us  45.234us  cudaEventSynchronize
                    0.00%  36.335us         2  18.167us  9.1190us  27.216us  cudaEventRecord
                    0.00%  30.776us         1  30.776us  30.776us  30.776us  cuDeviceGetName
                    0.00%  4.5740us         1  4.5740us  4.5740us  4.5740us  cuDeviceGetPCIBusId
                    0.00%  3.5840us         1  3.5840us  3.5840us  3.5840us  cudaEventElapsedTime
                    0.00%  2.5240us         3     841ns     349ns  1.7710us  cuDeviceGetCount
                    0.00%  1.1640us         2     582ns     244ns     920ns  cuDeviceGet

and

  • ==418== Unified Memory profiling result:
    Device “GeForce GTX 1080 with Max-Q Design (0)”
    Count Avg Size Min Size Max Size Total Size Total Time Name
    9769 38.889KB 4.0000KB 0.9961MB 371.0039MB 67.22339ms Host To Device
    150 86.773KB 4.0000KB 0.9922MB 12.71094MB 2.147744ms Device To Host
    1060 - - - - 313.4901ms Gpu page fault groups
    Total CPU Page faults: 1247

Do you have any suggestions to reduce the time to call “cudaMallocManaged” since it takes 97% of the total execution time?

Best regards,
Shubin

Hi Shubin,

Calls to cudaMallocManaged do have more overhead than cudaMalloc, but not too much. The problem here is the sheer volume of calls, 1.7 million of them. Not sure why you have so many. Are you allocating each individual vector type?

I can’t really offer any suggestions without a reproducing example, but I’d look at why you have so many calls.

-Mat

Hi Mat,

The numerous allocations are shown below

! Host-side setup: allocate the two NumberOfElements x NumberOfElements
! containers, then allocate every allocatable component of every
! (iEle,jEle) entry.
! Each pair performs 7 ALLOCATE statements (9 when NearFlag is set), so the
! number of allocations grows as NumberOfElements**2; with 500 elements
! that is at least 500*500*7 = 1,750,000.
! NOTE(review): the components are declared "managed", so presumably each
! ALLOCATE here becomes one cudaMallocManaged call, which would match the
! ~1.76 million calls in the profile - confirm.
ALLOCATE(BasesGlobal(NumberOfElements,NumberOfElements))
            ALLOCATE(ElementMatrixGroup(NumberOfElements,NumberOfElements))
            DO iEle=1,NumberOfElements
              DO jEle=1,NumberOfElements
                ! Basis values, 2 rows per quadrature point, for test and source quadrature
                ALLOCATE(BasesGlobal(iEle,jEle)%TestBases(2,EleMatQuads(iEle,jEle)%NTestQuadPts))
                ALLOCATE(BasesGlobal(iEle,jEle)%SourceBases(2,EleMatQuads(iEle,jEle)%NSrcQuadPts))
                ! Quadrature-point arrays (2 x number of points) and matching weight arrays
                ALLOCATE(BasesGlobal(iEle,jEle)%xsiTest(2,EleMatQuads(iEle,jEle)%NTestQuadPts))
                ALLOCATE(BasesGlobal(iEle,jEle)%xsiSource(2,EleMatQuads(iEle,jEle)%NSrcQuadPts))
                ALLOCATE(BasesGlobal(iEle,jEle)%wghtTest(EleMatQuads(iEle,jEle)%NTestQuadPts))
                ALLOCATE(BasesGlobal(iEle,jEle)%wghtSource(EleMatQuads(iEle,jEle)%NSrcQuadPts))
                ! Extra fixed-size (7-point) arrays, only for pairs flagged NearFlag
                IF(EleMatQuads(iEle,jEle)%NearFlag) THEN
                  ALLOCATE(BasesGlobal(iEle,jEle)%xsiTempSing(2,7))
                  ALLOCATE(BasesGlobal(iEle,jEle)%WghtTempSing(7))
                ENDIF

                ! Element matrix sized by the local DoF counts of the two elements
                ALLOCATE(ElementMatrixGroup(iEle, jEle)%ElementMat( &
                            Element(iEle)%Unknown(1)%NumberLocalDoFs,Element(jEle)%Unknown(1)%NumberLocalDoFs))
              ENDDO
            ENDDO

where ‘BasesGlobal’ and ‘ElementMatrixGroup’ are defined as derived types:

! Per element-pair storage for basis values and quadrature data.
! Every component is an allocatable array with the "managed" attribute and
! is allocated individually for each (iEle,jEle) entry of BasesGlobal.
TYPE BasisFunctionTypeGroup
    ! Basis values at the test / source quadrature points
    TYPE(BasisFunctionType), allocatable,managed :: TestBases(:,:)
    TYPE(BasisFunctionType), allocatable,managed :: SourceBases(:,:)
    ! Test quadrature: point array and weights
    REAL(8), ALLOCATABLE,managed                 :: xsiTest(:,:)
    REAL(8), ALLOCATABLE,managed                 :: wghtTest(:)
    ! Source quadrature: point array and weights
    REAL(8), ALLOCATABLE,managed                 :: xsiSource(:,:)
    REAL(8), ALLOCATABLE,managed                 :: wghtSource(:)
    ! Allocated only when the pair's NearFlag is set (shapes (2,7) and (7))
    REAL(8), ALLOCATABLE,managed                 :: xsiTempSing(:,:)
    REAL(8), ALLOCATABLE,managed                 :: WghtTempSing(:)
  END TYPE BasisFunctionTypeGroup

  ! One entry per (iEle,jEle) element pair
  TYPE(BasisFunctionTypeGroup), ALLOCATABLE, managed :: BasesGlobal(:,:)

and

! Wrapper around one complex element matrix; the matrix is an allocatable
! "managed" array sized per element pair from the NumberLocalDoFs of the
! two elements.
TYPE ElementMatrixType
     COMPLEX(8), ALLOCATABLE, managed :: ElementMat(:,:)
   END TYPE ElementMatrixType

   ! One entry per (iEle,jEle) element pair
   TYPE(ElementMatrixType), ALLOCATABLE, managed :: ElementMatrixGroup(:,:)

.

The shape of the derived type and the shape of the structure members are determined at the run time. If ‘NumberOfElements=500’, then the allocation is huge. Perhaps there is a way to avoid the numerous calls, but I need a hint. Thanks.

Regards,
Shubin