CUF: -Minline causes internal error

I have the code below, which compiles and works fine without -Minline or -Minline=reshape, but setting either causes the compiler to crash with the following message:

pgf90 -O3 -c -Mcuda=cc60,fastmath,ptxinfo test.CUF -o test.o -Minline -Minfo=all



op_cuda_testme:
118, testme_gpu inlined, size=5, file test.CUF (21)
nvvmCompileProgram error: 9.
Error: /tmp/pgaccos6bQQfYX-Wa.gpu (315, 89): parse pointers to void are invalid - use i8* instead

The issue has to do with the device allocatable in the separate module. Is there a workaround for this? (Aside from not using -Minline, of course; inlining is important for performance.)

The full code:

! Module holding a single device array; testme_gpu in KERNELS_testme_MODULE
! brings it into scope with a USE statement and reads from it.
MODULE HYDRA_CONST_MODULE
use cudafor
! Rank-1 allocatable real*8 array declared with the DEVICE attribute.
! NOTE(review): the accompanying report attributes the -Minline failure to
! this device allocatable living in a separate module - keep the declaration
! exactly as written when reproducing the problem.
real*8, allocatable, device, dimension(:) :: r22z_OP2CONSTANT
END MODULE

! Module containing the device routine testme_gpu and the global kernel
! op_cuda_testme that calls it. The call site inside op_cuda_testme is the
! one the compiler output in the report shows being inlined.
MODULE KERNELS_testme_MODULE
USE CUDAFOR

! testme variable declarations

! Integer with the CONSTANT attribute; used below as the stride argument of
! the OP2_SOA macro.
INTEGER(kind=4), CONSTANT :: opDat1_stride_OP2CONSTANT

! OP2_SOA maps a 1-based component number dim to element (dim-1)*stride+1 of
! var, i.e. consecutive components are stride elements apart.
#define OP2_SOA(var,dim,stride) var((dim-1)*stride+1)

CONTAINS

       ! Device routine: scales one element of qo2 in place.
       !   qo2 - assumed-size real*8 array; the element selected by the
       !         OP2_SOA macro for component 2 (index
       !         opDat1_stride_OP2CONSTANT + 1) is read and overwritten.
       !   mz1 - integer*4 index into r22z_OP2CONSTANT; only read.
       attributes(device) subroutine testme_gpu(qo2,mz1)
       use HYDRA_CONST_MODULE

       implicit none

       integer*4 mz1
       real*8    qo2(*)


       real*8    cr22,qp2



         ! Fetch the scale factor from the device array of the other module.
         cr22 = r22z_OP2CONSTANT(mz1)



         ! Read component 2 of qo2, then store it back multiplied by cr22.
         qp2 = OP2_SOA(qo2,2, opDat1_stride_OP2CONSTANT)
         OP2_SOA(qo2,2, opDat1_stride_OP2CONSTANT) = cr22*qp2

       end subroutine testme_gpu





! CUDA kernel function
! Each thread walks the elements of one block in steps of blockDim%x and, for
! every colour in turn, calls testme_gpu for the elements whose entry in
! pthrcol equals that colour.
!   opDat1Devicetestme - integer device array; not referenced in this body.
!   opDat3Devicetestme - real(8) device array; the section starting at
!                        element 1 + map2idx is passed to testme_gpu as qo2.
!   opDat5Devicetestme - integer device array; element 1 + map1idx is passed
!                        to testme_gpu as mz1.
!   opDat1Map          - integer device array of map indices, read at
!                        1 + i3 + threadBlockOffset + setSize*k for k = 0, 1.
!   pblkMap, poffset, pnelems, pnthrcol - zero-based integer device arrays
!                        read once per block by the first thread.
!   pthrcol            - zero-based integer device array of element colours.
!   setSize            - by-value integer; multiplier in the opDat1Map index.
!   blockOffset        - by-value integer added to blockIdx%x - 1 when
!                        indexing pblkMap.
attributes (global) SUBROUTINE op_cuda_testme( &
  & opDat1Devicetestme, &
  & opDat3Devicetestme, &
  & opDat5Devicetestme, &
  & opDat1Map, &
  & pblkMap, &
  & poffset, &
  & pnelems, &
  & pnthrcol, &
  & pthrcol, &
  & setSize, &
  & blockOffset)

  IMPLICIT NONE

! dummy arguments, followed by shared and local variables
  INTEGER(kind=4), DEVICE :: opDat1Devicetestme(*)
  REAL(kind=8), DEVICE :: opDat3Devicetestme(*)
  INTEGER(kind=4), DEVICE :: opDat5Devicetestme(*)
  INTEGER(kind=4), DEVICE, INTENT(IN) :: opDat1Map(*)

  INTEGER(kind=4) map1idx, map2idx

  INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pblkMap
  INTEGER(kind=4), DIMENSION(0:*), DEVICE :: poffset
  INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pnelems
  INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pnthrcol
  INTEGER(kind=4), DIMENSION(0:*), DEVICE, INTENT(IN) :: pthrcol
  INTEGER(kind=4), VALUE :: blockOffset
  INTEGER(kind=4), VALUE :: setSize


  ! Per-block values declared SHARED; they are written by the first thread
  ! below and read by every thread after the syncthreads call.
  INTEGER(kind=4), SHARED :: numOfColours
  INTEGER(kind=4), SHARED :: numberOfActiveThreadsCeiling
  INTEGER(kind=4), SHARED :: blockID
  INTEGER(kind=4), SHARED :: threadBlockOffset
  INTEGER(kind=4), SHARED :: numberOfActiveThreads
  ! n1 and i2 are declared but not referenced in this body.
  INTEGER(kind=4) :: colour1
  INTEGER(kind=4) :: colour2
  INTEGER(kind=4) :: n1
  INTEGER(kind=4) :: i3
  INTEGER(kind=4) :: i1
  INTEGER(kind=4) :: i2


  ! Only the thread with threadIdx%x equal to 1 fills the shared values.
  IF (threadIdx%x - 1 .EQ. 0) THEN
    blockID = pblkMap(blockIdx%x - 1 + blockOffset)
    numberOfActiveThreads = pnelems(blockID)
    ! Element count rounded up to a multiple of blockDim%x.
    ! NOTE(review): presumably so that all threads run the same number of
    ! outer iterations and reach every syncthreads call - confirm.
    numberOfActiveThreadsCeiling = blockDim%x * (1 + (numberOfActiveThreads - 1) / blockDim%x)
    numOfColours = pnthrcol(blockID)
    threadBlockOffset = poffset(blockID)

  END IF

  ! Make the shared values visible to all threads before they are read.
  CALL syncthreads()


  ! Zero-based starting element for this thread.
  i1 = threadIdx%x - 1

  DO WHILE (i1 < numberOfActiveThreadsCeiling )
    colour2 = -1
    DO colour1 = 0, numOfColours - 1, 1
      ! Threads whose i1 lies in the padding beyond the real element count
      ! skip the work but still take part in the synchronisation below.
      IF (i1 < numberOfActiveThreads) THEN
        i3 = i1
        map1idx = opDat1Map(1 + i3 + threadBlockOffset + setSize * 0)
        map2idx = opDat1Map(1 + i3 + threadBlockOffset + setSize * 1)
        colour2 = pthrcol(i1 + threadBlockOffset)
        ! Process the element only in the pass that matches its colour.
        IF (colour2 .EQ. colour1) THEN

! kernel call
          CALL testme_gpu( &
          & opDat3Devicetestme(1 + map2idx), &
          & opDat5Devicetestme(1 + map1idx) &
          & )

        END IF
      END IF
      ! Synchronise between colour passes; skipped after the last colour.
      IF (colour1 .NE. numOfColours-1) THEN
        CALL syncthreads()
      END IF
    END DO
    ! Advance to the next element handled by this thread.
    i1 = i1 + blockDim%x
  END DO



END SUBROUTINE

END MODULE

We have replicated the issue and we have assigned it TPR 24812.
Thanks for the submission. Note that this is also not working in our
upcoming 17.9 release.

dave