I have the code below, which compiles and works fine without -Minline or -Minline=reshape, but setting either causes the compiler to crash with the following message:
pgf90 -O3 -c -Mcuda=cc60,fastmath,ptxinfo test.CUF -o test.o -Minline -Minfo=all
op_cuda_testme:
118, testme_gpu inlined, size=5, file test.CUF (21)
nvvmCompileProgram error: 9.
Error: /tmp/pgaccos6bQQfYX-Wa.gpu (315, 89): parse pointers to void are invalid - use i8* instead
The issue has to do with the device allocatable array declared in the separate module. Is there a workaround for this? (Aside from not using -Minline, of course; inlining is important for performance.)
The full code:
! Holds the device-resident array that is read by the device
! routines of other modules through use association.
module HYDRA_CONST_MODULE
    use cudafor
    implicit none

    ! Rank-1 double precision array with the DEVICE attribute.
    ! It is allocatable, so it must be allocated before any kernel reads it.
    real(kind=8), device, allocatable :: r22z_OP2CONSTANT(:)
end module HYDRA_CONST_MODULE
! Device code for the testme loop: the per-element device routine
! testme_gpu and the global kernel op_cuda_testme that drives it.
! This file must be run through the preprocessor because of the
! OP2_SOA macro defined below.
MODULE KERNELS_testme_MODULE
    USE CUDAFOR
    IMPLICIT NONE

    ! testme variable declarations
    ! Stride between consecutive components of one element; declared
    ! with the CONSTANT attribute and read through the macro below.
    INTEGER(kind=4), CONSTANT :: opDat1_stride_OP2CONSTANT

! Maps component number dim (1-based) of var to var((dim-1)*stride+1).
#define OP2_SOA(var,dim,stride) var((dim-1)*stride+1)

CONTAINS

    ! Scale the second component of qo2 in place.
    !
    ! qo2 : assumed-size array; only the entry selected by the macro for
    !       component 2 is read and written.
    ! mz1 : index into r22z_OP2CONSTANT selecting the scale factor.
    !       No bounds check is performed here.
    !
    ! r22z_OP2CONSTANT is the device allocatable array of
    ! HYDRA_CONST_MODULE, reached by use association.
    attributes(device) subroutine testme_gpu(qo2, mz1)
        use HYDRA_CONST_MODULE, only: r22z_OP2CONSTANT
        implicit none
        real(kind=8), intent(inout) :: qo2(*)
        integer(kind=4), intent(in) :: mz1
        real(kind=8) :: cr22, qp2

        cr22 = r22z_OP2CONSTANT(mz1)
        qp2 = OP2_SOA(qo2,2, opDat1_stride_OP2CONSTANT)
        OP2_SOA(qo2,2, opDat1_stride_OP2CONSTANT) = cr22*qp2
    end subroutine testme_gpu

    ! CUDA kernel function
    !
    ! Each thread block looks up one entry of pblkMap and processes the
    ! elements described by pnelems and poffset for that entry. Elements
    ! are handled one colour at a time, with a block-wide barrier between
    ! colours (presumably so that elements which update the same target
    ! entry never run in the same phase; confirm against the plan
    ! construction on the host side).
    !
    ! opDat1Devicetestme : not referenced in this kernel.
    ! opDat3Devicetestme : real data; the element at 1 + map2idx is passed
    !                      to testme_gpu and updated there.
    ! opDat5Devicetestme : integer data; the element at 1 + map1idx is
    !                      passed to testme_gpu as the index mz1.
    ! opDat1Map          : mapping table; column c of element e is stored
    !                      at 1 + e + c * setSize.
    ! pblkMap            : block ids, indexed by blockIdx%x - 1 + blockOffset.
    ! poffset            : per block id, offset of its first element.
    ! pnelems            : per block id, number of elements.
    ! pnthrcol           : per block id, number of colours.
    ! pthrcol            : per element, its colour.
    ! setSize            : stride between columns of opDat1Map.
    ! blockOffset        : offset added to the block index before the
    !                      pblkMap lookup.
    attributes(global) subroutine op_cuda_testme( &
            opDat1Devicetestme, &
            opDat3Devicetestme, &
            opDat5Devicetestme, &
            opDat1Map, &
            pblkMap, &
            poffset, &
            pnelems, &
            pnthrcol, &
            pthrcol, &
            setSize, &
            blockOffset)
        implicit none

        ! dummy arguments
        integer(kind=4), device :: opDat1Devicetestme(*)
        real(kind=8), device :: opDat3Devicetestme(*)
        integer(kind=4), device :: opDat5Devicetestme(*)
        integer(kind=4), device, intent(in) :: opDat1Map(*)
        integer(kind=4), dimension(0:*), device, intent(in) :: pblkMap
        integer(kind=4), dimension(0:*), device, intent(in) :: poffset
        integer(kind=4), dimension(0:*), device, intent(in) :: pnelems
        integer(kind=4), dimension(0:*), device, intent(in) :: pnthrcol
        integer(kind=4), dimension(0:*), device, intent(in) :: pthrcol
        integer(kind=4), value :: setSize
        integer(kind=4), value :: blockOffset

        ! per-block values, written by one thread and read by all
        integer(kind=4), shared :: blockID
        integer(kind=4), shared :: numberOfActiveThreads
        integer(kind=4), shared :: numberOfActiveThreadsCeiling
        integer(kind=4), shared :: numOfColours
        integer(kind=4), shared :: threadBlockOffset

        ! per-thread locals
        integer(kind=4) :: map1idx, map2idx
        integer(kind=4) :: colour1   ! colour currently being processed
        integer(kind=4) :: colour2   ! colour of the element owned by this thread
        integer(kind=4) :: i1        ! 0-based element index within the block

        ! The first thread of the block fetches the block description.
        if (threadIdx%x - 1 .eq. 0) then
            blockID = pblkMap(blockIdx%x - 1 + blockOffset)
            numberOfActiveThreads = pnelems(blockID)
            ! Element count rounded up to a multiple of blockDim%x, so that
            ! every thread runs the same number of outer iterations and
            ! therefore reaches every barrier below.
            numberOfActiveThreadsCeiling = blockDim%x * (1 + (numberOfActiveThreads - 1) / blockDim%x)
            numOfColours = pnthrcol(blockID)
            threadBlockOffset = poffset(blockID)
        end if
        call syncthreads()

        i1 = threadIdx%x - 1
        do while (i1 < numberOfActiveThreadsCeiling)
            colour2 = -1
            do colour1 = 0, numOfColours - 1, 1
                ! Threads past the last element skip the work but still
                ! take part in the barrier.
                if (i1 < numberOfActiveThreads) then
                    map1idx = opDat1Map(1 + i1 + threadBlockOffset + setSize * 0)
                    map2idx = opDat1Map(1 + i1 + threadBlockOffset + setSize * 1)
                    colour2 = pthrcol(i1 + threadBlockOffset)
                    if (colour2 .eq. colour1) then
                        ! kernel call
                        call testme_gpu( &
                            opDat3Devicetestme(1 + map2idx), &
                            opDat5Devicetestme(1 + map1idx))
                    end if
                end if
                ! Barrier between colours; none is issued after the last one.
                if (colour1 .ne. numOfColours - 1) then
                    call syncthreads()
                end if
            end do
            i1 = i1 + blockDim%x
        end do
    end subroutine op_cuda_testme

END MODULE KERNELS_testme_MODULE