Hi,
in my code I declare two device allocatable array in a module, and then use them as target of two c_f_pointer calls in a host subroutine.
The compiler returns the internal error:
PGF90-S-0000-Internal compiler error. bld_lhs, ast nyd 716 (save_soln_cuda_module.CUF: 307)
PGF90-S-0000-Internal compiler error. bld_lhs, ast nyd 725 (save_soln_cuda_module.CUF: 307)
… and other similar ones
The compiler I am using is the 10.9 version.
Here is my code
module save_soln_cuda_module
type varSizes_save_soln
integer(4) :: parg0Size
integer(4) :: parg1Size
end type varSizes_save_soln
! logical that tells if the input data to the kernel has been already generated
! by previous calls to this same op_par_loop function
logical :: isKernelInputDataGenerated = .false.
! sizes of input arguments to CUDA kernel
type(varSizes_save_soln), device :: argSizes
real(8), dimension(:), allocatable, device :: argument0
real(8), dimension(:), allocatable, device :: argument1
! input data to CUDA kernel
! declared here to make them static variables (i.e. state that survives successive op_par_loop function calls)
integer(4) :: data0Size, data1Size
contains
! subroutine called by the kernel (modified by eliminating c void pointers)
attributes(device) subroutine save_soln ( q, qold )
implicit none
! declaration of formal parameters
real(8), device :: q(4)
real(8), device :: qold(4)
! iteration variable
integer(4) :: i
! size_q and size_qold are the same value
do i = 1, 4
qold(i) = q(i)
end do
end subroutine save_soln
! kernel function
attributes(global) subroutine op_cuda_save_soln ( argSizes, parg0, parg1, offsetS, setsize, warpSizeOP2 )
use cudafor
implicit none
! declaration of formal parameters
type(varSizes_save_soln), device :: argSizes
real(8), dimension(0:argSizes%parg0Size-1), device :: parg0
real(8), dimension(0:argSizes%parg1Size-1), device :: parg1
integer(4), value :: offsetS
integer(4), value :: setsize
integer(4), value :: warpSizeOP2
real(8), dimension(0:3) :: arg0_l
real(8), dimension(0:3) :: arg1_l
integer(4) :: tid, offset, nelems, n, m
! automatic shared memory
real(8), shared :: autoshared(0:*)
integer(4) :: argSDisplacement
tid = mod ( (threadidx%x)-1, warpSizeOP2 )
! remember that:
! char *arg_s = shared + offset_s*(threadIdx.x/OP_WARPSIZE);
! / 8 is because our autoshared variable is a real(8) variable
argSDisplacement = (offsetS * ( (threadidx%x-1) / warpSizeOP2 )) / 8
! process set elements
! implements:
! for (int n=threadIdx.x+blockIdx.x*blockDim.x;
! n<set_size; n+=blockDim.x*gridDim.x) {
n = (threadidx%x-1) + (blockidx%x-1) * blockdim%x
do while ( n .lt. setsize )
! implements:
! int offset = n - tid;
offset = n - tid
! implements:
! int nelems = MIN(OP_WARPSIZE,set_size-offset);
nelems = min ( warpSizeOP2, (setSize - offset) )
! copy data into shared memory, then into local
!for (int m=0; m<4; m++)
! ((double *)arg_s)[tid+m*nelems] = arg0[tid+m*nelems+offset*4];
do m = 0, 3
! autoshared ( argSDisplacement ) = arg_s
! 4 is the dimension of argument 0 in this op_par_loop call ! argSDisplacement
autoshared ( argSDisplacement + ( tid + m * nelems ) ) = parg0 ( tid + m * nelems + offset * 4 )
end do
! for (int m=0; m<4; m++)
! arg0_l[m] = ((double *)arg_s)[m+tid*4];
do m = 0, 3
! autoshared ( argSDisplacement ) = arg_s
! 4 is the dimension of argument 0 in this op_par_loop call
arg0_l(m) = autoshared ( argSDisplacement + ( m + tid * 4 ) )
end do
! user-supplied kernel call
! implements:
! save_soln( arg0_l,
! arg1_l );
call save_soln ( arg0_l, &
& arg1_l &
& )
! copy back into shared memory, then to device
! implements:
! for (int m=0; m<4; m++)
! ((double *)arg_s)[m+tid*4] = arg1_l[m];
do m = 0, 3
! autoshared ( argSDisplacement ) = arg_s
! 4 is the dimension of argument 0 in this op_par_loop call
autoshared ( argSDisplacement + ( m + tid * 4 ) ) = arg1_l(m)
end do
! implements:
! for (int m=0; m<4; m++)
! arg1[tid+m*nelems+offset*4] = ((double *)arg_s)[tid+m*nelems];
do m = 0, 3
! autoshared ( argSDisplacement ) = arg_s
! 4 is the dimension of argument 0 in this op_par_loop call
parg1 ( tid + m * nelems + offset * 4 ) = autoshared ( argSDisplacement + ( tid + m * nelems ) )
end do
n = n + blockdim%x * griddim%x
end do
end subroutine op_cuda_save_soln
! caller of the kernel
attributes(host) function op_par_loop_save_soln ( subroutineName, set, &
& arg0, idx0, ptr0, access0, &
& arg1, idx1, ptr1, access1 &
& )
! use directives
use, intrinsic :: ISO_C_BINDING
use cudafor
! mandatory
implicit none
! declaration of intrinsic functions
intrinsic int, max
type(profInfo) :: op_par_loop_save_soln
! formal arguments
character, dimension(*), intent(in) :: subroutineName
! data set on which we loop
type(op_set), intent(in) :: set
! data ids used in the function
type(op_dat) :: arg0, arg1
! index to be used in first and second pointers
integer(4), intent(in) :: idx0, idx1
! ptr ids for indirect access to data
type(op_map) :: ptr0, ptr1
! access values for arguments
integer(4), intent(in) :: access0, access1
! local variables
! used for mallocs and memcpys
! integer(4) :: data0Size, data1Size
! define and compute grid and block sizes and other variables (unused in this case)
! real(8), dimension(:), allocatable, device :: argument0
! real(8), dimension(:), allocatable, device :: argument1
! type(varSizes_save_soln), device :: argSizes
integer(4) :: nblocks = 200
integer(4) :: nthreads = 128
integer(4) :: nshared = 0
integer(4) :: offsetS = 0
integer(4) :: warpSizeOP2
integer(4) :: threadSynchRet
! profiling
integer :: istat
type (cudaEvent) :: startKernelTime, stopKernelTime, startHostTime, stopHostTime
real(4) :: tmpHostTime
! create events
istat = cudaEventCreate(startKernelTime)
istat = cudaEventCreate(stopKernelTime)
istat = cudaEventCreate(startHostTime)
istat = cudaEventCreate(stopHostTime)
istat = cudaEventRecord ( startHostTime, 0 )
warpSizeOP2 = OP_WARP_SIZE
! this is mandatory, otherwise nshared will become 4096 from the previous invocation!!
nshared = 0
! work out shared memory requirements per element
nshared = max ( nshared, 8 * 4 ) ! 8 = sizeof(double) => real(8)
nshared = max ( nshared, 8 * 4 ) ! 8 = sizeof(double) => real(8)
offsetS = nshared * OP_WARP_SIZE
nshared = nshared * nthreads
if ( isKernelInputDataGenerated .eq. .false. ) then
data0Size = ( arg0%dim * arg0%set%size)
data1Size = ( arg1%dim * arg1%set%size)
call c_f_pointer ( arg0%dat_d, argument0, (/data0Size/) )
call c_f_pointer ( arg1%dat_d, argument1, (/data1Size/) )
argSizes%parg0Size = data0Size
argSizes%parg1Size = data1Size
isKernelInputDataGenerated = .true.
end if
istat = cudaEventRecord ( stopHostTime, 0 )
istat = cudaEventSynchronize ( stopHostTime )
istat = cudaEventElapsedTime ( tmpHostTime, startHostTime, stopHostTime )
op_par_loop_save_soln%hostTime = 0
op_par_loop_save_soln%hostTime = op_par_loop_save_soln%hostTime + tmpHostTime
tmpHostTime = 0
istat = cudaEventRecord ( startKernelTime, 0 )
! apply kernel to all set elements
call op_cuda_save_soln<<<nblocks,nthreads,nshared>>> ( argSizes, &
& argument0, &
& argument1, &
& offsetS, &
& set%size, &
& warpSizeOP2 &
& )
! synchronise threads after kernel call
threadSynchRet = cudaThreadSynchronize()
istat = cudaEventRecord ( stopKernelTime, 0 )
istat = cudaEventSynchronize ( stopKernelTime )
istat = cudaEventElapsedTime ( op_par_loop_save_soln%kernelTime, startKernelTime, stopKernelTime )
istat = cudaEventRecord ( startHostTime, 0 )
! empty code here...only if there is a reduction it is filled up with something
istat = cudaEventRecord ( stopHostTime, 0 )
istat = cudaEventSynchronize ( stopHostTime )
istat = cudaEventElapsedTime ( tmpHostTime, startHostTime, stopHostTime )
op_par_loop_save_soln%hostTime = op_par_loop_save_soln%hostTime + tmpHostTime
end function op_par_loop_save_soln
end module save_soln_cuda_module
Any idea of what is going on?