CUDA Fortran - Device allocatable variable and c_f_pointer

Hi,

in my code I declare two device allocatable arrays in a module and then use them as the targets of two c_f_pointer calls in a host routine.
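
To show the pattern in isolation, here is a cut-down sketch (the module and subroutine names and the c_ptr dummy arguments are placeholders I made up just for illustration; in the real code further down, the C device pointers come from the op_dat derived type). This is exactly the construct that seems to upset the compiler:

module device_ptr_sketch

	use, intrinsic :: ISO_C_BINDING
	use cudafor

	implicit none

	! device allocatable arrays declared at module level
	real(8), dimension(:), allocatable, device :: argument0
	real(8), dimension(:), allocatable, device :: argument1

	contains

		! host routine that associates the module arrays with C pointers to device data
		subroutine associate_device_data ( dat0_d, dat1_d, size0, size1 )

			! C pointers to data already resident on the device
			! (placeholders for arg0%dat_d and arg1%dat_d in the full code)
			type(c_ptr), intent(in) :: dat0_d, dat1_d
			integer(4), intent(in) :: size0, size1

			! the two calls on which the internal error is reported
			call c_f_pointer ( dat0_d, argument0, (/ size0 /) )
			call c_f_pointer ( dat1_d, argument1, (/ size1 /) )

		end subroutine associate_device_data

end module device_ptr_sketch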

The compiler reports the following internal errors:

PGF90-S-0000-Internal compiler error. bld_lhs, ast nyd 716 (save_soln_cuda_module.CUF: 307)
PGF90-S-0000-Internal compiler error. bld_lhs, ast nyd 725 (save_soln_cuda_module.CUF: 307)
… and other similar ones

I am using compiler version 10.9.

Here is my code:

module save_soln_cuda_module

	type varSizes_save_soln

		integer(4) :: parg0Size
		integer(4) :: parg1Size

	end type varSizes_save_soln

	! logical that tells whether the input data to the kernel has already been generated
	! by previous calls to this same op_par_loop function
	logical :: isKernelInputDataGenerated = .false.

	! sizes of input arguments to CUDA kernel
	type(varSizes_save_soln), device :: argSizes


	real(8), dimension(:), allocatable, device :: argument0
	real(8), dimension(:), allocatable, device :: argument1


	! input data to CUDA kernel
	! declared here to make them static variables (i.e. state that survives successive op_par_loop function calls)
	integer(4) :: data0Size, data1Size

	contains

		! subroutine called by the kernel (modified by eliminating c void pointers)
		attributes(device) subroutine save_soln ( q, qold )

			implicit none

			! declaration of formal parameters
			real(8), device :: q(4)
			real(8), device :: qold(4)
			
			! iteration variable
			integer(4) :: i

			! size_q and size_qold are the same value
			do i = 1, 4
				qold(i) = q(i)
			end do

		end subroutine save_soln

		! kernel function
		attributes(global) subroutine op_cuda_save_soln ( argSizes, parg0, parg1, offsetS, setsize, warpSizeOP2 )

			use cudafor

			implicit none

			! declaration of formal parameters
			
			type(varSizes_save_soln), device :: argSizes

			real(8), dimension(0:argSizes%parg0Size-1), device :: parg0
			real(8), dimension(0:argSizes%parg1Size-1), device :: parg1

			integer(4), value :: offsetS
			integer(4), value :: setsize
			integer(4), value :: warpSizeOP2
			
			real(8), dimension(0:3) :: arg0_l
			real(8), dimension(0:3) :: arg1_l
			
			integer(4) :: tid, offset, nelems, n, m
				
			! automatic shared memory
			real(8), shared :: autoshared(0:*)
				
			integer(4) :: argSDisplacement

				
			tid = mod ( (threadidx%x)-1, warpSizeOP2 )
			
			! remember that:
			! char *arg_s = shared + offset_s*(threadIdx.x/OP_WARPSIZE);
			! / 8 is because our autoshared variable is a real(8) variable
			argSDisplacement = (offsetS * ( (threadidx%x-1) / warpSizeOP2 )) / 8

			! process set elements

			! implements:
			! for (int n=threadIdx.x+blockIdx.x*blockDim.x;
			!			 n<set_size; n+=blockDim.x*gridDim.x) {
			n = (threadidx%x-1) + (blockidx%x-1) * blockdim%x
			do while ( n .lt. setsize )

				! implements:
				! int offset = n - tid;
				offset = n - tid

				! implements:
				! int nelems = MIN(OP_WARPSIZE,set_size-offset);
				nelems = min ( warpSizeOP2, (setSize - offset) )

				! copy data into shared memory, then into local
																																											
				!for (int m=0; m<4; m++)
				!	((double *)arg_s)[tid+m*nelems] = arg0[tid+m*nelems+offset*4];
				do m = 0, 3

					! autoshared ( argSDisplacement ) = arg_s
					! 4 is the dimension of argument 0 in this op_par_loop call ! argSDisplacement
					autoshared ( argSDisplacement  + ( tid + m * nelems ) ) = parg0 ( tid + m * nelems + offset * 4 )
					
				end do
																		 
				! for (int m=0; m<4; m++)
				!		arg0_l[m] = ((double *)arg_s)[m+tid*4];
				do m = 0, 3
				
					! autoshared ( argSDisplacement ) = arg_s
					! 4 is the dimension of argument 0 in this op_par_loop call
					arg0_l(m) = autoshared ( argSDisplacement + ( m + tid * 4 ) )
				
				end do
			 
				! user-supplied kernel call

				! implements:
				! save_soln( arg0_l,
				!            arg1_l );
				call save_soln ( arg0_l, &
											 & arg1_l &
										 & )
											 
				! copy back into shared memory, then to device

				! implements:
				! for (int m=0; m<4; m++)
				!		((double *)arg_s)[m+tid*4] = arg1_l[m];

				do m = 0, 3
				
					! autoshared ( argSDisplacement ) = arg_s
					! 4 is the dimension of argument 0 in this op_par_loop call
					autoshared ( argSDisplacement + ( m + tid * 4 ) ) = arg1_l(m)
								
				end do


				! implements:
				! for (int m=0; m<4; m++)
				!		arg1[tid+m*nelems+offset*4] = ((double *)arg_s)[tid+m*nelems];
				do m = 0, 3
				
					! autoshared ( argSDisplacement ) = arg_s
					! 4 is the dimension of argument 0 in this op_par_loop call
					parg1 ( tid + m * nelems + offset * 4 ) = autoshared ( argSDisplacement + ( tid + m * nelems ) )
				
				end do
			 

				n = n + blockdim%x * griddim%x


			end do
			
		end subroutine op_cuda_save_soln

		! caller of the kernel
		attributes(host) function op_par_loop_save_soln ( subroutineName, set, &
																		 & arg0, idx0, ptr0, access0, &
																		 & arg1, idx1, ptr1, access1 &
																	 & )

			! use directives	
			use, intrinsic :: ISO_C_BINDING
			use cudafor

			! mandatory	
			implicit none

			! declaration of intrinsic functions	
			intrinsic int, max

			type(profInfo) :: op_par_loop_save_soln
			
			! formal arguments
			character, dimension(*), intent(in) :: subroutineName
			
			! data set on which we loop
			type(op_set), intent(in) :: set

			! data ids used in the function
			type(op_dat) :: arg0, arg1
			
			! index to be used in first and second pointers
			integer(4), intent(in) :: idx0, idx1
			
			! ptr ids for indirect access to data
			type(op_map) :: ptr0, ptr1
			
			! access values for arguments
			integer(4), intent(in) :: access0, access1

			! local variables
			
			! used for mallocs and memcpys
!			integer(4) :: data0Size, data1Size
			
			! define and compute grid and block sizes and other variables (unused in this case)

!			real(8), dimension(:), allocatable, device :: argument0
!			real(8), dimension(:), allocatable, device :: argument1

!			type(varSizes_save_soln), device :: argSizes

			integer(4) :: nblocks = 200
			integer(4) :: nthreads = 128
			integer(4) :: nshared = 0
			integer(4) :: offsetS = 0

			integer(4) :: warpSizeOP2

			integer(4) :: threadSynchRet

			! profiling
			integer :: istat
			type (cudaEvent) :: startKernelTime, stopKernelTime, startHostTime, stopHostTime
			real(4) :: tmpHostTime

			! create events
			istat = cudaEventCreate(startKernelTime)
			istat = cudaEventCreate(stopKernelTime)
			istat = cudaEventCreate(startHostTime)
			istat = cudaEventCreate(stopHostTime)
			
			istat = cudaEventRecord ( startHostTime, 0 )
			

			warpSizeOP2 = OP_WARP_SIZE

			! this is mandatory, otherwise nshared will become 4096 from the previous invocation!!
			nshared = 0
																																		
			! work out shared memory requirements per element
									 
			nshared = max ( nshared, 8 * 4 ) ! 8 = sizeof(double) => real(8)
			nshared = max ( nshared, 8 * 4 ) ! 8 = sizeof(double) => real(8)


			offsetS = nshared * OP_WARP_SIZE

			nshared = nshared * nthreads


			if ( isKernelInputDataGenerated .eqv. .false. ) then

				data0Size = ( arg0%dim * arg0%set%size)
				data1Size = ( arg1%dim * arg1%set%size)

				call c_f_pointer ( arg0%dat_d, argument0, (/data0Size/) )
				call c_f_pointer ( arg1%dat_d, argument1, (/data1Size/) )
				
				argSizes%parg0Size = data0Size
				argSizes%parg1Size = data1Size

				isKernelInputDataGenerated = .true.

			end if
			
			istat = cudaEventRecord ( stopHostTime, 0 )
			istat = cudaEventSynchronize ( stopHostTime )
			istat = cudaEventElapsedTime ( tmpHostTime, startHostTime, stopHostTime )

			op_par_loop_save_soln%hostTime = 0
			op_par_loop_save_soln%hostTime = op_par_loop_save_soln%hostTime + tmpHostTime
			tmpHostTime = 0
			
			istat = cudaEventRecord ( startKernelTime, 0 )
			
			! apply kernel to all set elements
			call op_cuda_save_soln<<<nblocks,nthreads,nshared>>> ( argSizes, &
																													 & argument0, &
																													 & argument1, &
																													 & offsetS, &
																													 & set%size, &
																													 & warpSizeOP2 &
																												 & )


			! synchronise threads after kernel call
			threadSynchRet = cudaThreadSynchronize()

			istat = cudaEventRecord ( stopKernelTime, 0 )
			istat = cudaEventSynchronize ( stopKernelTime )
			istat = cudaEventElapsedTime ( op_par_loop_save_soln%kernelTime, startKernelTime, stopKernelTime )

			istat = cudaEventRecord ( startHostTime, 0 )
			! empty code here...only if there is a reduction it is filled up with something
			istat = cudaEventRecord ( stopHostTime, 0 )
			istat = cudaEventSynchronize ( stopHostTime )
			istat = cudaEventElapsedTime ( tmpHostTime, startHostTime, stopHostTime )

			op_par_loop_save_soln%hostTime = op_par_loop_save_soln%hostTime + tmpHostTime	


			
		end function op_par_loop_save_soln

end module save_soln_cuda_module

Any idea what is going on?

Hi Carlo,

An internal compiler error (ICE) is always a compiler problem, but unfortunately I’m not able to reproduce the ICE with the source you provided. Can you please send a complete reproducing example to PGI Customer Service (trs@pgroup.com)?

Thanks,
Mat

Hi Mat,

thanks for your reply. I have e-mailed a complete example to the address above.


Regards,

Carlo