pgfortran hangs on .cuf file with shared memory

I’m trying to speed up an application I have using shared memory. However, when I try to compile the relevant module, the compiler hangs after telling me how much time I have left on my trial license, forcing me to kill the process. I am using 64-bit Ubuntu Linux. The bash terminal looks like this:

pgfortran -c ../src/cubic_bspline_interp_2D_fast_mod.cuf
NOTE: your trial license will expire in 8 days, 8.98 hours.

The module code is below (sorry for the length):

module cubic_bspline_interp_2D_fast_mod
	
	use cudafor

	implicit none

	!------------------------------------------------------------------------------------------------------------------------
	! parameters
	integer, parameter :: real_kind = 4
	real(real_kind), parameter :: pole = sqrt(3.0)-2.0
	real(real_kind), parameter :: lambda = (1.0-pole)*(1.0-1.0/pole)

	!------------------------------------------------------------------------------------------------------------------------
	! module-scope device array containing interpolation coefficients and associated dimensions
	real(real_kind), device, allocatable, dimension(:,:) :: coeffs
	integer :: xDim, yDim, allocFlag

	contains

	!------------------------------------------------------------------------------------------------------------------------
	!------------------------------------------------------------------------------------------------------------------------
	! the following routines handle memory allocation on the gpu
	!------------------------------------------------------------------------------------------------------------------------
	subroutine CudaInit()

		integer :: devNum=0, devStat
		type(cudadeviceprop) :: devProp

		devStat = cudaGetDeviceProperties(devProp,devNum)
		if (devStat .gt. 0) then
			print *, cudaGetErrorString(devStat)
			stop 'Error getting device properties!'
		endif

		print *, 'CUDA Fortran initialized on the following device: ', trim(devProp%name)	

	end subroutine CudaInit
	
	!------------------------------------------------------------------------------------------------------------------------
	subroutine CopyValuesToDevice(samples)

		real(real_kind), dimension(:,:) :: samples

		xDim = size(samples,1)
		yDim = size(samples,2)	
	
		if(allocFlag==0) then
			allocate(coeffs(xDim,yDim))
		endif

		coeffs = samples(1:xDim,1:yDim)
		allocFlag=1
		
	end subroutine CopyValuesToDevice

	!------------------------------------------------------------------------------------------------------------------------
	subroutine CopyCoeffsToHost(coeffsHost)

		real(real_kind), dimension(xDim,yDim) :: coeffsHost
		coeffsHost = coeffs(1:xDim,1:yDim)

	end subroutine CopyCoeffsToHost

	!------------------------------------------------------------------------------------------------------------------------
	subroutine DeallocCoeffs()

		deallocate(coeffs)
		allocFlag=0

	end subroutine DeallocCoeffs

	!------------------------------------------------------------------------------------------------------------------------
	!------------------------------------------------------------------------------------------------------------------------
	! the following routines are used to convert the sample data to interpolation coefficients
	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) function InitialCausalCoefficientX(coeffsBlock, blockRow, width)

		real(real_kind), shared, dimension(blockdim%x,width) :: coeffsBlock
		integer :: blockRow	, width	

		real(real_kind) :: InitialCausalCoefficientX

		integer :: horizon, n
		real(real_kind) :: zn, total

		horizon = min(28,width)
		zn=pole

		total = coeffsBlock(blockRow,1);
		do n=2,horizon
			total = total + zn*coeffsBlock(blockRow,n)
			zn = zn * pole
		enddo

		InitialCausalCoefficientX = total
		return

	end function InitialCausalCoefficientX

	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) function InitialAnticausalCoefficientX(coeffsBlock, blockRow, width)

		real(real_kind), shared, dimension(blockdim%x,width) :: coeffsBlock
		integer :: blockRow	, width	

		real(real_kind) :: InitialAnticausalCoefficientX

		InitialAnticausalCoefficientX = (pole/(pole*pole-1.0))*(pole*coeffsBlock(blockRow,width-1)+coeffsBlock(blockRow,width))
		return
		
	end function InitialAnticausalCoefficientX

	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) subroutine ConvertToInterpolationCoefficientsX(coeffsBlock, blockRow, width)

		real(real_kind), shared, dimension(blockdim%x,width) :: coeffsBlock
		integer :: blockRow	, width	

		!real(real_kind), device, dimension(dataLength) :: c
		!integer :: dataLength

		real(real_kind) :: previous_c
		integer :: n	

		! causal initialization
		coeffsBlock(blockRow,1) = lambda * InitialCausalCoefficientX(coeffsBlock,blockRow,width)
		previous_c = coeffsBlock(blockRow,1)

		! causal recursion
		do n=2,width
			coeffsBlock(blockRow,n) = lambda*coeffsBlock(blockRow,n) + pole*previous_c
			previous_c = coeffsBlock(blockRow,n)
		enddo	

		! anticausal initialization
		coeffsBlock(blockRow,width) = InitialAnticausalCoefficientX(coeffsBlock,blockRow,width)
		previous_c = coeffsBlock(blockRow,width)

		! anticausal recursion
		do n=width-1,1,-1
			coeffsBlock(blockRow,n) = pole * (previous_c - coeffsBlock(blockRow,n))
			previous_c = coeffsBlock(blockRow,n)
		enddo		 

	end subroutine ConvertToInterpolationCoefficientsX

	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) function InitialCausalCoefficientY(blockCoeffs, blockCol, height)

		real(real_kind), shared, dimension(height,blockdim%x) :: blockCoeffs
		integer :: blockCol, height

		real(real_kind) :: InitialCausalCoefficientY

		integer :: horizon, n
		real(real_kind) :: zn, total

		horizon = min(28,height)
		zn=pole

		total = blockCoeffs(1,blockCol);
		do n=2,horizon
			total = total + zn*blockCoeffs(n,blockCol)
			zn = zn * pole
		enddo

		InitialCausalCoefficientY = total
		return

	end function InitialCausalCoefficientY

	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) function InitialAnticausalCoefficientY(blockCoeffs, blockCol, height)

		real(real_kind), shared, dimension(height,blockdim%x) :: blockCoeffs
		integer :: blockCol, height

		real(real_kind) :: InitialAnticausalCoefficientY

		InitialAnticausalCoefficientY = (pole/(pole*pole-1.0))*(pole*blockCoeffs(height-1,blockCol)+blockCoeffs(height,blockCol))
		return

	end function InitialAnticausalCoefficientY

	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) subroutine ConvertToInterpolationCoefficientsY(blockCoeffs, blockCol, height)

		real(real_kind), shared, dimension(height,blockdim%x) :: blockCoeffs
		integer :: blockCol, height

		real(real_kind) :: previous_c
		integer :: n	

		! causal initialization
		blockCoeffs(1,blockCol) = lambda * InitialCausalCoefficientY(blockCoeffs,blockCol,height)
		previous_c = blockCoeffs(1,blockCol)

		! causal recursion
		do n=2,height
			blockCoeffs(n,blockCol) = lambda*blockCoeffs(n,blockCol) + pole*previous_c
			previous_c = blockCoeffs(n,blockCol)
		enddo	

		! anticausal initialization
		blockCoeffs(height,blockCol) = InitialAnticausalCoefficientY(blockCol,height)
		previous_c = blockCoeffs(height,blockCol)

		! anticausal recursion
		do n=height-1,1,-1
			blockCoeffs(n,blockCol) = pole * (previous_c - blockCoeffs(n,blockCol))
			previous_c = blockCoeffs(n,blockCol)
		enddo		


	end subroutine ConvertToInterpolationCoefficientsY
	!------------------------------------------------------------------------------------------------------------------------
	attributes(global) subroutine SamplesToCoefficients2DX(height,width)

		integer, value :: height,width

		integer :: arrRow, blockRow, r
		real(real_kind), shared, dimension(blockdim%x,width) :: coeffsBlock

		blockRow = threadid%x
		arrRow = (blockidx%x-1) * blockdim%x + threadidx%x
		
		if(arrRow<=height .and. blockRow<=blockDim%x) then

			coeffsBlock(blockRow,1:width) = coeffs(arrRow,1:width)
			r = cudathreadsynchronize()

			call ConvertToInterpolationCoefficientsX(coeffsBlock,blockRow, width)
			coeffs(arrRow,1:width) = coeffsBlock(blockRow,1:width)

		endif
	
	end subroutine SamplesToCoefficients2DX

	!------------------------------------------------------------------------------------------------------------------------
	attributes(global) subroutine SamplesToCoefficients2DY(height,width)

		integer, value :: height,width

		integer :: arrCol, blockCol, r
		real(real_kind), shared, dimension(height,blockdim%x) :: coeffsBlock
		
		blockCol = threadid%x
		arrCol = (blockidx%x-1) * blockdim%x + threadidx%x
		
		if(col<=width .and. blockCol <= blockdim%x) then
			
			coeffsBlock(1:height,blockCol) = coeffs(1:height,arrCol)
			r = cudathreadsynchronize()

			call ConvertToInterpolationCoefficientsY(blockCoeffs, blockCol, height)
			coeffs(1:height,arrCol) = blockCoeffs(1:height,blockCol)

		endif
	
	end subroutine SamplesToCoefficients2DY

	!------------------------------------------------------------------------------------------------------------------------
	subroutine SamplesToCoefficients2D()

		integer :: r

		integer :: dimBlockX, dimGridX, dimBlockY, dimGridY, errCode

		if(allocFlag==1) then
			dimBlockX = min(64,xDim);
			dimGridX = max(1,xDim/(dimBlockX)+1)
			call SamplesToCoefficients2DX<<<dimGridX,dimBlockX>>>(xDim,yDim)
			r = cudathreadsynchronize()
			errCode = cudaGetLastError()
			if (errCode .gt. 0) then
			   print *, cudaGetErrorString(errCode)
				stop 'Error! Kernel failed!'
			endif


			dimBlockY = min(64,yDim);
			dimGridY = max(1,yDim/(dimBlockY)+1)
			call SamplesToCoefficients2DY<<<dimGridY,dimBlockY>>>(xDim,yDim)
			errCode = cudaGetLastError()
			if (errCode .gt. 0) then
			   print *, cudaGetErrorString(errCode)
				stop 'Error! Kernel failed!'
			endif

		else
			print *, 'Coefficient matrix not allocated on device yet!'
			stop 'Program terminated by cubic_bspline_interp_2D_mod:SamplesToCoefficients2D'
		endif

	end subroutine SamplesToCoefficients2D

	!------------------------------------------------------------------------------------------------------------------------
	subroutine SamplesToCoefficients2DTimer()

		integer :: r, c1, c2

		integer :: dimBlockX, dimGridX, dimBlockY, dimGridY, errCode

		if(allocFlag==1) then
			call system_clock( count=c1 )

			dimBlockX = min(64,xDim);
			dimGridX = max(1,xDim/(dimBlockX)+1)
			call SamplesToCoefficients2DX<<<dimGridX,dimBlockX>>>(xDim,yDim)
			r = cudathreadsynchronize()
			errCode = cudaGetLastError()
			if (errCode .gt. 0) then
			   print *, cudaGetErrorString(errCode)
				stop 'Error! Kernel failed!'
			endif


			dimBlockY = min(64,yDim);
			dimGridY = max(1,yDim/(dimBlockY)+1)
			call SamplesToCoefficients2DY<<<dimGridY,dimBlockY>>>(xDim,yDim)
			errCode = cudaGetLastError()
			if (errCode .gt. 0) then
			   print *, cudaGetErrorString(errCode)
				stop 'Error! Kernel failed!'
			endif


			call system_clock( count=c2 )

			print *, 'Total prefilter time: ', c2-c1, ' microseconds' 
		else
			print *, 'Coefficient matrix not allocated on device yet!'
			stop 'Program terminated by cubic_bspline_interp_2D_mod:SamplesToCoefficients2DTimer'
		endif 

	end subroutine SamplesToCoefficients2DTimer

	!------------------------------------------------------------------------------------------------------------------------
	!------------------------------------------------------------------------------------------------------------------------
	! the following routines are used to perform the actual interpolation
	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) subroutine BsplineWeights(frac, w0, w1, w2, w3)

		real(real_kind) :: frac, w0, w1, w2, w3

		real(real_kind) :: one_frac, squared, one_squared

		one_frac = 1.0-frac
		squared = frac*frac
		one_squared = one_frac*one_frac

		w0 = 1.0/6.0 * one_squared * one_frac
		w1 = 2.0/3.0 - 0.5 * squared * (2.0-frac)
		w2 = 2.0/3.0 - 0.5 * one_squared * (2.0-one_frac)
		w3 = 1.0/6.0 * squared * frac

	end subroutine BsplineWeights

	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) function LinearInterp2DFunc(u,v)

		real(real_kind) :: u, v
		real(real_kind) :: LinearInterp2DFunc

		integer :: x0, x1, y0, y1
		real(real_kind) :: q00, q01, q10, q11, nx1, nx0, ny1, ny0, dx, dy

		x0 = floor(u)
		x1 = x0+1
		y0 = floor(v)
		y1 = y0+1

		q00 = coeffs(x0, y0)
		q01 = coeffs(x0, y1)
		q10 = coeffs(x1, y0)
		q11 = coeffs(x1, y1)

		nx0 = u - real(x0,real_kind)
		nx1 = real(x1,real_kind) - u
		ny0 = v - real(y0,real_kind)
		ny1 = real(y1,real_kind) - v

		dx = real(x1-x0,real_kind)
		dy = real(y1-y0,real_kind)

		LinearInterp2DFunc = q00*(nx1*ny1)/(dx*dy) + q10*(nx0*ny1)/(dx*dy) + q01*(nx1*ny0)/(dx*dy) + q11*(nx0*ny0)/(dx*dy)
		return			

	end function LinearInterp2DFunc
	
	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) function CubicInterp2DFunc(x,y)

		real(real_kind) :: x, y
		real(real_kind) :: CubicInterp2DFunc

		integer :: xx, yy
		real(real_kind) :: x0, y0, indX, indY, fracX, fracY, w0X, w1X, w2X, w3X, w0Y, w1Y, w2Y, w3Y, g0X, g1X, h0X, h1X, g0Y, g1Y, h0Y, h1Y, li00, li10, li01, li11

		x0 = x-0.5
		y0 = y-0.5
		indX = floor(x0)
		indY = floor(y0)
		fracX = x0 - indX
		fracY = y0 - indY

		call BsplineWeights(fracX, w0X, w1X, w2X, w3X)
		call BsplineWeights(fracY, w0Y, w1Y, w2Y, w3Y)

		g0X= w0X + w1X
		g1X = w2X + w3X
		h0X = (w1X/g0X) - 0.5 + indX
		h1X = (w3X/g1X) + 1.5 + indX

		g0Y= w0Y + w1Y
		g1Y = w2Y + w3Y
		h0Y = (w1Y/g0Y) - 0.5 + indY
		h1Y = (w3Y/g1Y) + 1.5 + indY

		li00 = LinearInterp2DFunc(h0X,h0Y)
		li10 = LinearInterp2DFunc(h1X,h0Y)
		li01 = LinearInterp2DFunc(h0X,h1Y)
		li11 = LinearInterp2DFunc(h1X,h1Y)

		li00 = g0Y * li00 + g1Y * li01
		li10 = g0Y * li10 + g1Y * li11

		CubicInterp2DFunc = g0X * li00 + g1X * li10
		return
		
	end function CubicInterp2DFunc	

	!------------------------------------------------------------------------------------------------------------------------
	attributes(global) subroutine CubicInterpVec2D_kernel(coordsDev,resultDev,nCoords)
		
		real(real_kind), device, dimension(nCoords,2) :: coordsDev
		real(real_kind), device, dimension(nCoords) :: resultDev
		integer,value :: nCoords

		integer :: i
		real(real_kind) :: x, y

		i = (blockidx%x-1) * blockdim%x + threadidx%x
		if (i .le. nCoords) then
			x = coordsDev(i,1)
			y = coordsDev(i,2)
			resultDev(i) = CubicInterp2DFunc(x,y)
		endif

	end subroutine CubicInterpVec2D_kernel
	
	!------------------------------------------------------------------------------------------------------------------------
	subroutine CubicInterpVec2D(coordsHost, resultHost)

		!integer :: nCoords
		real(real_kind), dimension(:,:) :: coordsHost
		real(real_kind), dimension(:) :: resultHost
		!integer :: nCoords

		integer :: dimGrid, dimBlock, r, n, errCode, nCoords
		real(real_kind), device, allocatable, dimension(:,:) :: coordsDev
		real(real_kind), device, allocatable, dimension(:) :: resultDev

		if(allocFlag==1) then

			nCoords = size(resultHost)
			if(size(coordsHost,1) .ne. nCoords) then
				print *, 'Number of coordinates is not equal to the number of desired interpolated values!'
				stop 'Program terminated by cubic_bspline_interp_2D_mod:CubicVec2D'
			endif

			!print *, 'Attempting to allocate device memory...'
			allocate( coordsDev(nCoords, 2), resultDev(nCoords) )

			!print *, 'Attempting to copy test points to device...'			
			coordsDev = coordsHost(1:nCoords, 1:2)

			!print *, 'Attempting to call the kernel...'
			dimBlock = min(64,nCoords)
			dimGrid = max(1,nCoords/dimBlock+1)
			call CubicInterpVec2D_kernel<<<dimGrid,dimBlock>>>(coordsDev,resultDev,nCoords)
			
			errCode = cudaGetLastError()
			if (errCode .gt. 0) then
			   print *, cudaGetErrorString(errCode)
				stop 'Error! Kernel failed!'
			endif

			!print *, 'Attempting to copy results back to host...'
			resultHost=resultDev(1:nCoords)

			!print *, 'Deallocating device memory...'
			deallocate(coordsDev,resultDev)
		else
			print *, 'Coefficient matrix not allocated on device yet!'
			stop 'Program terminated by cubic_bspline_interp_2D_mod:CubicInterpVec2D'
		endif			

	end subroutine CubicInterpVec2D

	!------------------------------------------------------------------------------------------------------------------------
	attributes(global) subroutine CubicInterpSingle2D_kernel(x,y,resultDev)
		
		real(real_kind), value :: x,y
		real(real_kind), device :: resultDev

		resultDev = CubicInterp2DFunc(x,y)

	end subroutine CubicInterpSingle2D_kernel
	!------------------------------------------------------------------------------------------------------------------------
	subroutine CubicInterpSingle2D(x,y,resultHost)

		real(real_kind) :: x,y,resultHost

		integer :: r, errCode
		real(real_kind), allocatable, device :: resultDev

		if(allocFlag==1) then
			allocate(resultDev)
			call CubicInterpSingle2D_kernel<<<1,1>>>(x,y,resultDev)
			
			errCode = cudaGetLastError()
			if (errCode .gt. 0) then
			   print *, cudaGetErrorString(errCode)
				stop 'Error! Kernel failed!'
			endif

			resultHost=resultDev
			deallocate(resultDev)
		else
			print *, 'Coefficient matrix not allocated on device yet!'
			stop 'Program terminated by cubic_bspline_interp_2D_mod:CubicInterpSingle2D'
		endif			

	end subroutine CubicInterpSingle2D

end module cubic_bspline_interp_2D_fast_mod

The only things I’ve changed from the earlier version are: InitialCausalCoefficientX(Y), InitialAntiCausalCoefficientX(Y), ConvertToInterpolationCoefficientsX(Y), and SamplesToCoefficients2DX(Y). Everything else is unchanged from the version of the code that works properly.