pgfortran hangs on .cuf file with shared memory

I’m trying to speed up an application I have using shared memory. However, when I try to compile the relevant module, the compiler hangs after telling me how much time I have left on my trial license, forcing me to kill the process. I am using 64-bit Ubuntu Linux. The bash terminal looks like this:

pgfortran -c ../src/cubic_bspline_interp_2D_fast_mod.cuf
NOTE: your trial license will expire in 8 days, 8.98 hours.

The module code is below (sorry for the length):

module cubic_bspline_interp_2D_fast_mod
	
	use cudafor

	implicit none

	!------------------------------------------------------------------------------------------------------------------------
	! parameters
	integer, parameter :: real_kind = 4
	real(real_kind), parameter :: pole = sqrt(3.0)-2.0
	real(real_kind), parameter :: lambda = (1.0-pole)*(1.0-1.0/pole)

	!------------------------------------------------------------------------------------------------------------------------
	! module-scope device array containing interpolation coefficients and associated dimensions
	real(real_kind), device, allocatable, dimension(:,:) :: coeffs
	integer :: xDim, yDim, allocFlag

	contains

	!------------------------------------------------------------------------------------------------------------------------
	!------------------------------------------------------------------------------------------------------------------------
	! the following routines handle memory allocation on the gpu
	!------------------------------------------------------------------------------------------------------------------------
	subroutine CudaInit()

		integer :: devNum=0, devStat
		type(cudadeviceprop) :: devProp

		devStat = cudaGetDeviceProperties(devProp,devNum)
		if (devStat .gt. 0) then
			print *, cudaGetErrorString(devStat)
			stop 'Error getting device properties!'
		endif

		print *, 'CUDA Fortran initialized on the following device: ', trim(devProp%name)	

	end subroutine CudaInit
	
	!------------------------------------------------------------------------------------------------------------------------
	subroutine CopyValuesToDevice(samples)

		real(real_kind), dimension(:,:) :: samples

		xDim = size(samples,1)
		yDim = size(samples,2)	
	
		if(allocFlag==0) then
			allocate(coeffs(xDim,yDim))
		endif

		coeffs = samples(1:xDim,1:yDim)
		allocFlag=1
		
	end subroutine CopyValuesToDevice

	!------------------------------------------------------------------------------------------------------------------------
	subroutine CopyCoeffsToHost(coeffsHost)

		real(real_kind), dimension(xDim,yDim) :: coeffsHost
		coeffsHost = coeffs(1:xDim,1:yDim)

	end subroutine CopyCoeffsToHost

	!------------------------------------------------------------------------------------------------------------------------
	subroutine DeallocCoeffs()

		deallocate(coeffs)
		allocFlag=0

	end subroutine DeallocCoeffs

	!------------------------------------------------------------------------------------------------------------------------
	!------------------------------------------------------------------------------------------------------------------------
	! the following routines are used to convert the sample data to interpolation coefficients
	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) function InitialCausalCoefficientX(coeffsBlock, blockRow, width)

		real(real_kind), shared, dimension(blockdim%x,width) :: coeffsBlock
		integer :: blockRow	, width	

		real(real_kind) :: InitialCausalCoefficientX

		integer :: horizon, n
		real(real_kind) :: zn, total

		horizon = min(28,width)
		zn=pole

		total = coeffsBlock(blockRow,1);
		do n=2,horizon
			total = total + zn*coeffsBlock(blockRow,n)
			zn = zn * pole
		enddo

		InitialCausalCoefficientX = total
		return

	end function InitialCausalCoefficientX

	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) function InitialAnticausalCoefficientX(coeffsBlock, blockRow, width)

		real(real_kind), shared, dimension(blockdim%x,width) :: coeffsBlock
		integer :: blockRow	, width	

		real(real_kind) :: InitialAnticausalCoefficientX

		InitialAnticausalCoefficientX = (pole/(pole*pole-1.0))*(pole*coeffsBlock(blockRow,width-1)+coeffsBlock(blockRow,width))
		return
		
	end function InitialAnticausalCoefficientX

	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) subroutine ConvertToInterpolationCoefficientsX(coeffsBlock, blockRow, width)

		real(real_kind), shared, dimension(blockdim%x,width) :: coeffsBlock
		integer :: blockRow	, width	

		!real(real_kind), device, dimension(dataLength) :: c
		!integer :: dataLength

		real(real_kind) :: previous_c
		integer :: n	

		! causal initialization
		coeffsBlock(blockRow,1) = lambda * InitialCausalCoefficientX(coeffsBlock,blockRow,width)
		previous_c = coeffsBlock(blockRow,1)

		! causal recursion
		do n=2,width
			coeffsBlock(blockRow,n) = lambda*coeffsBlock(blockRow,n) + pole*previous_c
			previous_c = coeffsBlock(blockRow,n)
		enddo	

		! anticausal initialization
		coeffsBlock(blockRow,width) = InitialAnticausalCoefficientX(coeffsBlock,blockRow,width)
		previous_c = coeffsBlock(blockRow,width)

		! anticausal recursion
		do n=width-1,1,-1
			coeffsBlock(blockRow,n) = pole * (previous_c - coeffsBlock(blockRow,n))
			previous_c = coeffsBlock(blockRow,n)
		enddo		 

	end subroutine ConvertToInterpolationCoefficientsX

	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) function InitialCausalCoefficientY(blockCoeffs, blockCol, height)

		real(real_kind), shared, dimension(height,blockdim%x) :: blockCoeffs
		integer :: blockCol, height

		real(real_kind) :: InitialCausalCoefficientY

		integer :: horizon, n
		real(real_kind) :: zn, total

		horizon = min(28,height)
		zn=pole

		total = blockCoeffs(1,blockCol);
		do n=2,horizon
			total = total + zn*blockCoeffs(n,blockCol)
			zn = zn * pole
		enddo

		InitialCausalCoefficientY = total
		return

	end function InitialCausalCoefficientY

	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) function InitialAnticausalCoefficientY(blockCoeffs, blockCol, height)

		real(real_kind), shared, dimension(height,blockdim%x) :: blockCoeffs
		integer :: blockCol, height

		real(real_kind) :: InitialAnticausalCoefficientY

		InitialAnticausalCoefficientY = (pole/(pole*pole-1.0))*(pole*blockCoeffs(height-1,blockCol)+blockCoeffs(height,blockCol))
		return

	end function InitialAnticausalCoefficientY

	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) subroutine ConvertToInterpolationCoefficientsY(blockCoeffs, blockCol, height)

		real(real_kind), shared, dimension(height,blockdim%x) :: blockCoeffs
		integer :: blockCol, height

		real(real_kind) :: previous_c
		integer :: n	

		! causal initialization
		blockCoeffs(1,blockCol) = lambda * InitialCausalCoefficientY(blockCoeffs,blockCol,height)
		previous_c = blockCoeffs(1,blockCol)

		! causal recursion
		do n=2,height
			blockCoeffs(n,blockCol) = lambda*blockCoeffs(n,blockCol) + pole*previous_c
			previous_c = blockCoeffs(n,blockCol)
		enddo	

		! anticausal initialization
		blockCoeffs(height,blockCol) = InitialAnticausalCoefficientY(blockCol,height)
		previous_c = blockCoeffs(height,blockCol)

		! anticausal recursion
		do n=height-1,1,-1
			blockCoeffs(n,blockCol) = pole * (previous_c - blockCoeffs(n,blockCol))
			previous_c = blockCoeffs(n,blockCol)
		enddo		


	end subroutine ConvertToInterpolationCoefficientsY
	!------------------------------------------------------------------------------------------------------------------------
	attributes(global) subroutine SamplesToCoefficients2DX(height,width)

		integer, value :: height,width

		integer :: arrRow, blockRow, r
		real(real_kind), shared, dimension(blockdim%x,width) :: coeffsBlock

		blockRow = threadid%x
		arrRow = (blockidx%x-1) * blockdim%x + threadidx%x
		
		if(arrRow<=height .and. blockRow<=blockDim%x) then

			coeffsBlock(blockRow,1:width) = coeffs(arrRow,1:width)
			r = cudathreadsynchronize()

			call ConvertToInterpolationCoefficientsX(coeffsBlock,blockRow, width)
			coeffs(arrRow,1:width) = coeffsBlock(blockRow,1:width)

		endif
	
	end subroutine SamplesToCoefficients2DX

	!------------------------------------------------------------------------------------------------------------------------
	attributes(global) subroutine SamplesToCoefficients2DY(height,width)

		integer, value :: height,width

		integer :: arrCol, blockCol, r
		real(real_kind), shared, dimension(height,blockdim%x) :: coeffsBlock
		
		blockCol = threadid%x
		arrCol = (blockidx%x-1) * blockdim%x + threadidx%x
		
		if(col<=width .and. blockCol <= blockdim%x) then
			
			coeffsBlock(1:height,blockCol) = coeffs(1:height,arrCol)
			r = cudathreadsynchronize()

			call ConvertToInterpolationCoefficientsY(blockCoeffs, blockCol, height)
			coeffs(1:height,arrCol) = blockCoeffs(1:height,blockCol)

		endif
	
	end subroutine SamplesToCoefficients2DY

	!------------------------------------------------------------------------------------------------------------------------
	subroutine SamplesToCoefficients2D()

		integer :: r

		integer :: dimBlockX, dimGridX, dimBlockY, dimGridY, errCode

		if(allocFlag==1) then
			dimBlockX = min(64,xDim);
			dimGridX = max(1,xDim/(dimBlockX)+1)
			call SamplesToCoefficients2DX<<<dimGridX,dimBlockX>>>(xDim,yDim)
			r = cudathreadsynchronize()
			errCode = cudaGetLastError()
			if (errCode .gt. 0) then
			   print *, cudaGetErrorString(errCode)
				stop 'Error! Kernel failed!'
			endif


			dimBlockY = min(64,yDim);
			dimGridY = max(1,yDim/(dimBlockY)+1)
			call SamplesToCoefficients2DY<<<dimGridY,dimBlockY>>>(xDim,yDim)
			errCode = cudaGetLastError()
			if (errCode .gt. 0) then
			   print *, cudaGetErrorString(errCode)
				stop 'Error! Kernel failed!'
			endif

		else
			print *, 'Coefficient matrix not allocated on device yet!'
			stop 'Program terminated by cubic_bspline_interp_2D_mod:SamplesToCoefficients2D'
		endif

	end subroutine SamplesToCoefficients2D

	!------------------------------------------------------------------------------------------------------------------------
	subroutine SamplesToCoefficients2DTimer()

		integer :: r, c1, c2

		integer :: dimBlockX, dimGridX, dimBlockY, dimGridY, errCode

		if(allocFlag==1) then
			call system_clock( count=c1 )

			dimBlockX = min(64,xDim);
			dimGridX = max(1,xDim/(dimBlockX)+1)
			call SamplesToCoefficients2DX<<<dimGridX,dimBlockX>>>(xDim,yDim)
			r = cudathreadsynchronize()
			errCode = cudaGetLastError()
			if (errCode .gt. 0) then
			   print *, cudaGetErrorString(errCode)
				stop 'Error! Kernel failed!'
			endif


			dimBlockY = min(64,yDim);
			dimGridY = max(1,yDim/(dimBlockY)+1)
			call SamplesToCoefficients2DY<<<dimGridY,dimBlockY>>>(xDim,yDim)
			errCode = cudaGetLastError()
			if (errCode .gt. 0) then
			   print *, cudaGetErrorString(errCode)
				stop 'Error! Kernel failed!'
			endif


			call system_clock( count=c2 )

			print *, 'Total prefilter time: ', c2-c1, ' microseconds' 
		else
			print *, 'Coefficient matrix not allocated on device yet!'
			stop 'Program terminated by cubic_bspline_interp_2D_mod:SamplesToCoefficients2DTimer'
		endif 

	end subroutine SamplesToCoefficients2DTimer

	!------------------------------------------------------------------------------------------------------------------------
	!------------------------------------------------------------------------------------------------------------------------
	! the following routines are used to perform the actual interpolation
	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) subroutine BsplineWeights(frac, w0, w1, w2, w3)

		real(real_kind) :: frac, w0, w1, w2, w3

		real(real_kind) :: one_frac, squared, one_squared

		one_frac = 1.0-frac
		squared = frac*frac
		one_squared = one_frac*one_frac

		w0 = 1.0/6.0 * one_squared * one_frac
		w1 = 2.0/3.0 - 0.5 * squared * (2.0-frac)
		w2 = 2.0/3.0 - 0.5 * one_squared * (2.0-one_frac)
		w3 = 1.0/6.0 * squared * frac

	end subroutine BsplineWeights

	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) function LinearInterp2DFunc(u,v)

		real(real_kind) :: u, v
		real(real_kind) :: LinearInterp2DFunc

		integer :: x0, x1, y0, y1
		real(real_kind) :: q00, q01, q10, q11, nx1, nx0, ny1, ny0, dx, dy

		x0 = floor(u)
		x1 = x0+1
		y0 = floor(v)
		y1 = y0+1

		q00 = coeffs(x0, y0)
		q01 = coeffs(x0, y1)
		q10 = coeffs(x1, y0)
		q11 = coeffs(x1, y1)

		nx0 = u - real(x0,real_kind)
		nx1 = real(x1,real_kind) - u
		ny0 = v - real(y0,real_kind)
		ny1 = real(y1,real_kind) - v

		dx = real(x1-x0,real_kind)
		dy = real(y1-y0,real_kind)

		LinearInterp2DFunc = q00*(nx1*ny1)/(dx*dy) + q10*(nx0*ny1)/(dx*dy) + q01*(nx1*ny0)/(dx*dy) + q11*(nx0*ny0)/(dx*dy)
		return			

	end function LinearInterp2DFunc
	
	!------------------------------------------------------------------------------------------------------------------------
	attributes(device) function CubicInterp2DFunc(x,y)

		real(real_kind) :: x, y
		real(real_kind) :: CubicInterp2DFunc

		integer :: xx, yy
		real(real_kind) :: x0, y0, indX, indY, fracX, fracY, w0X, w1X, w2X, w3X, w0Y, w1Y, w2Y, w3Y, g0X, g1X, h0X, h1X, g0Y, g1Y, h0Y, h1Y, li00, li10, li01, li11

		x0 = x-0.5
		y0 = y-0.5
		indX = floor(x0)
		indY = floor(y0)
		fracX = x0 - indX
		fracY = y0 - indY

		call BsplineWeights(fracX, w0X, w1X, w2X, w3X)
		call BsplineWeights(fracY, w0Y, w1Y, w2Y, w3Y)

		g0X= w0X + w1X
		g1X = w2X + w3X
		h0X = (w1X/g0X) - 0.5 + indX
		h1X = (w3X/g1X) + 1.5 + indX

		g0Y= w0Y + w1Y
		g1Y = w2Y + w3Y
		h0Y = (w1Y/g0Y) - 0.5 + indY
		h1Y = (w3Y/g1Y) + 1.5 + indY

		li00 = LinearInterp2DFunc(h0X,h0Y)
		li10 = LinearInterp2DFunc(h1X,h0Y)
		li01 = LinearInterp2DFunc(h0X,h1Y)
		li11 = LinearInterp2DFunc(h1X,h1Y)

		li00 = g0Y * li00 + g1Y * li01
		li10 = g0Y * li10 + g1Y * li11

		CubicInterp2DFunc = g0X * li00 + g1X * li10
		return
		
	end function CubicInterp2DFunc	

	!------------------------------------------------------------------------------------------------------------------------
	attributes(global) subroutine CubicInterpVec2D_kernel(coordsDev,resultDev,nCoords)
		
		real(real_kind), device, dimension(nCoords,2) :: coordsDev
		real(real_kind), device, dimension(nCoords) :: resultDev
		integer,value :: nCoords

		integer :: i
		real(real_kind) :: x, y

		i = (blockidx%x-1) * blockdim%x + threadidx%x
		if (i .le. nCoords) then
			x = coordsDev(i,1)
			y = coordsDev(i,2)
			resultDev(i) = CubicInterp2DFunc(x,y)
		endif

	end subroutine CubicInterpVec2D_kernel
	
	!------------------------------------------------------------------------------------------------------------------------
	subroutine CubicInterpVec2D(coordsHost, resultHost)

		!integer :: nCoords
		real(real_kind), dimension(:,:) :: coordsHost
		real(real_kind), dimension(:) :: resultHost
		!integer :: nCoords

		integer :: dimGrid, dimBlock, r, n, errCode, nCoords
		real(real_kind), device, allocatable, dimension(:,:) :: coordsDev
		real(real_kind), device, allocatable, dimension(:) :: resultDev

		if(allocFlag==1) then

			nCoords = size(resultHost)
			if(size(coordsHost,1) .ne. nCoords) then
				print *, 'Number of coordinates is not equal to the number of desired interpolated values!'
				stop 'Program terminated by cubic_bspline_interp_2D_mod:CubicVec2D'
			endif

			!print *, 'Attempting to allocate device memory...'
			allocate( coordsDev(nCoords, 2), resultDev(nCoords) )

			!print *, 'Attempting to copy test points to device...'			
			coordsDev = coordsHost(1:nCoords, 1:2)

			!print *, 'Attempting to call the kernel...'
			dimBlock = min(64,nCoords)
			dimGrid = max(1,nCoords/dimBlock+1)
			call CubicInterpVec2D_kernel<<<dimGrid,dimBlock>>>(coordsDev,resultDev,nCoords)
			
			errCode = cudaGetLastError()
			if (errCode .gt. 0) then
			   print *, cudaGetErrorString(errCode)
				stop 'Error! Kernel failed!'
			endif

			!print *, 'Attempting to copy results back to host...'
			resultHost=resultDev(1:nCoords)

			!print *, 'Deallocating device memory...'
			deallocate(coordsDev,resultDev)
		else
			print *, 'Coefficient matrix not allocated on device yet!'
			stop 'Program terminated by cubic_bspline_interp_2D_mod:CubicInterpVec2D'
		endif			

	end subroutine CubicInterpVec2D

	!------------------------------------------------------------------------------------------------------------------------
	attributes(global) subroutine CubicInterpSingle2D_kernel(x,y,resultDev)
		
		real(real_kind), value :: x,y
		real(real_kind), device :: resultDev

		resultDev = CubicInterp2DFunc(x,y)

	end subroutine CubicInterpSingle2D_kernel
	!------------------------------------------------------------------------------------------------------------------------
	subroutine CubicInterpSingle2D(x,y,resultHost)

		real(real_kind) :: x,y,resultHost

		integer :: r, errCode
		real(real_kind), allocatable, device :: resultDev

		if(allocFlag==1) then
			allocate(resultDev)
			call CubicInterpSingle2D_kernel<<<1,1>>>(x,y,resultDev)
			
			errCode = cudaGetLastError()
			if (errCode .gt. 0) then
			   print *, cudaGetErrorString(errCode)
				stop 'Error! Kernel failed!'
			endif

			resultHost=resultDev
			deallocate(resultDev)
		else
			print *, 'Coefficient matrix not allocated on device yet!'
			stop 'Program terminated by cubic_bspline_interp_2D_mod:CubicInterpSingle2D'
		endif			

	end subroutine CubicInterpSingle2D

end module cubic_bspline_interp_2D_fast_mod

The only things I’ve changed from the earlier version are: InitialCausalCoefficientX(Y), InitialAntiCausalCoefficientX(Y), ConvertToInterpolationCoefficientsX(Y), and SamplesToCoefficients2DX(Y). Everything else is unchanged from the version of the code that works properly.

I think the problem is probably in your use of shared memory. The compiler should be giving you an error, but evidently isn’t.

In the CUDA Fortran doc, section 3.2.3, it talks about shared memory limitations. You need to make the shared memory declarations either fixed size, or assumed-size. If it is assumed size, and you want it to be two dimensional, use a fixed-size first dimension, and make the 2nd dimension ‘*’. Then, pass in the size of the total shared array, in bytes, as the 3rd argument inside of the chevron syntax. This is basically the same way as it needs to be done in CUDA C.

I’ll enter a bug for not flagging your use of shared memory as an error.

Thanks. I figured there was an error somewhere. I will try that tomorrow when I get to work.