I’m trying to speed up an application I have using shared memory. However, when I try to compile the relevant module, the compiler hangs after telling me how much time I have left on my trial license, forcing me to kill the process. I am using 64-bit Ubuntu Linux. The bash terminal looks like this:
pgfortran -c ../src/cubic_bspline_interp_2D_fast_mod.cuf
NOTE: your trial license will expire in 8 days, 8.98 hours.
The module code is below (sorry for the length):
module cubic_bspline_interp_2D_fast_mod
use cudafor
implicit none
!------------------------------------------------------------------------------------------------------------------------
! parameters
integer, parameter :: real_kind = 4
real(real_kind), parameter :: pole = sqrt(3.0)-2.0
real(real_kind), parameter :: lambda = (1.0-pole)*(1.0-1.0/pole)
!------------------------------------------------------------------------------------------------------------------------
! module-scope device array containing interpolation coefficients and associated dimensions
real(real_kind), device, allocatable, dimension(:,:) :: coeffs
integer :: xDim, yDim, allocFlag
contains
!------------------------------------------------------------------------------------------------------------------------
!------------------------------------------------------------------------------------------------------------------------
! the following routines handle memory allocation on the gpu
!------------------------------------------------------------------------------------------------------------------------
subroutine CudaInit()
integer :: devNum=0, devStat
type(cudadeviceprop) :: devProp
devStat = cudaGetDeviceProperties(devProp,devNum)
if (devStat .gt. 0) then
print *, cudaGetErrorString(devStat)
stop 'Error getting device properties!'
endif
print *, 'CUDA Fortran initialized on the following device: ', trim(devProp%name)
end subroutine CudaInit
!------------------------------------------------------------------------------------------------------------------------
subroutine CopyValuesToDevice(samples)
real(real_kind), dimension(:,:) :: samples
xDim = size(samples,1)
yDim = size(samples,2)
if(allocFlag==0) then
allocate(coeffs(xDim,yDim))
endif
coeffs = samples(1:xDim,1:yDim)
allocFlag=1
end subroutine CopyValuesToDevice
!------------------------------------------------------------------------------------------------------------------------
subroutine CopyCoeffsToHost(coeffsHost)
real(real_kind), dimension(xDim,yDim) :: coeffsHost
coeffsHost = coeffs(1:xDim,1:yDim)
end subroutine CopyCoeffsToHost
!------------------------------------------------------------------------------------------------------------------------
subroutine DeallocCoeffs()
deallocate(coeffs)
allocFlag=0
end subroutine DeallocCoeffs
!------------------------------------------------------------------------------------------------------------------------
!------------------------------------------------------------------------------------------------------------------------
! the following routines are used to convert the sample data to interpolation coefficients
!------------------------------------------------------------------------------------------------------------------------
attributes(device) function InitialCausalCoefficientX(coeffsBlock, blockRow, width)
real(real_kind), shared, dimension(blockdim%x,width) :: coeffsBlock
integer :: blockRow , width
real(real_kind) :: InitialCausalCoefficientX
integer :: horizon, n
real(real_kind) :: zn, total
horizon = min(28,width)
zn=pole
total = coeffsBlock(blockRow,1);
do n=2,horizon
total = total + zn*coeffsBlock(blockRow,n)
zn = zn * pole
enddo
InitialCausalCoefficientX = total
return
end function InitialCausalCoefficientX
!------------------------------------------------------------------------------------------------------------------------
attributes(device) function InitialAnticausalCoefficientX(coeffsBlock, blockRow, width)
real(real_kind), shared, dimension(blockdim%x,width) :: coeffsBlock
integer :: blockRow , width
real(real_kind) :: InitialAnticausalCoefficientX
InitialAnticausalCoefficientX = (pole/(pole*pole-1.0))*(pole*coeffsBlock(blockRow,width-1)+coeffsBlock(blockRow,width))
return
end function InitialAnticausalCoefficientX
!------------------------------------------------------------------------------------------------------------------------
attributes(device) subroutine ConvertToInterpolationCoefficientsX(coeffsBlock, blockRow, width)
real(real_kind), shared, dimension(blockdim%x,width) :: coeffsBlock
integer :: blockRow , width
!real(real_kind), device, dimension(dataLength) :: c
!integer :: dataLength
real(real_kind) :: previous_c
integer :: n
! causal initialization
coeffsBlock(blockRow,1) = lambda * InitialCausalCoefficientX(coeffsBlock,blockRow,width)
previous_c = coeffsBlock(blockRow,1)
! causal recursion
do n=2,width
coeffsBlock(blockRow,n) = lambda*coeffsBlock(blockRow,n) + pole*previous_c
previous_c = coeffsBlock(blockRow,n)
enddo
! anticausal initialization
coeffsBlock(blockRow,width) = InitialAnticausalCoefficientX(coeffsBlock,blockRow,width)
previous_c = coeffsBlock(blockRow,width)
! anticausal recursion
do n=width-1,1,-1
coeffsBlock(blockRow,n) = pole * (previous_c - coeffsBlock(blockRow,n))
previous_c = coeffsBlock(blockRow,n)
enddo
end subroutine ConvertToInterpolationCoefficientsX
!------------------------------------------------------------------------------------------------------------------------
attributes(device) function InitialCausalCoefficientY(blockCoeffs, blockCol, height)
real(real_kind), shared, dimension(height,blockdim%x) :: blockCoeffs
integer :: blockCol, height
real(real_kind) :: InitialCausalCoefficientY
integer :: horizon, n
real(real_kind) :: zn, total
horizon = min(28,height)
zn=pole
total = blockCoeffs(1,blockCol);
do n=2,horizon
total = total + zn*blockCoeffs(n,blockCol)
zn = zn * pole
enddo
InitialCausalCoefficientY = total
return
end function InitialCausalCoefficientY
!------------------------------------------------------------------------------------------------------------------------
attributes(device) function InitialAnticausalCoefficientY(blockCoeffs, blockCol, height)
real(real_kind), shared, dimension(height,blockdim%x) :: blockCoeffs
integer :: blockCol, height
real(real_kind) :: InitialAnticausalCoefficientY
InitialAnticausalCoefficientY = (pole/(pole*pole-1.0))*(pole*blockCoeffs(height-1,blockCol)+blockCoeffs(height,blockCol))
return
end function InitialAnticausalCoefficientY
!------------------------------------------------------------------------------------------------------------------------
attributes(device) subroutine ConvertToInterpolationCoefficientsY(blockCoeffs, blockCol, height)
real(real_kind), shared, dimension(height,blockdim%x) :: blockCoeffs
integer :: blockCol, height
real(real_kind) :: previous_c
integer :: n
! causal initialization
blockCoeffs(1,blockCol) = lambda * InitialCausalCoefficientY(blockCoeffs,blockCol,height)
previous_c = blockCoeffs(1,blockCol)
! causal recursion
do n=2,height
blockCoeffs(n,blockCol) = lambda*blockCoeffs(n,blockCol) + pole*previous_c
previous_c = blockCoeffs(n,blockCol)
enddo
! anticausal initialization
blockCoeffs(height,blockCol) = InitialAnticausalCoefficientY(blockCol,height)
previous_c = blockCoeffs(height,blockCol)
! anticausal recursion
do n=height-1,1,-1
blockCoeffs(n,blockCol) = pole * (previous_c - blockCoeffs(n,blockCol))
previous_c = blockCoeffs(n,blockCol)
enddo
end subroutine ConvertToInterpolationCoefficientsY
!------------------------------------------------------------------------------------------------------------------------
attributes(global) subroutine SamplesToCoefficients2DX(height,width)
integer, value :: height,width
integer :: arrRow, blockRow, r
real(real_kind), shared, dimension(blockdim%x,width) :: coeffsBlock
blockRow = threadid%x
arrRow = (blockidx%x-1) * blockdim%x + threadidx%x
if(arrRow<=height .and. blockRow<=blockDim%x) then
coeffsBlock(blockRow,1:width) = coeffs(arrRow,1:width)
r = cudathreadsynchronize()
call ConvertToInterpolationCoefficientsX(coeffsBlock,blockRow, width)
coeffs(arrRow,1:width) = coeffsBlock(blockRow,1:width)
endif
end subroutine SamplesToCoefficients2DX
!------------------------------------------------------------------------------------------------------------------------
attributes(global) subroutine SamplesToCoefficients2DY(height,width)
integer, value :: height,width
integer :: arrCol, blockCol, r
real(real_kind), shared, dimension(height,blockdim%x) :: coeffsBlock
blockCol = threadid%x
arrCol = (blockidx%x-1) * blockdim%x + threadidx%x
if(col<=width .and. blockCol <= blockdim%x) then
coeffsBlock(1:height,blockCol) = coeffs(1:height,arrCol)
r = cudathreadsynchronize()
call ConvertToInterpolationCoefficientsY(blockCoeffs, blockCol, height)
coeffs(1:height,arrCol) = blockCoeffs(1:height,blockCol)
endif
end subroutine SamplesToCoefficients2DY
!------------------------------------------------------------------------------------------------------------------------
subroutine SamplesToCoefficients2D()
integer :: r
integer :: dimBlockX, dimGridX, dimBlockY, dimGridY, errCode
if(allocFlag==1) then
dimBlockX = min(64,xDim);
dimGridX = max(1,xDim/(dimBlockX)+1)
call SamplesToCoefficients2DX<<<dimGridX,dimBlockX>>>(xDim,yDim)
r = cudathreadsynchronize()
errCode = cudaGetLastError()
if (errCode .gt. 0) then
print *, cudaGetErrorString(errCode)
stop 'Error! Kernel failed!'
endif
dimBlockY = min(64,yDim);
dimGridY = max(1,yDim/(dimBlockY)+1)
call SamplesToCoefficients2DY<<<dimGridY,dimBlockY>>>(xDim,yDim)
errCode = cudaGetLastError()
if (errCode .gt. 0) then
print *, cudaGetErrorString(errCode)
stop 'Error! Kernel failed!'
endif
else
print *, 'Coefficient matrix not allocated on device yet!'
stop 'Program terminated by cubic_bspline_interp_2D_mod:SamplesToCoefficients2D'
endif
end subroutine SamplesToCoefficients2D
!------------------------------------------------------------------------------------------------------------------------
subroutine SamplesToCoefficients2DTimer()
integer :: r, c1, c2
integer :: dimBlockX, dimGridX, dimBlockY, dimGridY, errCode
if(allocFlag==1) then
call system_clock( count=c1 )
dimBlockX = min(64,xDim);
dimGridX = max(1,xDim/(dimBlockX)+1)
call SamplesToCoefficients2DX<<<dimGridX,dimBlockX>>>(xDim,yDim)
r = cudathreadsynchronize()
errCode = cudaGetLastError()
if (errCode .gt. 0) then
print *, cudaGetErrorString(errCode)
stop 'Error! Kernel failed!'
endif
dimBlockY = min(64,yDim);
dimGridY = max(1,yDim/(dimBlockY)+1)
call SamplesToCoefficients2DY<<<dimGridY,dimBlockY>>>(xDim,yDim)
errCode = cudaGetLastError()
if (errCode .gt. 0) then
print *, cudaGetErrorString(errCode)
stop 'Error! Kernel failed!'
endif
call system_clock( count=c2 )
print *, 'Total prefilter time: ', c2-c1, ' microseconds'
else
print *, 'Coefficient matrix not allocated on device yet!'
stop 'Program terminated by cubic_bspline_interp_2D_mod:SamplesToCoefficients2DTimer'
endif
end subroutine SamplesToCoefficients2DTimer
!------------------------------------------------------------------------------------------------------------------------
!------------------------------------------------------------------------------------------------------------------------
! the following routines are used to perform the actual interpolation
!------------------------------------------------------------------------------------------------------------------------
attributes(device) subroutine BsplineWeights(frac, w0, w1, w2, w3)
real(real_kind) :: frac, w0, w1, w2, w3
real(real_kind) :: one_frac, squared, one_squared
one_frac = 1.0-frac
squared = frac*frac
one_squared = one_frac*one_frac
w0 = 1.0/6.0 * one_squared * one_frac
w1 = 2.0/3.0 - 0.5 * squared * (2.0-frac)
w2 = 2.0/3.0 - 0.5 * one_squared * (2.0-one_frac)
w3 = 1.0/6.0 * squared * frac
end subroutine BsplineWeights
!------------------------------------------------------------------------------------------------------------------------
attributes(device) function LinearInterp2DFunc(u,v)
real(real_kind) :: u, v
real(real_kind) :: LinearInterp2DFunc
integer :: x0, x1, y0, y1
real(real_kind) :: q00, q01, q10, q11, nx1, nx0, ny1, ny0, dx, dy
x0 = floor(u)
x1 = x0+1
y0 = floor(v)
y1 = y0+1
q00 = coeffs(x0, y0)
q01 = coeffs(x0, y1)
q10 = coeffs(x1, y0)
q11 = coeffs(x1, y1)
nx0 = u - real(x0,real_kind)
nx1 = real(x1,real_kind) - u
ny0 = v - real(y0,real_kind)
ny1 = real(y1,real_kind) - v
dx = real(x1-x0,real_kind)
dy = real(y1-y0,real_kind)
LinearInterp2DFunc = q00*(nx1*ny1)/(dx*dy) + q10*(nx0*ny1)/(dx*dy) + q01*(nx1*ny0)/(dx*dy) + q11*(nx0*ny0)/(dx*dy)
return
end function LinearInterp2DFunc
!------------------------------------------------------------------------------------------------------------------------
attributes(device) function CubicInterp2DFunc(x,y)
real(real_kind) :: x, y
real(real_kind) :: CubicInterp2DFunc
integer :: xx, yy
real(real_kind) :: x0, y0, indX, indY, fracX, fracY, w0X, w1X, w2X, w3X, w0Y, w1Y, w2Y, w3Y, g0X, g1X, h0X, h1X, g0Y, g1Y, h0Y, h1Y, li00, li10, li01, li11
x0 = x-0.5
y0 = y-0.5
indX = floor(x0)
indY = floor(y0)
fracX = x0 - indX
fracY = y0 - indY
call BsplineWeights(fracX, w0X, w1X, w2X, w3X)
call BsplineWeights(fracY, w0Y, w1Y, w2Y, w3Y)
g0X= w0X + w1X
g1X = w2X + w3X
h0X = (w1X/g0X) - 0.5 + indX
h1X = (w3X/g1X) + 1.5 + indX
g0Y= w0Y + w1Y
g1Y = w2Y + w3Y
h0Y = (w1Y/g0Y) - 0.5 + indY
h1Y = (w3Y/g1Y) + 1.5 + indY
li00 = LinearInterp2DFunc(h0X,h0Y)
li10 = LinearInterp2DFunc(h1X,h0Y)
li01 = LinearInterp2DFunc(h0X,h1Y)
li11 = LinearInterp2DFunc(h1X,h1Y)
li00 = g0Y * li00 + g1Y * li01
li10 = g0Y * li10 + g1Y * li11
CubicInterp2DFunc = g0X * li00 + g1X * li10
return
end function CubicInterp2DFunc
!------------------------------------------------------------------------------------------------------------------------
attributes(global) subroutine CubicInterpVec2D_kernel(coordsDev,resultDev,nCoords)
real(real_kind), device, dimension(nCoords,2) :: coordsDev
real(real_kind), device, dimension(nCoords) :: resultDev
integer,value :: nCoords
integer :: i
real(real_kind) :: x, y
i = (blockidx%x-1) * blockdim%x + threadidx%x
if (i .le. nCoords) then
x = coordsDev(i,1)
y = coordsDev(i,2)
resultDev(i) = CubicInterp2DFunc(x,y)
endif
end subroutine CubicInterpVec2D_kernel
!------------------------------------------------------------------------------------------------------------------------
subroutine CubicInterpVec2D(coordsHost, resultHost)
!integer :: nCoords
real(real_kind), dimension(:,:) :: coordsHost
real(real_kind), dimension(:) :: resultHost
!integer :: nCoords
integer :: dimGrid, dimBlock, r, n, errCode, nCoords
real(real_kind), device, allocatable, dimension(:,:) :: coordsDev
real(real_kind), device, allocatable, dimension(:) :: resultDev
if(allocFlag==1) then
nCoords = size(resultHost)
if(size(coordsHost,1) .ne. nCoords) then
print *, 'Number of coordinates is not equal to the number of desired interpolated values!'
stop 'Program terminated by cubic_bspline_interp_2D_mod:CubicVec2D'
endif
!print *, 'Attempting to allocate device memory...'
allocate( coordsDev(nCoords, 2), resultDev(nCoords) )
!print *, 'Attempting to copy test points to device...'
coordsDev = coordsHost(1:nCoords, 1:2)
!print *, 'Attempting to call the kernel...'
dimBlock = min(64,nCoords)
dimGrid = max(1,nCoords/dimBlock+1)
call CubicInterpVec2D_kernel<<<dimGrid,dimBlock>>>(coordsDev,resultDev,nCoords)
errCode = cudaGetLastError()
if (errCode .gt. 0) then
print *, cudaGetErrorString(errCode)
stop 'Error! Kernel failed!'
endif
!print *, 'Attempting to copy results back to host...'
resultHost=resultDev(1:nCoords)
!print *, 'Deallocating device memory...'
deallocate(coordsDev,resultDev)
else
print *, 'Coefficient matrix not allocated on device yet!'
stop 'Program terminated by cubic_bspline_interp_2D_mod:CubicInterpVec2D'
endif
end subroutine CubicInterpVec2D
!------------------------------------------------------------------------------------------------------------------------
attributes(global) subroutine CubicInterpSingle2D_kernel(x,y,resultDev)
real(real_kind), value :: x,y
real(real_kind), device :: resultDev
resultDev = CubicInterp2DFunc(x,y)
end subroutine CubicInterpSingle2D_kernel
!------------------------------------------------------------------------------------------------------------------------
subroutine CubicInterpSingle2D(x,y,resultHost)
real(real_kind) :: x,y,resultHost
integer :: r, errCode
real(real_kind), allocatable, device :: resultDev
if(allocFlag==1) then
allocate(resultDev)
call CubicInterpSingle2D_kernel<<<1,1>>>(x,y,resultDev)
errCode = cudaGetLastError()
if (errCode .gt. 0) then
print *, cudaGetErrorString(errCode)
stop 'Error! Kernel failed!'
endif
resultHost=resultDev
deallocate(resultDev)
else
print *, 'Coefficient matrix not allocated on device yet!'
stop 'Program terminated by cubic_bspline_interp_2D_mod:CubicInterpSingle2D'
endif
end subroutine CubicInterpSingle2D
end module cubic_bspline_interp_2D_fast_mod
The only things I’ve changed from the earlier version are: InitialCausalCoefficientX(Y), InitialAntiCausalCoefficientX(Y), ConvertToInterpolationCoefficientsX(Y), and SamplesToCoefficients2DX(Y). Everything else is unchanged from the version of the code that works properly.