I’m relatively new to CUDA Fortran - been working with it for a couple of months - and I’m attempting to increase the performance of a CFD loop (below). My card is a Tesla C1060 with a theoretical peak of 1 TFLOP/S, but I can’t seem to run the CFD code any faster than about 18 GFLOP/S (timing the kernel call, not including the memory transfer). I can get matrix-multiplys to run at almost 200 GFLOP/S, and I’d like to get similar performance with this code.
Why am I not getting better performance? Does anyone have any suggestions?
I’m compiling with: pgfortran -O2 -o loop loop.f -Mcuda
Here is the original CFD loop (which you’ll find in the code below for error checking):
DO k=2,nz-2 ! scalar limits u(2) is the q's/forcing.
DO j=1,ny-1 ! scalar limits u(1) is the updated/previous u
DO i=3,nx-2
u(i,j,k,2)=-u(i,j,k,2)*rk_constant1(n)
: +tema4*((u(i,j,k,1)+u(i+2,j,k,1))*(u(i+2,j,k,1)-u(i,j,k,1))
: +(u(i,j,k,1)+u(i-2,j,k,1))*(u(i,j,k,1)-u(i-2,j,k,1)))
: -temb4*((u(i+1,j,k,1)+u(i,j,k,1))*(u(i+1,j,k,1)-u(i,j,k,1))
: + (u(i,j,k,1)+u(i-1,j,k,1))*(u(i,j,k,1)-u(i-1,j,k,1)))
: -temg4*(((((u(i+2,j,k,1)-ubar(i+2,j,k))-
: (u(i+1,j,k,1)-ubar(i+1,j,k)))-
: ((u(i+1,j,k,1)-ubar(i+1,j,k))-
: (u(i,j,k,1)-ubar(i,j,k))))-
: (((u(i+1,j,k,1)-ubar(i+1,j,k))-
: (u(i,j,k,1)-ubar(i,j,k)))-
: ((u(i,j,k,1)-ubar(i,j,k))-
: (u(i-1,j,k,1)-ubar(i-1,j,k)))))-
: ((((u(i+1,j,k,1)-ubar(i+1,j,k))-
: (u(i,j,k,1)-ubar(i,j,k)))-
: ((u(i,j,k,1)-ubar(i,j,k))-
: (u(i-1,j,k,1)-ubar(i-1,j,k))))-
: (((u(i,j,k,1)-ubar(i,j,k))-
: (u(i-1,j,k,1)-ubar(i-1,j,k)))-
: ((u(i-1,j,k,1)-ubar(i-1,j,k))-
: (u(i-2,j,k,1)-ubar(i-2,j,k))))))
END DO ! 52 calculations...
END DO
END DO
Here is my CUDA version of the code:
CONTENTS:
module blocksize- contains parameters
module modu- contains kernel and the function (first_routine) that calls it
Program LOOPTEST- main host routine
module blocksize
implicit none
save
integer, parameter :: NX_param = 32 ! can be changed to improve performance
integer, parameter :: NY_param = 256 ! can be changed to improve performance
integer, parameter :: NZ_param = 256 ! can be changed to improve performance
integer, parameter :: BSIZEx = 8 !must divide evenly into NY_param
integer, parameter :: BSIZEy = 8 !must divide evenly into NZ_param
end module
! start the module containing the kernel
module modu
use cudafor
use blocksize
contains
! _kernel
attributes(global) subroutine kernel( udev,ubardev,rk_const_1n
: ,rk_const_2n,nx,ny,nz,n4
: ,tema4,temb4,temg4 )
real*4,device :: udev(nx,ny,nz,n4)
real*4,constant :: ubardev(NX_param,NY_param,NZ_param)
integer, value :: i,j,k,nx,ny,nz,n4,Ty,Tx,idx
real*4, value :: rk_const_1n,rk_const_2n,tema4,temb4,temg4
real*4, shared :: sh_u(5,BSIZEx,BSIZEy,2)
real*4, shared :: sh_ubar(5,BSIZEx,BSIZEy)
k = (blockIdx%y-1)*blockdim%y+threadIdx%y
j = (blockIdx%x-1)*blockdim%x+threadIdx%x
if (k <= nz-2 .and. j <ny>= 2) then
Ty = threadIdx%y
Tx = threadIdx%x
! have each thread load its first 5 elements of u and ubar into shared mem
DO idx=1,4 ! do #5 at beginning of loop
sh_u(idx,Ty,Tx,1) = udev(idx,j,k,1)
sh_u(idx,Ty,Tx,2) = udev(idx,j,k,2)
sh_ubar(idx,Ty,Tx) = ubardev(idx,j,k)
END DO
DO i=3,nx-2
! load element 5 into shared
sh_u(5,Ty,Tx,1) = udev(i+2,j,k,1)
sh_u(5,Ty,Tx,2) = udev(i+2,j,k,2)
sh_ubar(5,Ty,Tx) = ubardev(i+2,j,k)
sh_u(3,Ty,Tx,2)=-sh_u(3,Ty,Tx,2)*rk_const_1n
: +tema4*( (sh_u(3,Ty,Tx,1)+sh_u(5,Ty,Tx,1))
: *(sh_u(5,Ty,Tx,1)-sh_u(3,Ty,Tx,1))
: +(sh_u(3,Ty,Tx,1)+sh_u(1,Ty,Tx,1))
: *(sh_u(3,Ty,Tx,1)-sh_u(1,Ty,Tx,1)) )
: -temb4*( (sh_u(4,Ty,Tx,1)+sh_u(3,Ty,Tx,1))
: *(sh_u(4,Ty,Tx,1)-sh_u(3,Ty,Tx,1))
: +(sh_u(3,Ty,Tx,1)+sh_u(2,Ty,Tx,1))
: *(sh_u(3,Ty,Tx,1)-sh_u(2,Ty,Tx,1)) )
: -temg4*((((( sh_u(5,Ty,Tx,1)-sh_ubar(5,Ty,Tx))-
: (sh_u(4,Ty,Tx,1)-sh_ubar(4,Ty,Tx)))-
: ((sh_u(4,Ty,Tx,1)-sh_ubar(4,Ty,Tx))-
: (sh_u(3,Ty,Tx,1)-sh_ubar(3,Ty,Tx))))-
: (((sh_u(4,Ty,Tx,1)-sh_ubar(4,Ty,Tx))-
: (sh_u(3,Ty,Tx,1)-sh_ubar(3,Ty,Tx)))-
: ((sh_u(3,Ty,Tx,1)-sh_ubar(3,Ty,Tx))-
: (sh_u(2,Ty,Tx,1)-sh_ubar(2,Ty,Tx)))))-
: (((( sh_u(4,Ty,Tx,1)-sh_ubar(4,Ty,Tx))-
: (sh_u(3,Ty,Tx,1)-sh_ubar(3,Ty,Tx)))-
: ((sh_u(3,Ty,Tx,1)-sh_ubar(3,Ty,Tx))-
: (sh_u(2,Ty,Tx,1)-sh_ubar(2,Ty,Tx))))-
: (((sh_u(3,Ty,Tx,1)-sh_ubar(3,Ty,Tx))-
: (sh_u(2,Ty,Tx,1)-sh_ubar(2,Ty,Tx)))-
: ((sh_u(2,Ty,Tx,1)-sh_ubar(2,Ty,Tx))-
: (sh_u(1,Ty,Tx,1)-sh_ubar(1,Ty,Tx) )))))
! now we're done with element i-2, copy it back to udev
udev(i-2,j,k,1) = sh_u(1,Ty,Tx,1)
udev(i-2,j,k,2) = sh_u(1,Ty,Tx,2)
! bump each level back one in line to make room for new 5th level (added at beginning of
loop)
sh_u(1,Ty,Tx,1) = sh_u(2,Ty,Tx,1)
sh_u(1,Ty,Tx,2) = sh_u(2,Ty,Tx,2)
sh_ubar(1,Ty,Tx) = sh_ubar(2,Ty,Tx)
sh_u(2,Ty,Tx,1) = sh_u(3,Ty,Tx,1)
sh_u(2,Ty,Tx,2) = sh_u(3,Ty,Tx,2)
sh_ubar(2,Ty,Tx) = sh_ubar(3,Ty,Tx)
sh_u(3,Ty,Tx,1) = sh_u(4,Ty,Tx,1)
sh_u(3,Ty,Tx,2) = sh_u(4,Ty,Tx,2)
sh_ubar(3,Ty,Tx) = sh_ubar(4,Ty,Tx)
sh_u(4,Ty,Tx,1) = sh_u(5,Ty,Tx,1)
sh_u(4,Ty,Tx,2) = sh_u(5,Ty,Tx,2)
sh_ubar(4,Ty,Tx) = sh_ubar(5,Ty,Tx)
END DO ! 52 calculations...
! copy last two terms out to global memory
udev(nx-3,j,k,1) = sh_u(1,Ty,Tx,1)
udev(nx-3,j,k,2) = sh_u(1,Ty,Tx,2)
udev(nx-2,j,k,1) = sh_u(2,Ty,Tx,1)
udev(nx-2,j,k,2) = sh_u(2,Ty,Tx,2)
endif
end subroutine kernel
! host routine that calls the kernel
subroutine first_routine(u,ubar,rk_const_1n,rk_const_2n
: ,nx,ny,nz,tema4,temb4,temg4)
real, dimension (:,:,:,:) :: u
real, dimension (:,:,:) :: ubar
real rk_const_1n,rk_const_2n, tema4,temb4,temg4
integer :: nx,ny,nz,n4
! allocatable device arrays
real,device,allocatable,dimension(:,:,:,:) :: udev
real,device,allocatable,dimension(:,:,:) :: ubardev
type(dim3) :: dimGrid, dimBlock
integer :: r
real ctimeall,ctimekernel,flops,mflopskernel,mflopsall
integer c1, c2, c3, c4
! Begin execution, first determine the sizes of the input arrays
nx = size( u, 1 )
ny = size( u, 2 )
nz = size( u, 3 )
n4 = size( u, 4 )
! start data xfer-inclusive timer and allocate the
! device arrays
call system_clock( count=c1 )
allocate( udev(nx,ny,nz,2), ubardev(nx,ny,nz))
! Copy arrays to the device
udev = u(1:nx,1:ny,1:nz,1:2)
ubardev = ubar(1:nx,1:ny,1:nz)
! grid and block dimensions
dimGrid = dim3(ny/BSIZEx,nz/BSIZEy,1)
dimBlock = dim3(BSIZEx,BSIZEy,1)
! Start the data xfer-exclusive timer, launch the GPU kernel,
! wait for completion
call system_clock( count=c2 )
call kernel<<<dimGrid>>>(udev,ubardev
: ,rk_const_1n,rk_const_2n
: ,nx,ny,nz,n4
: ,tema4,temb4,temg4)
r = cudathreadsynchronize()
! Stop data xfer-exclusive timer, copy the results back,
! stop data xfer-inclusive timer
call system_clock( count=c3 )
u(1:nx,1:ny,1:nz,1:2) = udev
call system_clock( count=c4 )
! Calculate inclusive/exclusive execution times, and report MFLOPS
flops = (nz-3)*(ny-1)*(nx-4)*52
ctimekernel = c3 - c2
mflopskernel = flops / ctimekernel
ctimeall = c4 - c1
mflopsall = flops / ctimeall
! Print out results
print *, 'Kernel time excluding data xfer:', ctimekernel,
: ' microseconds'
print *, 'Megaflops excluding data xfer: ', mflopskernel
print *, 'Total time including data xfer: ', ctimeall,
: ' microseconds'
print *, 'Megaflops including data xfer: ', mflopsall
! Deallocate device arrays and exit
deallocate( udev, ubardev )
end subroutine first_routine
end module modu
c##################################################################
c##################################################################
c ###### ######
c ###### PROGRAM LOOPTEST ######
c ###### ######
c##################################################################
c##################################################################
c
Program LOOPTEST
use modu
c######################################################################
c
c PURPOSE:
c
c Test loop computations for optimization
c
c######################################################################
c
c AUTHOR: DW
c 2/23/2010
c
c######################################################################
c
implicit none ! Force explicit declarations
c include 'time.inc'
integer nx ! Number of grid points in the x-direction.
integer ny ! Number of grid points in the y-direction.
integer nz ! Number of grid points in the z-direction.
real, allocatable:: u(:,:,:,:) ! total u-velocity (m/s)
real, allocatable:: w(:,:,:,:) ! total u-velocity (m/s)
real, allocatable:: ubar(:,:,:) ! total u-velocity (m/s)
real, allocatable:: testu(:,:,:,:) ! testing to make sure kernel works
c######################################################################
c
c Miscellaneous local variables:
c scalars, constants etc.
c
c######################################################################
integer i,j,k,ubgn,uend,jbgn,jend,vbgn,vend,kbgn,kend,ibgn,iend
integer nxa,nya,nza,m,n,wbgn,wend,rkorder
real rk_constant1(3),rk_constant2(3),dx,dy,dz,dxinv,dyinv,dzinv
real kx4,ky4,kv4,tema4,temb4,temg4
real tema,temb,temc,temd,teme,temf,temg,dt,temrk
c timing variables
double precision :: ttime,incrementtime,addtime,temh,cudatime
c start of papi variable definition
REAL :: finaltime,amax,amin,bmax,bmin,umax,umin,wmax,wmin
c end of papi variable definition
C TESTING variables
integer correct,incorrect,tm
real diff, denom, error
c######################################################################
c
c begin executable code:
c
c######################################################################
c read input file parameters/variables
c open (5,file='loop.input',form='formatted')
c read (5,*) nx
c read (5,*) ny
c read (5,*) nz
nx = NX_param;
ny = NY_param;
nz = NZ_param;
c allocate variable memory
allocate (u(nx,ny,nz,3))
allocate (w(nx,ny,nz,3))
allocate (ubar(nx,ny,nz))
allocate (testu(nx,ny,nz,3))
print *,'grid size=',nx,ny,nz
print *,'total data size=',nx*ny*nz*7*4
c set more constants associated with the time differencing scheme
c TODO don't need all of these...clean up the code!
rkorder = 3
rk_constant1(1) = 0.0
rk_constant1(2) = 5./9.
rk_constant1(3) = 153./128.
rk_constant2(1) = 1.0/3.0
rk_constant2(2) = 15./16.
rk_constant2(3) = 8./15.
dt = 1.0
temrk = dt
dx = 1000.0
dy = 1000.0
dz = 1000.0
dxinv = 1.0/dx
dyinv = 1.0/dy
dzinv = 1.0/dz
tema4= temrk*dxinv/24.0
temb4= temrk*dxinv/3.0
temg4 = temrk*0.06
tema = 0.25*dxinv*temrk
temb = 0.25*dyinv*temrk
c initialize the variables
DO k= 1,nz
DO j= 1,ny
DO i= 1,nx
temc = 3.0*(sin(2.0*3.141592*dx*(i-1)/(nx*dx))
: +sin(2.0*3.141592*dy*(j-1)/(ny*dy))
: +sin(2.0*3.141592*dz*(k-1)/(nz*dz)))
u(i,j,k,2) = temc
w(i,j,k,2) = -temc
u(i,j,k,1) = u(i,j,k,2)
w(i,j,k,1) = w(i,j,k,2)
u(i,j,k,3) = u(i,j,k,2)
w(i,j,k,3) = w(i,j,k,2)
ubar(i,j,k) = 2.0
testu(i,j,k,2) = temc
testu(i,j,k,1) = u(i,j,k,2)
testu(i,j,k,3) = u(i,j,k,2)
END DO
END DO
END DO
c check initial values
amax = -99999.0
amin = 99999.0
bmax = -99999.0
bmin = 99999.0
do n = 1,3
do k=1,nz
do j=1,ny
do i=1,nx
amax = max(u(i,j,k,n),amax)
amin = min(u(i,j,k,n),amin)
bmax = max(w(i,j,k,n),bmax)
bmin = min(w(i,j,k,n),bmin)
end do
end do
end do
c print *,'umax= ',amax,'umin= ',amin,'wmax= ',bmax
end do
m = cudaSetDevice(0)
c start the main loop 5 time steps
do m=1,5
c rk 3 loop
do n=1,3
call first_routine(u,ubar,rk_constant1(n),rk_constant2(n)
: ,nx,ny,nz
: ,tema4,temb4,temg4)
DO k=2,nz-2 ! scalar limits u(2) is the q's/forcing.
DO j=1,ny-1 ! scalar limits u(1) is the updated/previous u
DO i=3,nx-2
testu(i,j,k,2)=-testu(i,j,k,2)*rk_constant1(n)
: +tema4*((testu(i,j,k,1)+testu(i+2,j,k,1))
: *(testu(i+2,j,k,1)-testu(i,j,k,1))
: +(testu(i,j,k,1)+testu(i-2,j,k,1))
: *(testu(i,j,k,1)-testu(i-2,j,k,1)))
: -temb4*((testu(i+1,j,k,1)+testu(i,j,k,1))
: *(testu(i+1,j,k,1)-testu(i,j,k,1))
: + (testu(i,j,k,1)+testu(i-1,j,k,1))
: *(testu(i,j,k,1)-testu(i-1,j,k,1)))
: -temg4*(((((testu(i+2,j,k,1)-ubar(i+2,j,k))-
: (testu(i+1,j,k,1)-ubar(i+1,j,k)))-
: ((testu(i+1,j,k,1)-ubar(i+1,j,k))-
: (testu(i,j,k,1)-ubar(i,j,k))))-
: (((testu(i+1,j,k,1)-ubar(i+1,j,k))-
: (testu(i,j,k,1)-ubar(i,j,k)))-
: ((testu(i,j,k,1)-ubar(i,j,k))-
: (testu(i-1,j,k,1)-ubar(i-1,j,k)))))-
: ((((testu(i+1,j,k,1)-ubar(i+1,j,k))-
: (testu(i,j,k,1)-ubar(i,j,k)))-
: ((testu(i,j,k,1)-ubar(i,j,k))-
: (testu(i-1,j,k,1)-ubar(i-1,j,k))))-
: (((testu(i,j,k,1)-ubar(i,j,k))-
: (testu(i-1,j,k,1)-ubar(i-1,j,k)))-
: ((testu(i-1,j,k,1)-ubar(i-1,j,k))-
: (testu(i-2,j,k,1)-ubar(i-2,j,k))))))
END DO ! 52 calculations...
END DO
END DO
! endif
print *,'-------------------------------------------------'
! TEST ############################################################
correct = 0
incorrect = 0
DO i=1,nx
DO j=1,ny
DO k=1,nz
DO tm=1,3
if (k<=nz-2 .and. j<ny>=2
: .and. i>=3 .and. i<nx> 1.0e-6) then
c print *,'ERR:',u(i,j,k,tm),testu(i,j,k,tm)
c : ,i,j,k,tm
incorrect = incorrect + 1
else
C print *,u(i,j,k,tm),testu(i,j,k,tm)
correct = correct + 1
endif
endif
END DO
END DO
END DO
END DO
print *,'KERNEL TEST: Correct:',correct,'Incorrect:',incorrect
! END TEST ######################################################
DO k=2,nz-2 ! complete the u computations
DO j=1,ny-1
DO i=3,nx-2
testu(i,j,k,1)= testu(i,j,k,1)
: +testu(i,j,k,2)*rk_constant2(n)
END DO
END DO
END DO
print *,'-------------------------------------------------'
DO k=2,nz-2 ! complete the u computations
DO j=1,ny-1
DO i=3,nx-2
u(i,j,k,1)= u(i,j,k,1)
: +u(i,j,k,2)*rk_constant2(n)
END DO
END DO
END DO
print *,'-------------------------------------------------'
DO k=3,nz-2 ! limits 3,nz-2
DO j=1,ny-1 ! scalar limits 2,ny-2
DO i=1,nx-1 ! scalar limits 2,nx-2
w(i,j,k,2)=-w(i,j,k,2)*rk_constant1(n)
c vert adv fourth order
: +tema4*((w(i,j,k,1)+w(i,j,k+2,1))*(w(i,j,k+2,1)-w(i,j,k,1))
: +(w(i,j,k-2,1)+w(i,j,k,1))*(w(i,j,k,1)-w(i,j,k-2,1)))
: -temb4*((w(i,j,k-1,1)+w(i,j,k,1))*(w(i,j,k,1)-w(i,j,k-1,1))
: +(w(i,j,k+1,1)+w(i,j,k,1))*(w(i,j,k+1,1)-w(i,j,k,1)))
c compute the fourth order cmix z terms.
: -temg4*((((w(i,j,k+2,1)-w(i,j,k+1,1))-
: (w(i,j,k+1,1)-w(i,j,k,1)))-
: ((w(i,j,k+1,1)-w(i,j,k,1))-
: (w(i,j,k,1)-w(i,j,k-1,1))))-
: (((w(i,j,k+1,1)-w(i,j,k,1))-
: (w(i,j,k,1)-w(i,j,k-1,1)))-
: ((w(i,j,k,1)-w(i,j,k-1,1))-
: (w(i,j,k-1,1)-w(i,j,k-2,1)))))
END DO ! 36 calculations...
END DO
END DO
print *,'-------------------------------------------------'
c check for bogus solution
amax = -99999.0
amin = 99999.0
bmax = -99999.0
bmin = 99999.0
do k=1,nz
do j=1,ny
do i=1,nx
amax = max(u(i,j,k,1),amax)
amin = min(u(i,j,k,1),amin)
bmax = max(w(i,j,k,2),bmax)
bmin = min(w(i,j,k,2),bmin)
end do
end do
end do
c print *,'umax= ',amax,'umin= ',amin,'wmax= ',bmax,
c : 'wmin= ',bmin
end do ! end of the rk3 loop
end do ! end of the main time loop
stop
end
c program end