CUDA implementation of CFD loop only 18 GFLOPS on TFLOP card

I’m relatively new to CUDA Fortran - been working with it for a couple of months - and I’m attempting to increase the performance of a CFD loop (below). My card is a Tesla C1060 with a theoretical peak of 1 TFLOP/S, but I can’t seem to run the CFD code any faster than about 18 GFLOP/S (timing the kernel call, not including the memory transfer). I can get matrix-multiplys to run at almost 200 GFLOP/S, and I’d like to get similar performance with this code.

Why am I not getting better performance? Does anyone have any suggestions?


I’m compiling with: pgfortran -O2 -o loop loop.f -Mcuda

Here is the original CFD loop (which you’ll find in the code below for error checking):

      DO k=2,nz-2  ! scalar limits  u(2) is the q's/forcing.
        DO j=1,ny-1 ! scalar limits u(1) is the updated/previous u
          DO i=3,nx-2
            u(i,j,k,2)=-u(i,j,k,2)*rk_constant1(n)

     :  +tema4*((u(i,j,k,1)+u(i+2,j,k,1))*(u(i+2,j,k,1)-u(i,j,k,1))
     :        +(u(i,j,k,1)+u(i-2,j,k,1))*(u(i,j,k,1)-u(i-2,j,k,1)))
     :  -temb4*((u(i+1,j,k,1)+u(i,j,k,1))*(u(i+1,j,k,1)-u(i,j,k,1))
     :       + (u(i,j,k,1)+u(i-1,j,k,1))*(u(i,j,k,1)-u(i-1,j,k,1)))
     :  -temg4*(((((u(i+2,j,k,1)-ubar(i+2,j,k))-
     :             (u(i+1,j,k,1)-ubar(i+1,j,k)))-
     :            ((u(i+1,j,k,1)-ubar(i+1,j,k))-
     :             (u(i,j,k,1)-ubar(i,j,k))))-
     :           (((u(i+1,j,k,1)-ubar(i+1,j,k))-
     :             (u(i,j,k,1)-ubar(i,j,k)))-
     :            ((u(i,j,k,1)-ubar(i,j,k))-
     :             (u(i-1,j,k,1)-ubar(i-1,j,k)))))-
     :          ((((u(i+1,j,k,1)-ubar(i+1,j,k))-
     :             (u(i,j,k,1)-ubar(i,j,k)))-
     :            ((u(i,j,k,1)-ubar(i,j,k))-
     :             (u(i-1,j,k,1)-ubar(i-1,j,k))))-
     :           (((u(i,j,k,1)-ubar(i,j,k))-
     :             (u(i-1,j,k,1)-ubar(i-1,j,k)))-
     :            ((u(i-1,j,k,1)-ubar(i-1,j,k))-
     :             (u(i-2,j,k,1)-ubar(i-2,j,k))))))
          END DO  !  52 calculations...
        END DO
      END DO

Here is my CUDA version of the code:

CONTENTS:
module blocksize- contains parameters
module modu- contains kernel and the function (first_routine) that calls it
Program LOOPTEST- main host routine

module blocksize
         implicit none
         save
         integer, parameter :: NX_param = 32    ! can be changed to improve performance
         integer, parameter :: NY_param = 256   ! can be changed to improve performance
         integer, parameter :: NZ_param = 256   ! can be changed to improve performance
         integer, parameter :: BSIZEx = 8  !must divide evenly into NY_param
         integer, parameter :: BSIZEy = 8  !must divide evenly into NZ_param
       end module




       ! start the module containing the kernel

       module modu
         use cudafor
         use blocksize
         contains

         ! _kernel 

         attributes(global) subroutine kernel( udev,ubardev,rk_const_1n
     :                                         ,rk_const_2n,nx,ny,nz,n4
     :                                         ,tema4,temb4,temg4 )
           real*4,device :: udev(nx,ny,nz,n4)
           real*4,constant :: ubardev(NX_param,NY_param,NZ_param)
           integer, value :: i,j,k,nx,ny,nz,n4,Ty,Tx,idx
           real*4, value :: rk_const_1n,rk_const_2n,tema4,temb4,temg4


           real*4, shared :: sh_u(5,BSIZEx,BSIZEy,2)
           real*4, shared :: sh_ubar(5,BSIZEx,BSIZEy)



           k = (blockIdx%y-1)*blockdim%y+threadIdx%y
           j = (blockIdx%x-1)*blockdim%x+threadIdx%x
          

           if (k <= nz-2 .and. j <ny>= 2) then

          
              Ty = threadIdx%y
              Tx = threadIdx%x

              ! have each thread load its first 5 elements of u and ubar into shared mem
              DO idx=1,4  ! do #5 at beginning of loop

                 sh_u(idx,Ty,Tx,1) = udev(idx,j,k,1)
                 sh_u(idx,Ty,Tx,2) = udev(idx,j,k,2)
                 sh_ubar(idx,Ty,Tx) = ubardev(idx,j,k)

              END DO


              DO i=3,nx-2


                ! load element 5 into shared
                sh_u(5,Ty,Tx,1) = udev(i+2,j,k,1)
                sh_u(5,Ty,Tx,2) = udev(i+2,j,k,2)
                sh_ubar(5,Ty,Tx) = ubardev(i+2,j,k)



                sh_u(3,Ty,Tx,2)=-sh_u(3,Ty,Tx,2)*rk_const_1n

     :         +tema4*( (sh_u(3,Ty,Tx,1)+sh_u(5,Ty,Tx,1))
     :                 *(sh_u(5,Ty,Tx,1)-sh_u(3,Ty,Tx,1))
     :                 +(sh_u(3,Ty,Tx,1)+sh_u(1,Ty,Tx,1))
     :                 *(sh_u(3,Ty,Tx,1)-sh_u(1,Ty,Tx,1)) )


     :         -temb4*( (sh_u(4,Ty,Tx,1)+sh_u(3,Ty,Tx,1))
     :                 *(sh_u(4,Ty,Tx,1)-sh_u(3,Ty,Tx,1))
     :                 +(sh_u(3,Ty,Tx,1)+sh_u(2,Ty,Tx,1))
     :                 *(sh_u(3,Ty,Tx,1)-sh_u(2,Ty,Tx,1)) )


     :    -temg4*((((( sh_u(5,Ty,Tx,1)-sh_ubar(5,Ty,Tx))-
     :                (sh_u(4,Ty,Tx,1)-sh_ubar(4,Ty,Tx)))-
     :               ((sh_u(4,Ty,Tx,1)-sh_ubar(4,Ty,Tx))-
     :                (sh_u(3,Ty,Tx,1)-sh_ubar(3,Ty,Tx))))-
     :              (((sh_u(4,Ty,Tx,1)-sh_ubar(4,Ty,Tx))-
     :                (sh_u(3,Ty,Tx,1)-sh_ubar(3,Ty,Tx)))-
     :               ((sh_u(3,Ty,Tx,1)-sh_ubar(3,Ty,Tx))-
     :                (sh_u(2,Ty,Tx,1)-sh_ubar(2,Ty,Tx)))))-

     :           (((( sh_u(4,Ty,Tx,1)-sh_ubar(4,Ty,Tx))-
     :               (sh_u(3,Ty,Tx,1)-sh_ubar(3,Ty,Tx)))-
     :              ((sh_u(3,Ty,Tx,1)-sh_ubar(3,Ty,Tx))-
     :               (sh_u(2,Ty,Tx,1)-sh_ubar(2,Ty,Tx))))-
     :             (((sh_u(3,Ty,Tx,1)-sh_ubar(3,Ty,Tx))-
     :               (sh_u(2,Ty,Tx,1)-sh_ubar(2,Ty,Tx)))-
     :              ((sh_u(2,Ty,Tx,1)-sh_ubar(2,Ty,Tx))-
     :               (sh_u(1,Ty,Tx,1)-sh_ubar(1,Ty,Tx) )))))


                ! now we're done with element i-2, copy it back to udev
                udev(i-2,j,k,1) = sh_u(1,Ty,Tx,1)
                udev(i-2,j,k,2) = sh_u(1,Ty,Tx,2)


                ! bump each level back one in line to make room for new 5th level (added at beginning of 

loop)
                sh_u(1,Ty,Tx,1) = sh_u(2,Ty,Tx,1)
                sh_u(1,Ty,Tx,2) = sh_u(2,Ty,Tx,2)
                sh_ubar(1,Ty,Tx) = sh_ubar(2,Ty,Tx)

                sh_u(2,Ty,Tx,1) = sh_u(3,Ty,Tx,1)
                sh_u(2,Ty,Tx,2) = sh_u(3,Ty,Tx,2)
                sh_ubar(2,Ty,Tx) = sh_ubar(3,Ty,Tx)

                sh_u(3,Ty,Tx,1) = sh_u(4,Ty,Tx,1)
                sh_u(3,Ty,Tx,2) = sh_u(4,Ty,Tx,2)
                sh_ubar(3,Ty,Tx) = sh_ubar(4,Ty,Tx)

                sh_u(4,Ty,Tx,1) = sh_u(5,Ty,Tx,1)
                sh_u(4,Ty,Tx,2) = sh_u(5,Ty,Tx,2)
                sh_ubar(4,Ty,Tx) = sh_ubar(5,Ty,Tx)

              END DO  !  52 calculations...


              ! copy last two terms out to global memory
              udev(nx-3,j,k,1) = sh_u(1,Ty,Tx,1)
              udev(nx-3,j,k,2) = sh_u(1,Ty,Tx,2)

              udev(nx-2,j,k,1) = sh_u(2,Ty,Tx,1)
              udev(nx-2,j,k,2) = sh_u(2,Ty,Tx,2)


           endif

         end subroutine kernel






         ! host routine that calls the kernel

         subroutine first_routine(u,ubar,rk_const_1n,rk_const_2n
     :                           ,nx,ny,nz,tema4,temb4,temg4)

               real, dimension (:,:,:,:) :: u
               real, dimension (:,:,:) :: ubar
               real rk_const_1n,rk_const_2n, tema4,temb4,temg4
               integer :: nx,ny,nz,n4

               ! allocatable device arrays
               real,device,allocatable,dimension(:,:,:,:) :: udev
               real,device,allocatable,dimension(:,:,:) :: ubardev

               type(dim3) :: dimGrid, dimBlock
               integer :: r 

               real ctimeall,ctimekernel,flops,mflopskernel,mflopsall
               integer c1, c2, c3, c4

               ! Begin execution, first determine the sizes of the input arrays
               nx = size( u, 1 )
               ny = size( u, 2 )
               nz = size( u, 3 )
               n4 = size( u, 4 )

               ! start data xfer-inclusive timer and allocate the
               ! device arrays
               call system_clock( count=c1 )
               allocate( udev(nx,ny,nz,2), ubardev(nx,ny,nz))

               ! Copy arrays to the device
               udev = u(1:nx,1:ny,1:nz,1:2)
               ubardev = ubar(1:nx,1:ny,1:nz)

               ! grid and block dimensions
               dimGrid = dim3(ny/BSIZEx,nz/BSIZEy,1)  
               dimBlock = dim3(BSIZEx,BSIZEy,1)

               ! Start the data xfer-exclusive timer, launch the GPU kernel,
               ! wait for completion
               call system_clock( count=c2 )
               call kernel<<<dimGrid>>>(udev,ubardev
     :                                       ,rk_const_1n,rk_const_2n
     :                                       ,nx,ny,nz,n4
     :                                       ,tema4,temb4,temg4)
               r = cudathreadsynchronize()


               ! Stop data xfer-exclusive timer, copy the results back,
               ! stop data xfer-inclusive timer
               call system_clock( count=c3 )
               u(1:nx,1:ny,1:nz,1:2) = udev
               call system_clock( count=c4 )

               ! Calculate inclusive/exclusive execution times, and report MFLOPS
               flops = (nz-3)*(ny-1)*(nx-4)*52
               ctimekernel = c3 - c2
               mflopskernel = flops / ctimekernel
               ctimeall = c4 - c1
               mflopsall = flops / ctimeall

               ! Print out results
               print *, 'Kernel time excluding data xfer:', ctimekernel,
     :                  ' microseconds'
               print *, 'Megaflops excluding data xfer: ', mflopskernel
               print *, 'Total time including data xfer: ', ctimeall,
     :                  ' microseconds'
               print *, 'Megaflops including data xfer: ', mflopsall

               ! Deallocate device arrays and exit
               deallocate( udev, ubardev )

           end subroutine first_routine
       end module modu









c##################################################################
c##################################################################
c     ######                                                      ######
c     ######                 PROGRAM LOOPTEST                     ######
c     ######                                                      ######
c##################################################################
c##################################################################
c

      Program LOOPTEST
      use modu

c######################################################################
c
c     PURPOSE:
c
c     Test loop computations for optimization 
c
c######################################################################
c
c     AUTHOR: DW
c     2/23/2010
c
c######################################################################

c
      implicit none        ! Force explicit declarations
c      include 'time.inc'

      integer nx        ! Number of grid points in the x-direction.
      integer ny        ! Number of grid points in the y-direction.
      integer nz        ! Number of grid points in the z-direction.

      real, allocatable:: u(:,:,:,:) ! total u-velocity (m/s)
      real, allocatable:: w(:,:,:,:) ! total u-velocity (m/s)
      real, allocatable:: ubar(:,:,:) ! total u-velocity (m/s)
      real, allocatable:: testu(:,:,:,:) ! testing to make sure kernel works
c######################################################################
c
c     Miscellaneous local variables:
c     scalars, constants etc.
c
c######################################################################
      integer i,j,k,ubgn,uend,jbgn,jend,vbgn,vend,kbgn,kend,ibgn,iend
      integer nxa,nya,nza,m,n,wbgn,wend,rkorder
      real rk_constant1(3),rk_constant2(3),dx,dy,dz,dxinv,dyinv,dzinv
      real kx4,ky4,kv4,tema4,temb4,temg4
      real tema,temb,temc,temd,teme,temf,temg,dt,temrk

c     timing variables
      double precision :: ttime,incrementtime,addtime,temh,cudatime

c     start of papi variable definition
      REAL :: finaltime,amax,amin,bmax,bmin,umax,umin,wmax,wmin
c     end of papi variable definition

C     TESTING variables
      integer correct,incorrect,tm
      real diff, denom, error

c######################################################################
c
c     begin executable code:
c
c######################################################################


c     read input file parameters/variables
c      open (5,file='loop.input',form='formatted')
c      read (5,*) nx
c      read (5,*) ny
c      read (5,*) nz

      nx = NX_param;
      ny = NY_param;
      nz = NZ_param;


c     allocate variable memory
      allocate (u(nx,ny,nz,3))
      allocate (w(nx,ny,nz,3))
      allocate (ubar(nx,ny,nz))
      allocate (testu(nx,ny,nz,3))

      print *,'grid size=',nx,ny,nz
      print *,'total data size=',nx*ny*nz*7*4                       
c     set more constants associated with the time differencing scheme
c     TODO don't need all of these...clean up the code!
      rkorder = 3
      rk_constant1(1) = 0.0
      rk_constant1(2) = 5./9. 
      rk_constant1(3) = 153./128.
      rk_constant2(1) = 1.0/3.0
      rk_constant2(2) = 15./16.
      rk_constant2(3) = 8./15.
      dt = 1.0
      temrk = dt
      dx = 1000.0
      dy = 1000.0
      dz = 1000.0
      dxinv = 1.0/dx
      dyinv = 1.0/dy
      dzinv = 1.0/dz

      tema4= temrk*dxinv/24.0
      temb4= temrk*dxinv/3.0
      temg4 = temrk*0.06


      tema = 0.25*dxinv*temrk
      temb = 0.25*dyinv*temrk

c     initialize the variables
      DO k= 1,nz
        DO j= 1,ny
          DO i= 1,nx
            temc = 3.0*(sin(2.0*3.141592*dx*(i-1)/(nx*dx))
     :                 +sin(2.0*3.141592*dy*(j-1)/(ny*dy))
     :                 +sin(2.0*3.141592*dz*(k-1)/(nz*dz)))
            u(i,j,k,2) = temc
            w(i,j,k,2) = -temc
            u(i,j,k,1) = u(i,j,k,2)
            w(i,j,k,1) = w(i,j,k,2)
            u(i,j,k,3) = u(i,j,k,2)
            w(i,j,k,3) = w(i,j,k,2)
            ubar(i,j,k) = 2.0
            testu(i,j,k,2) = temc
            testu(i,j,k,1) = u(i,j,k,2)
            testu(i,j,k,3) = u(i,j,k,2)
          END DO
        END DO
      END DO

c     check initial values
      amax = -99999.0
      amin =  99999.0
      bmax = -99999.0
      bmin =  99999.0
      do n = 1,3
      do k=1,nz
        do j=1,ny
          do i=1,nx
            amax = max(u(i,j,k,n),amax)
            amin = min(u(i,j,k,n),amin)
            bmax = max(w(i,j,k,n),bmax)
            bmin = min(w(i,j,k,n),bmin)
          end do
        end do
      end do
c      print *,'umax= ',amax,'umin= ',amin,'wmax= ',bmax
      end do

      m = cudaSetDevice(0)

c     start the main loop  5 time steps
      do m=1,5
c     rk 3 loop 
      do n=1,3



      call first_routine(u,ubar,rk_constant1(n),rk_constant2(n)
     :                   ,nx,ny,nz
     :                   ,tema4,temb4,temg4)



      DO k=2,nz-2  ! scalar limits  u(2) is the q's/forcing.
        DO j=1,ny-1 ! scalar limits u(1) is the updated/previous u
          DO i=3,nx-2

            testu(i,j,k,2)=-testu(i,j,k,2)*rk_constant1(n)

     :  +tema4*((testu(i,j,k,1)+testu(i+2,j,k,1))
     :        *(testu(i+2,j,k,1)-testu(i,j,k,1))
     :        +(testu(i,j,k,1)+testu(i-2,j,k,1))
     :        *(testu(i,j,k,1)-testu(i-2,j,k,1)))
     :  -temb4*((testu(i+1,j,k,1)+testu(i,j,k,1))
     :        *(testu(i+1,j,k,1)-testu(i,j,k,1))
     :       + (testu(i,j,k,1)+testu(i-1,j,k,1))
     :        *(testu(i,j,k,1)-testu(i-1,j,k,1)))
     :  -temg4*(((((testu(i+2,j,k,1)-ubar(i+2,j,k))-
     :             (testu(i+1,j,k,1)-ubar(i+1,j,k)))-
     :            ((testu(i+1,j,k,1)-ubar(i+1,j,k))-
     :             (testu(i,j,k,1)-ubar(i,j,k))))-
     :           (((testu(i+1,j,k,1)-ubar(i+1,j,k))-
     :             (testu(i,j,k,1)-ubar(i,j,k)))-
     :            ((testu(i,j,k,1)-ubar(i,j,k))-
     :             (testu(i-1,j,k,1)-ubar(i-1,j,k)))))-
     :          ((((testu(i+1,j,k,1)-ubar(i+1,j,k))-
     :             (testu(i,j,k,1)-ubar(i,j,k)))-
     :            ((testu(i,j,k,1)-ubar(i,j,k))-
     :             (testu(i-1,j,k,1)-ubar(i-1,j,k))))-
     :           (((testu(i,j,k,1)-ubar(i,j,k))-
     :             (testu(i-1,j,k,1)-ubar(i-1,j,k)))-
     :            ((testu(i-1,j,k,1)-ubar(i-1,j,k))-
     :             (testu(i-2,j,k,1)-ubar(i-2,j,k))))))
          END DO  !  52 calculations...
        END DO
      END DO
!      endif

      print *,'-------------------------------------------------'



!     TEST ############################################################

      correct = 0
      incorrect = 0
      DO i=1,nx
        DO j=1,ny
          DO k=1,nz
            DO tm=1,3

               if (k<=nz-2 .and. j<ny>=2
     :                     .and. i>=3 .and. i<nx> 1.0e-6) then
c                print *,'ERR:',u(i,j,k,tm),testu(i,j,k,tm)
c     :                 ,i,j,k,tm
                incorrect = incorrect + 1
              else
C                print *,u(i,j,k,tm),testu(i,j,k,tm)
                correct = correct + 1
              endif

           endif

            END DO
          END DO
        END DO
      END DO
      print *,'KERNEL TEST: Correct:',correct,'Incorrect:',incorrect


!     END TEST  ######################################################



      DO k=2,nz-2   ! complete the u computations
        DO j=1,ny-1
          DO i=3,nx-2
            testu(i,j,k,1)= testu(i,j,k,1)
     :                     +testu(i,j,k,2)*rk_constant2(n)
          END DO
        END DO
      END DO




      print *,'-------------------------------------------------'


      DO k=2,nz-2   ! complete the u computations
        DO j=1,ny-1
          DO i=3,nx-2
            u(i,j,k,1)= u(i,j,k,1)
     :                     +u(i,j,k,2)*rk_constant2(n)
          END DO
        END DO
      END DO


      print *,'-------------------------------------------------'


      DO k=3,nz-2   ! limits 3,nz-2
        DO j=1,ny-1  ! scalar limits 2,ny-2
          DO i=1,nx-1 ! scalar limits 2,nx-2
            w(i,j,k,2)=-w(i,j,k,2)*rk_constant1(n)
c  vert adv fourth order
     : +tema4*((w(i,j,k,1)+w(i,j,k+2,1))*(w(i,j,k+2,1)-w(i,j,k,1))
     :       +(w(i,j,k-2,1)+w(i,j,k,1))*(w(i,j,k,1)-w(i,j,k-2,1)))
     : -temb4*((w(i,j,k-1,1)+w(i,j,k,1))*(w(i,j,k,1)-w(i,j,k-1,1))
     :       +(w(i,j,k+1,1)+w(i,j,k,1))*(w(i,j,k+1,1)-w(i,j,k,1)))
c compute the fourth order cmix z terms.
     :  -temg4*((((w(i,j,k+2,1)-w(i,j,k+1,1))-
     :            (w(i,j,k+1,1)-w(i,j,k,1)))-
     :           ((w(i,j,k+1,1)-w(i,j,k,1))-
     :            (w(i,j,k,1)-w(i,j,k-1,1))))-
     :          (((w(i,j,k+1,1)-w(i,j,k,1))-
     :            (w(i,j,k,1)-w(i,j,k-1,1)))-
     :           ((w(i,j,k,1)-w(i,j,k-1,1))-
     :            (w(i,j,k-1,1)-w(i,j,k-2,1)))))
          END DO   ! 36 calculations...
        END DO
      END DO


      print *,'-------------------------------------------------'

c     check for bogus solution
      amax = -99999.0
      amin =  99999.0
      bmax = -99999.0
      bmin =  99999.0
      do k=1,nz
        do j=1,ny
          do i=1,nx
            amax = max(u(i,j,k,1),amax)
            amin = min(u(i,j,k,1),amin)
            bmax = max(w(i,j,k,2),bmax)
            bmin = min(w(i,j,k,2),bmin)
          end do
        end do
      end do

c      print *,'umax= ',amax,'umin= ',amin,'wmax= ',bmax,
c     :        'wmin= ',bmin

      end do  !  end of the rk3 loop
      end do  !  end of the main time loop

      stop
      end
c     program end

Hi JDS7,

What I would try is to make your blocks three dimensional, with the “i” corresponding to the block’s X dimension, “j” be the Y dimension, and “k”, be the Z dimension. This should help decrease the number of operations per kernel and increase your parallelization.

  • Mat

Thanks Mat, I can now get the kernel execution (minus memory transfer) up to about 48 GFLOP/s when the block size is 32x1x16 and nx, ny, nz are 608, 584, and 16 (I tried thousands of different parameter and block size combinations, this one won). I really need it to be closer to 400 GFLOP/s, is there something else I can do to speed it up? This is my first cuda program with 3-dimensional blocks, so I’m sure it isn’t perfect.

Also, can you explain why exactly the version with 3d blocks runs 2.5x faster? I get the best performance when one of those dimensions is 1, so it seems like the only difference is that I’m using more threads with fewer calculations each. Why does this increase the speed, when the total number of calculations that need to be done is the same? I made sure to try running the old code (above) with enough threads/block to use all 240 cores (16x8 blocks, with a max of 8 blocks per MP makes 1024 threads, which should be the max number of threads that can be scheduled per MP for the Tesla 1060).


Here’s the new code:

module blocksize
         implicit none
         save
         integer, parameter :: NX_param = 608
         integer, parameter :: NY_param = 584
         integer, parameter :: NZ_param = 16
         integer, parameter :: BSIZEx = 32  !must divide evenly into NX_param
         integer, parameter :: BSIZEy = 1  !must divide evenly into NY_param
         integer, parameter :: BSIZEz = 16  !must equal NZ_param
       end module




       ! start the module containing the matrix multiply kernel

       module modu
         use cudafor
         use blocksize
         contains

         ! _kernel 

         attributes(global) subroutine kernel( udev,ubardev,rk_const_1n
     :                                         ,rk_const_2n,nx,ny,nz,n4
     :                                         ,tema4,temb4,temg4 )
           real*4,device :: udev(nx,ny,nz,n4)
           real*4,constant :: ubardev(NX_param,NY_param,NZ_param)
           integer, value :: nx,ny,nz,n4,idx, i,j,k,Ty,Tx,Tz
           real*4, value :: rk_const_1n,rk_const_2n,tema4,temb4,temg4


           real*4, shared :: sh_u(5,BSIZEx,BSIZEy,2)
           real*4, shared :: sh_ubar(5,BSIZEx,BSIZEy)


           k = threadIdx%z
           j = (blockIdx%y-1)*blockdim%y+threadIdx%y
           i = (blockIdx%x-1)*blockdim%x+threadIdx%x


           if (k <= nz-2 .and. j <ny>= 2
     :                   .and. i >= 3 .and. i <= nx-2) then

          
              Ty = threadIdx%y
              Tz = threadIdx%z


c              sh_u(1:5,Ty,Tz,1) = udev((i-2):(i+2),j,k,1)
c              sh_ubar(1:5,Ty,Tz) = ubardev((i-2):(i+2),j,k)
c
c              sh_u(3,Ty,Tz,2) = udev(i,j,k,2)

              sh_u(1,Ty,Tz,1) = udev(i-2,j,k,1)
              sh_ubar(1,Ty,Tz) = ubardev(i-2,j,k)

              sh_u(2,Ty,Tz,1) = udev(i-1,j,k,1)
              sh_ubar(2,Ty,Tz) = ubardev(i-1,j,k)

              sh_u(3,Ty,Tz,1) = udev(i,j,k,1)
              sh_u(3,Ty,Tz,2) = udev(i,j,k,2)
              sh_ubar(3,Ty,Tz) = ubardev(i,j,k)

              sh_u(4,Ty,Tz,1) = udev(i+1,j,k,1)
              sh_ubar(4,Ty,Tz) = ubardev(i+1,j,k)

              sh_u(5,Ty,Tz,1) = udev(i+2,j,k,1)
              sh_ubar(5,Ty,Tz) = ubardev(i+2,j,k)




                sh_u(3,Ty,Tz,2)=-sh_u(3,Ty,Tz,2)*rk_const_1n

     :         +tema4*( (sh_u(3,Ty,Tz,1)+sh_u(5,Ty,Tz,1))
     :                 *(sh_u(5,Ty,Tz,1)-sh_u(3,Ty,Tz,1))
     :                 +(sh_u(3,Ty,Tz,1)+sh_u(1,Ty,Tz,1))
     :                 *(sh_u(3,Ty,Tz,1)-sh_u(1,Ty,Tz,1)) )


     :         -temb4*( (sh_u(4,Ty,Tz,1)+sh_u(3,Ty,Tz,1))
     :                 *(sh_u(4,Ty,Tz,1)-sh_u(3,Ty,Tz,1))
     :                 +(sh_u(3,Ty,Tz,1)+sh_u(2,Ty,Tz,1))
     :                 *(sh_u(3,Ty,Tz,1)-sh_u(2,Ty,Tz,1)) )


     :    -temg4*((((( sh_u(5,Ty,Tz,1)-sh_ubar(5,Ty,Tz))-
     :                (sh_u(4,Ty,Tz,1)-sh_ubar(4,Ty,Tz)))-
     :               ((sh_u(4,Ty,Tz,1)-sh_ubar(4,Ty,Tz))-
     :                (sh_u(3,Ty,Tz,1)-sh_ubar(3,Ty,Tz))))-
     :              (((sh_u(4,Ty,Tz,1)-sh_ubar(4,Ty,Tz))-
     :                (sh_u(3,Ty,Tz,1)-sh_ubar(3,Ty,Tz)))-
     :               ((sh_u(3,Ty,Tz,1)-sh_ubar(3,Ty,Tz))-
     :                (sh_u(2,Ty,Tz,1)-sh_ubar(2,Ty,Tz)))))-

     :           (((( sh_u(4,Ty,Tz,1)-sh_ubar(4,Ty,Tz))-
     :               (sh_u(3,Ty,Tz,1)-sh_ubar(3,Ty,Tz)))-
     :              ((sh_u(3,Ty,Tz,1)-sh_ubar(3,Ty,Tz))-
     :               (sh_u(2,Ty,Tz,1)-sh_ubar(2,Ty,Tz))))-
     :             (((sh_u(3,Ty,Tz,1)-sh_ubar(3,Ty,Tz))-
     :               (sh_u(2,Ty,Tz,1)-sh_ubar(2,Ty,Tz)))-
     :              ((sh_u(2,Ty,Tz,1)-sh_ubar(2,Ty,Tz))-
     :               (sh_u(1,Ty,Tz,1)-sh_ubar(1,Ty,Tz) )))))




                udev(i,j,k,2) = sh_u(3,Ty,Tz,2)


           endif

         end subroutine kernel



         ! The host routine that calls the kernel

         subroutine first_routine(u,ubar,rk_const_1n,rk_const_2n
     :                           ,nx,ny,nz,tema4,temb4,temg4)

               real, dimension (:,:,:,:) :: u
               real, dimension (:,:,:) :: ubar
               real rk_const_1n,rk_const_2n, tema4,temb4,temg4
               integer :: nx,ny,nz,n4

               ! allocatable device arrays
               real,device,allocatable,dimension(:,:,:,:) :: udev
               real,device,allocatable,dimension(:,:,:) :: ubardev

               type(dim3) :: dimGrid, dimBlock
               integer :: r 

               real ctimeall,ctimekernel,flops,mflopskernel,mflopsall
               integer c1, c2, c3, c4

               ! Begin execution, first determine the sizes of the input arrays
               nx = size( u, 1 )
               ny = size( u, 2 )
               nz = size( u, 3 )
               n4 = size( u, 4 )

               ! start data xfer-inclusive timer and allocate the
               ! device arrays
               call system_clock( count=c1 )
               allocate( udev(nx,ny,nz,2), ubardev(nx,ny,nz))

               ! Copy arrays to the device
               udev = u(1:nx,1:ny,1:nz,1:2)
               ubardev = ubar(1:nx,1:ny,1:nz)

               ! grid and block dimensions
               dimGrid = dim3(nx/BSIZEx,ny/BSIZEy,1)  
               dimBlock = dim3(BSIZEx,BSIZEy,BSIZEz)

               ! Start the data xfer-exclusive timer, launch the GPU kernel,
               ! wait for completion
               call system_clock( count=c2 )
               call kernel<<<dimGrid>>>(udev,ubardev
     :                                       ,rk_const_1n,rk_const_2n
     :                                       ,nx,ny,nz,n4
     :                                       ,tema4,temb4,temg4)
               r = cudathreadsynchronize()


               ! Stop data xfer-exclusive timer, copy the results back,
               ! stop data xfer-inclusive timer
               call system_clock( count=c3 )
               u(1:nx,1:ny,1:nz,1:2) = udev
               call system_clock( count=c4 )

               ! Calculate inclusive/exclusive execution times, and report MFLOPS
               flops = (nz-3.0)*(ny-1.0)*(nx-4.0)*52.0
!     :                +(nz-3.0)*(ny-1.0)*(nx-4)*2.0
               ctimekernel = c3 - c2
               mflopskernel = flops / ctimekernel
               ctimeall = c4 - c1
               mflopsall = flops / ctimeall

               ! Print out results
               print *, 'Kernel time excluding data xfer:', ctimekernel,
     :                  ' microseconds'
               print *, 'Megaflops excluding data xfer: ', mflopskernel
               print *, 'Total time including data xfer: ', ctimeall,
     :                  ' microseconds'
               print *, 'Megaflops including data xfer: ', mflopsall

               ! Deallocate device arrays and exit
               deallocate( udev, ubardev )

           end subroutine first_routine
       end module modu








c     ##################################################################
c     ##################################################################
c     ######                                                      ######
c     ######                 PROGRAM LOOPTEST                     ######
c     ######                                                      ######
c     ##################################################################
c     ##################################################################
c

      Program LOOPTEST
      use modu

c######################################################################
c
c     PURPOSE:
c
c     Test loop computations for optimization 
c
c######################################################################
c
c     AUTHOR: DW
c     2/23/2010
c
c######################################################################

c
      implicit none        ! Force explicit declarations
c      include 'time.inc'

      integer nx        ! Number of grid points in the x-direction.
      integer ny        ! Number of grid points in the y-direction.
      integer nz        ! Number of grid points in the z-direction.

      real, allocatable:: u(:,:,:,:) ! total u-velocity (m/s)
      real, allocatable:: w(:,:,:,:) ! total u-velocity (m/s)
      real, allocatable:: ubar(:,:,:) ! total u-velocity (m/s)
      real, allocatable:: testu(:,:,:,:) ! testing to make sure kernel works
c######################################################################
c
c     Miscellaneous local variables:
c     scalars, constants etc.
c
c######################################################################
      integer i,j,k,ubgn,uend,jbgn,jend,vbgn,vend,kbgn,kend,ibgn,iend
      integer nxa,nya,nza,m,n,wbgn,wend,rkorder
      real rk_constant1(3),rk_constant2(3),dx,dy,dz,dxinv,dyinv,dzinv
      real kx4,ky4,kv4,tema4,temb4,temg4
      real tema,temb,temc,temd,teme,temf,temg,dt,temrk

c     timing variables
      double precision :: ttime,incrementtime,addtime,temh,cudatime

c     start of papi variable definition
      REAL :: finaltime,amax,amin,bmax,bmin,umax,umin,wmax,wmin
c     end of papi variable definition

C     TESTING variables
      integer correct,incorrect,tm
      real diff, denom, error

c######################################################################
c
c     begin executable code:
c
c######################################################################


c     read input file parameters/variables
c      open (5,file='loop.input',form='formatted')
c      read (5,*) nx
c      read (5,*) ny
c      read (5,*) nz

      nx = NX_param;
      ny = NY_param;
      nz = NZ_param;


c     allocate variable memory
      allocate (u(nx,ny,nz,3))
      allocate (w(nx,ny,nz,3))
      allocate (ubar(nx,ny,nz))
      allocate (testu(nx,ny,nz,3))

      print *,'grid size=',nx,ny,nz
      print *,'total data size=',nx*ny*nz*7*4                       
c     set more constants associated with the time differencing scheme
c     TODO don't need all of these...clean up the code!
      rkorder = 3
      rk_constant1(1) = 0.0
      rk_constant1(2) = 5./9. 
      rk_constant1(3) = 153./128.
      rk_constant2(1) = 1.0/3.0
      rk_constant2(2) = 15./16.
      rk_constant2(3) = 8./15.
      dt = 1.0
      temrk = dt
      dx = 1000.0
      dy = 1000.0
      dz = 1000.0
      dxinv = 1.0/dx
      dyinv = 1.0/dy
      dzinv = 1.0/dz

      tema4= temrk*dxinv/24.0
      temb4= temrk*dxinv/3.0
      temg4 = temrk*0.06


      tema = 0.25*dxinv*temrk
      temb = 0.25*dyinv*temrk

c     initialize the variables
      DO k= 1,nz
        DO j= 1,ny
          DO i= 1,nx
            temc = 3.0*(sin(2.0*3.141592*dx*(i-1)/(nx*dx))
     :                 +sin(2.0*3.141592*dy*(j-1)/(ny*dy))
     :                 +sin(2.0*3.141592*dz*(k-1)/(nz*dz)))
            u(i,j,k,2) = temc
            w(i,j,k,2) = -temc
            u(i,j,k,1) = u(i,j,k,2)
            w(i,j,k,1) = w(i,j,k,2)
            u(i,j,k,3) = u(i,j,k,2)
            w(i,j,k,3) = w(i,j,k,2)
            ubar(i,j,k) = 2.0
            testu(i,j,k,2) = temc
            testu(i,j,k,1) = u(i,j,k,2)
            testu(i,j,k,3) = u(i,j,k,2)
          END DO
        END DO
      END DO

c     check initial values
      amax = -99999.0
      amin =  99999.0
      bmax = -99999.0
      bmin =  99999.0
      do n = 1,3
      do k=1,nz
        do j=1,ny
          do i=1,nx
            amax = max(u(i,j,k,n),amax)
            amin = min(u(i,j,k,n),amin)
            bmax = max(w(i,j,k,n),bmax)
            bmin = min(w(i,j,k,n),bmin)
          end do
        end do
      end do
c      print *,'umax= ',amax,'umin= ',amin,'wmax= ',bmax
      end do

      m = cudaSetDevice(0)

c     start the main loop  5 time steps
      do m=1,5
c     rk 3 loop 
      do n=1,3



      call first_routine(u,ubar,rk_constant1(n),rk_constant2(n)
     :                   ,nx,ny,nz
     :                   ,tema4,temb4,temg4)



      DO k=2,nz-2  ! scalar limits  u(2) is the q's/forcing.
        DO j=1,ny-1 ! scalar limits u(1) is the updated/previous u
          DO i=3,nx-2

            testu(i,j,k,2)=-testu(i,j,k,2)*rk_constant1(n)

     :  +tema4*((testu(i,j,k,1)+testu(i+2,j,k,1))
     :        *(testu(i+2,j,k,1)-testu(i,j,k,1))
     :        +(testu(i,j,k,1)+testu(i-2,j,k,1))
     :        *(testu(i,j,k,1)-testu(i-2,j,k,1)))
     :  -temb4*((testu(i+1,j,k,1)+testu(i,j,k,1))
     :        *(testu(i+1,j,k,1)-testu(i,j,k,1))
     :       + (testu(i,j,k,1)+testu(i-1,j,k,1))
     :        *(testu(i,j,k,1)-testu(i-1,j,k,1)))
     :  -temg4*(((((testu(i+2,j,k,1)-ubar(i+2,j,k))-
     :             (testu(i+1,j,k,1)-ubar(i+1,j,k)))-
     :            ((testu(i+1,j,k,1)-ubar(i+1,j,k))-
     :             (testu(i,j,k,1)-ubar(i,j,k))))-
     :           (((testu(i+1,j,k,1)-ubar(i+1,j,k))-
     :             (testu(i,j,k,1)-ubar(i,j,k)))-
     :            ((testu(i,j,k,1)-ubar(i,j,k))-
     :             (testu(i-1,j,k,1)-ubar(i-1,j,k)))))-
     :          ((((testu(i+1,j,k,1)-ubar(i+1,j,k))-
     :             (testu(i,j,k,1)-ubar(i,j,k)))-
     :            ((testu(i,j,k,1)-ubar(i,j,k))-
     :             (testu(i-1,j,k,1)-ubar(i-1,j,k))))-
     :           (((testu(i,j,k,1)-ubar(i,j,k))-
     :             (testu(i-1,j,k,1)-ubar(i-1,j,k)))-
     :            ((testu(i-1,j,k,1)-ubar(i-1,j,k))-
     :             (testu(i-2,j,k,1)-ubar(i-2,j,k))))))
          END DO  !  52 calculations...
        END DO
      END DO
!      endif





!     TEST ############################################################

      correct = 0
      incorrect = 0
      DO i=1,nx
        DO j=1,ny
          DO k=1,nz
            DO tm=1,3!1,3

               if (k<=nz-2 .and. j<ny>=2
     :                     .and. i>=3 .and. i<nx> 1.0e-5) then
c                print *,'ERR:',u(i,j,k,tm),testu(i,j,k,tm)
c     :                 ,i,j,k,tm
                incorrect = incorrect + 1
              else
C                print *,u(i,j,k,tm),testu(i,j,k,tm)
                correct = correct + 1
              endif

           endif

            END DO
          END DO
        END DO
      END DO
      print *,'KERNEL TEST: Correct:',correct,'Incorrect:',incorrect


!     END TEST  ######################################################



      print *,'-------------------------------------------------'


      DO k=2,nz-2   ! complete the u computations
        DO j=1,ny-1
          DO i=3,nx-2
            testu(i,j,k,1)= testu(i,j,k,1)
     :                     +testu(i,j,k,2)*rk_constant2(n)
          END DO
        END DO
      END DO


      print *,'-------------------------------------------------'


      DO k=2,nz-2   ! complete the u computations
        DO j=1,ny-1
          DO i=3,nx-2
            u(i,j,k,1)= u(i,j,k,1)
     :                     +u(i,j,k,2)*rk_constant2(n)
          END DO
        END DO
      END DO







      print *,'-------------------------------------------------'


      DO k=3,nz-2   ! limits 3,nz-2
        DO j=1,ny-1  ! scalar limits 2,ny-2
          DO i=1,nx-1 ! scalar limits 2,nx-2
            w(i,j,k,2)=-w(i,j,k,2)*rk_constant1(n)
c  vert adv fourth order
     : +tema4*((w(i,j,k,1)+w(i,j,k+2,1))*(w(i,j,k+2,1)-w(i,j,k,1))
     :       +(w(i,j,k-2,1)+w(i,j,k,1))*(w(i,j,k,1)-w(i,j,k-2,1)))
     : -temb4*((w(i,j,k-1,1)+w(i,j,k,1))*(w(i,j,k,1)-w(i,j,k-1,1))
     :       +(w(i,j,k+1,1)+w(i,j,k,1))*(w(i,j,k+1,1)-w(i,j,k,1)))
c compute the fourth order cmix z terms.
     :  -temg4*((((w(i,j,k+2,1)-w(i,j,k+1,1))-
     :            (w(i,j,k+1,1)-w(i,j,k,1)))-
     :           ((w(i,j,k+1,1)-w(i,j,k,1))-
     :            (w(i,j,k,1)-w(i,j,k-1,1))))-
     :          (((w(i,j,k+1,1)-w(i,j,k,1))-
     :            (w(i,j,k,1)-w(i,j,k-1,1)))-
     :           ((w(i,j,k,1)-w(i,j,k-1,1))-
     :            (w(i,j,k-1,1)-w(i,j,k-2,1)))))
          END DO   ! 36 calculations...
        END DO
      END DO


      print *,'-------------------------------------------------'

c     check for bogus solution
      amax = -99999.0
      amin =  99999.0
      bmax = -99999.0
      bmin =  99999.0
      do k=1,nz
        do j=1,ny
          do i=1,nx
            amax = max(u(i,j,k,1),amax)
            amin = min(u(i,j,k,1),amin)
            bmax = max(w(i,j,k,2),bmax)
            bmin = min(w(i,j,k,2),bmin)
          end do
        end do
      end do

c      print *,'umax= ',amax,'umin= ',amin,'wmax= ',bmax,
c     :        'wmin= ',bmin

      end do  !  end of the rk3 loop
      end do  !  end of the main time loop

      stop
      end
c     program end

Hello Mat,

I work with Jim. In my expereince, more computations in which you reuse data has been the key to achieving good optimization on standard CPU’s and vector machines. On GPU’s it seems that this approach may not be optimal. Is there any reference material that you can point us to in regards to optimizing thread computations on a Tesla MP (MP = a single multiprocessor on a Tesla card which has 8 processing elements)?

For reference: You stated: “This should help decrease the number of operations per kernel and increase your parallelization.”

Why? It seems that your logic leans toward a system that is better at scheduling work than actually performing calculations…hmmm

Take the loop code that Jim has posted, it has on the order of 52 calculations for each iteration of the loop, but only has 10 or so different instances of the arrays used in that loop. It has a computation to memory reference ratio of about 5:1. Take the matrix matrix multiply that he worked on last year, is has two calculations for every two memory references. Note that the result is added to a scalar and restored into that scalar. That code runs about 250GFLOPS vs the loop code at 48GFLOPS.

Can you help us understand why this is the case?

Thanks!

Dan

matrix matrix multiply:
! Multiply the two submatrices; ! Each of the 16x16 threads accumulates the
! dot product for its element of C(i,j)

do k = 1,16
Cij = Cij + Asub(tx,k) * Bsub(k,ty)
enddo

! Synchronize to make sure all threads are done reading the submatrices before
! overwriting them in the next iteration of hte kb loop

call syncthreads()



Loop code: described in Jim’s code above.

I’ve updated the code again. The biggest difference is that now, each thread only brings two elements into shared memory. The new speed is about 100 GFLOP/s, which is definitely faster, but I think the questions about number of calculations/thread still stand. As long as you’re maxing out the number of threads per microprocessor, why is it better to have more threads with fewer calculations each?

I’ll post the new code on Monday.

Okay, the new top speed is about 145 GFLOP/s, and the code is below. We used Cuda-Z to test our card, and got it to run at 600 GFLOP/s. What can I do to get this code closer to that 600 GFLOP/s? Also, the speed including the memory transfers to and from the device is much slower - around 1 GFLOP/s - what can be done about that? Questions about coalescing and shared memory bank conflicts comming soon…

module blocksize
         implicit none
         save
         integer, parameter :: NX_param = 1984 !must be BSIZEx more than a multiple of (BSIZEx-4)
         integer, parameter :: NY_param = 913  !must be one more than a multiple of BSIZEy
         integer, parameter :: NZ_param = 7    !must be 3 more than BSIZEz
         integer, parameter :: BSIZEx = 64 !NX_param = integer*(BSIZEx-4) + BSIZEx
         integer, parameter :: BSIZEy = 1  !must divide evenly into (NY_param-1)
         integer, parameter :: BSIZEz = 4  !must equal (NZ_param-3)

       end module



       ! start the module containing the matrix multiply kernel

       module modu
         use cudafor
         use blocksize
         contains

         ! _kernel 

         attributes(global) subroutine kernel( udev,ubardev,rk_const_1n
     :                                         ,rk_const_2n
     :                                         ,tema4,temb4,temg4 )
           real*4,device :: udev(NX_param,NY_param,NZ_param,2)
           real*4,constant :: ubardev(NX_param,NY_param,NZ_param)
           integer, value :: i,j,k,Tx,Ty,Tz
           real*4, value :: rk_const_1n,rk_const_2n, result
           real*4, value :: tema4,temb4,temg4

           real*4, shared :: sh_u(BSIZEx,BSIZEy,BSIZEz)
           real*4, shared :: sh_ubar(BSIZEx,BSIZEy,BSIZEz)



           Tx = threadIdx%x
           Ty = threadIdx%y
           Tz = threadIdx%z


           i = (blockIdx%x-1)*(blockdim%x-4)+Tx
           j = (blockIdx%y-1)*blockdim%y+Ty
           k = Tz+1


           !  Each thread pulls two elements into shared memory
           sh_u(Tx,Ty,Tz) = udev(i,j,k,1)
           sh_ubar(Tx,Ty,Tz) = ubardev(i,j,k)

           call syncthreads()

           if (Tx > 2 .and. Tx <BSIZEx>= 3 .and. i <= NX_param-2) then


                result = udev(i,j,k,2)


                result=-result*rk_const_1n

     :         +tema4*( (sh_u(Tx,Ty,Tz)+sh_u(Tx+2,Ty,Tz))
     :                 *(sh_u(Tx+2,Ty,Tz)-sh_u(Tx,Ty,Tz))
     :                 +(sh_u(Tx,Ty,Tz)+sh_u(Tx-2,Ty,Tz))
     :                 *(sh_u(Tx,Ty,Tz)-sh_u(Tx-2,Ty,Tz)) )


     :         -temb4*( (sh_u(Tx+1,Ty,Tz)+sh_u(Tx,Ty,Tz))
     :                 *(sh_u(Tx+1,Ty,Tz)-sh_u(Tx,Ty,Tz))
     :                 +(sh_u(Tx,Ty,Tz)+sh_u(Tx-1,Ty,Tz))
     :                 *(sh_u(Tx,Ty,Tz)-sh_u(Tx-1,Ty,Tz)) )


     :    -temg4*((((( sh_u(Tx+2,Ty,Tz)-sh_ubar(Tx+2,Ty,Tz))-
     :                (sh_u(Tx+1,Ty,Tz)-sh_ubar(Tx+1,Ty,Tz)))-
     :               ((sh_u(Tx+1,Ty,Tz)-sh_ubar(Tx+1,Ty,Tz))-
     :                (sh_u(Tx,Ty,Tz)-sh_ubar(Tx,Ty,Tz))))-
     :              (((sh_u(Tx+1,Ty,Tz)-sh_ubar(Tx+1,Ty,Tz))-
     :                (sh_u(Tx,Ty,Tz)-sh_ubar(Tx,Ty,Tz)))-
     :               ((sh_u(Tx,Ty,Tz)-sh_ubar(Tx,Ty,Tz))-
     :                (sh_u(Tx-1,Ty,Tz)-sh_ubar(Tx-1,Ty,Tz)))))-

     :           (((( sh_u(Tx+1,Ty,Tz)-sh_ubar(Tx+1,Ty,Tz))-
     :               (sh_u(Tx,Ty,Tz)-sh_ubar(Tx,Ty,Tz)))-
     :              ((sh_u(Tx,Ty,Tz)-sh_ubar(Tx,Ty,Tz))-
     :               (sh_u(Tx-1,Ty,Tz)-sh_ubar(Tx-1,Ty,Tz))))-
     :             (((sh_u(Tx,Ty,Tz)-sh_ubar(Tx,Ty,Tz))-
     :               (sh_u(Tx-1,Ty,Tz)-sh_ubar(Tx-1,Ty,Tz)))-
     :              ((sh_u(Tx-1,Ty,Tz)-sh_ubar(Tx-1,Ty,Tz))-
     :               (sh_u(Tx-2,Ty,Tz)-sh_ubar(Tx-2,Ty,Tz) )))))



                 ! now we're done with element i-2, copy it back to udev
                  udev(i,j,k,2) = result

           endif

         end subroutine kernel



         ! The host routine that calls the kernel

         subroutine first_routine(u,ubar,rk_const_1n,rk_const_2n
     :                           ,nx,ny,nz,tema4,temb4,temg4)

               real, dimension (:,:,:,:) :: u
               real, dimension (:,:,:) :: ubar
               real*4 rk_const_1n,rk_const_2n, tema4,temb4,temg4
c               integer :: nx,ny,nz,n4
               integer :: nx,ny,nz

               ! allocatable device arrays
               real*4,device,allocatable,dimension(:,:,:,:) :: udev
               real*4,device,allocatable,dimension(:,:,:) :: ubardev

               type(dim3) :: dimGrid, dimBlock
               integer :: r 

               real ctimeall,ctimekernel,flops,mflopskernel,mflopsall
               integer c1, c2, c3, c4

               ! Begin execution, first determine the sizes of the input arrays
               nx = size( u, 1 )
               ny = size( u, 2 )
               nz = size( u, 3 )
c               n4 = size( u, 4 )

               ! start data xfer-inclusive timer and allocate the
               ! device arrays
               call system_clock( count=c1 )
               allocate( udev(nx,ny,nz,2), ubardev(nx,ny,nz))

               ! Copy arrays to the device
               udev = u(1:nx,1:ny,1:nz,1:2)
               ubardev = ubar(1:nx,1:ny,1:nz)

               ! grid and block dimensions
c               dimGrid = dim3((nx-BSIZEx)/(Bsizex-4)+1,ny/BSIZEy,1) !makes threads for all j/k
               dimGrid = dim3((nx-BSIZEx)/(Bsizex-4)+1,(ny-1)/BSIZEy,1) !only makes threads for the js and ks that we use
               dimBlock = dim3(BSIZEx,BSIZEy,BSIZEz)

               ! Start the data xfer-exclusive timer, launch the GPU kernel,
               ! wait for completion
               call system_clock( count=c2 )
               call kernel<<<dimGrid,dimBlock>>>(udev,ubardev
     :                                       ,rk_const_1n,rk_const_2n
     :                                       ,tema4,temb4,temg4)
               r = cudathreadsynchronize()


               ! Stop data xfer-exclusive timer, copy the results back,
               ! stop data xfer-inclusive timer
               call system_clock( count=c3 )
               u(1:nx,1:ny,1:nz,1:2) = udev
               call system_clock( count=c4 )

               ! Calculate inclusive/exclusive execution times, and report MFLOPS
               flops = (nz-3.0)*(ny-1.0)*(nx-4.0)*52.0
!     :                +(nz-3.0)*(ny-1.0)*(nx-4)*2.0
               ctimekernel = c3 - c2
               mflopskernel = flops / ctimekernel
               ctimeall = c4 - c1
               mflopsall = flops / ctimeall

               ! Print out results
               print *, 'Kernel time excluding data xfer:', ctimekernel,
     :                  ' microseconds'
               print *, 'Megaflops excluding data xfer: ', mflopskernel
               print *, 'Total time including data xfer: ', ctimeall,
     :                  ' microseconds'
               print *, 'Megaflops including data xfer: ', mflopsall

               ! Deallocate device arrays and exit
               deallocate( udev, ubardev )

           end subroutine first_routine
       end module modu








c     ##################################################################
c     ##################################################################
c     ######                                                      ######
c     ######                 PROGRAM LOOPTEST                     ######
c     ######                                                      ######
c     ##################################################################
c     ##################################################################
c

      Program LOOPTEST
      use modu

c######################################################################
c
c     PURPOSE:
c
c     Test loop computations for optimization 
c
c######################################################################
c
c     AUTHOR: DW
c     2/23/2010
c
c######################################################################

c
      implicit none        ! Force explicit declarations
c      include 'time.inc'

      integer nx        ! Number of grid points in the x-direction.
      integer ny        ! Number of grid points in the y-direction.
      integer nz        ! Number of grid points in the z-direction.

      real, allocatable:: u(:,:,:,:) ! total u-velocity (m/s)
      real, allocatable:: w(:,:,:,:) ! total u-velocity (m/s)
      real, allocatable:: ubar(:,:,:) ! total u-velocity (m/s)
      real, allocatable:: testu(:,:,:,:) ! testing to make sure kernel works
c######################################################################
c
c     Miscellaneous local variables:
c     scalars, constants etc.
c
c######################################################################
      integer i,j,k,ubgn,uend,jbgn,jend,vbgn,vend,kbgn,kend,ibgn,iend
      integer nxa,nya,nza,m,n,wbgn,wend,rkorder

      real rk_constant1(3),rk_constant2(3),dx,dy,dz,dxinv,dyinv,dzinv 

      real tema,temb,temc,temd,teme,temf,temg,dt,temrk   

      real kx4,ky4,kv4,tema4,temb4,temg4       





c     timing variables
      double precision :: ttime,incrementtime,addtime,temh,cudatime

c     start of papi variable definition
      REAL :: finaltime,amax,amin,bmax,bmin,umax,umin,wmax,wmin
c     end of papi variable definition

C     TESTING variables
      integer correct,incorrect,tm
      real diff, denom, error

c######################################################################
c
c     begin executable code:
c
c######################################################################


c     read input file parameters/variables
c      open (5,file='loop.input',form='formatted')
c      read (5,*) nx
c      read (5,*) ny
c      read (5,*) nz

      nx = NX_param;
      ny = NY_param;
      nz = NZ_param;


c     allocate variable memory
      allocate (u(nx,ny,nz,3))
      allocate (w(nx,ny,nz,3))
      allocate (ubar(nx,ny,nz))
      allocate (testu(nx,ny,nz,3))

      print *,'grid size=',nx,ny,nz
      print *,'total data size=',nx*ny*nz*7*4                       
c     set more constants associated with the time differencing scheme
c     TODO don't need all of these...clean up the code!
      rkorder = 3
      rk_constant1(1) = 0.0
      rk_constant1(2) = 5./9. 
      rk_constant1(3) = 153./128.
      rk_constant2(1) = 1.0/3.0
      rk_constant2(2) = 15./16.
      rk_constant2(3) = 8./15.
      dt = 1.0
      temrk = dt
      dx = 1000.0
      dy = 1000.0
      dz = 1000.0
      dxinv = 1.0/dx
      dyinv = 1.0/dy
      dzinv = 1.0/dz

      tema4= temrk*dxinv/24.0
      temb4= temrk*dxinv/3.0
      temg4 = temrk*0.06


      tema = 0.25*dxinv*temrk
      temb = 0.25*dyinv*temrk

c     initialize the variables
      DO k= 1,nz
        DO j= 1,ny
          DO i= 1,nx
            temc = 3.0*(sin(2.0*3.141592*dx*(i-1)/(nx*dx))
     :                 +sin(2.0*3.141592*dy*(j-1)/(ny*dy))
     :                 +sin(2.0*3.141592*dz*(k-1)/(nz*dz)))
            u(i,j,k,2) = temc
            w(i,j,k,2) = -temc
            u(i,j,k,1) = u(i,j,k,2)
            w(i,j,k,1) = w(i,j,k,2)
            u(i,j,k,3) = u(i,j,k,2)
            w(i,j,k,3) = w(i,j,k,2)
            ubar(i,j,k) = 2.0
            testu(i,j,k,2) = temc
            testu(i,j,k,1) = u(i,j,k,2)
            testu(i,j,k,3) = u(i,j,k,2)
          END DO
        END DO
      END DO

c     check initial values
      amax = -99999.0
      amin =  99999.0
      bmax = -99999.0
      bmin =  99999.0
      do n = 1,3
      do k=1,nz
        do j=1,ny
          do i=1,nx
            amax = max(u(i,j,k,n),amax)
            amin = min(u(i,j,k,n),amin)
            bmax = max(w(i,j,k,n),bmax)
            bmin = min(w(i,j,k,n),bmin)
          end do
        end do
      end do
c      print *,'umax= ',amax,'umin= ',amin,'wmax= ',bmax
      end do

      m = cudaSetDevice(0)

c     start the main loop  5 time steps
      do m=1,5
c     rk 3 loop 
      do n=1,3



      call first_routine(u,ubar,rk_constant1(n),rk_constant2(n)
     :                   ,nx,ny,nz
     :                   ,tema4,temb4,temg4)



      DO k=2,nz-2  ! scalar limits  u(2) is the q's/forcing.
        DO j=1,ny-1 ! scalar limits u(1) is the updated/previous u
          DO i=3,nx-2

            testu(i,j,k,2)=-testu(i,j,k,2)*rk_constant1(n)

     :  +tema4*((testu(i,j,k,1)+testu(i+2,j,k,1))
     :        *(testu(i+2,j,k,1)-testu(i,j,k,1))
     :        +(testu(i,j,k,1)+testu(i-2,j,k,1))
     :        *(testu(i,j,k,1)-testu(i-2,j,k,1)))
     :  -temb4*((testu(i+1,j,k,1)+testu(i,j,k,1))
     :        *(testu(i+1,j,k,1)-testu(i,j,k,1))
     :       + (testu(i,j,k,1)+testu(i-1,j,k,1))
     :        *(testu(i,j,k,1)-testu(i-1,j,k,1)))
     :  -temg4*(((((testu(i+2,j,k,1)-ubar(i+2,j,k))-
     :             (testu(i+1,j,k,1)-ubar(i+1,j,k)))-
     :            ((testu(i+1,j,k,1)-ubar(i+1,j,k))-
     :             (testu(i,j,k,1)-ubar(i,j,k))))-
     :           (((testu(i+1,j,k,1)-ubar(i+1,j,k))-
     :             (testu(i,j,k,1)-ubar(i,j,k)))-
     :            ((testu(i,j,k,1)-ubar(i,j,k))-
     :             (testu(i-1,j,k,1)-ubar(i-1,j,k)))))-
     :          ((((testu(i+1,j,k,1)-ubar(i+1,j,k))-
     :             (testu(i,j,k,1)-ubar(i,j,k)))-
     :            ((testu(i,j,k,1)-ubar(i,j,k))-
     :             (testu(i-1,j,k,1)-ubar(i-1,j,k))))-
     :           (((testu(i,j,k,1)-ubar(i,j,k))-
     :             (testu(i-1,j,k,1)-ubar(i-1,j,k)))-
     :            ((testu(i-1,j,k,1)-ubar(i-1,j,k))-
     :             (testu(i-2,j,k,1)-ubar(i-2,j,k))))))
          END DO  !  52 calculations...
        END DO
      END DO
!      endif





!     TEST ############################################################

      correct = 0
      incorrect = 0
      DO i=1,nx
        DO j=1,ny
          DO k=1,nz
            DO tm=1,3!1,3

               if (k<=nz-2 .and. j<=ny-1 .and. k>=2
     :                     .and. i>=3 .and. i<=nx-2) then

              diff = abs(u(i,j,k,tm)-testu(i,j,k,tm))
              denom = testu(i,j,k,tm)

              if (denom == 0.0) denom = 0.000000001
              error = diff/denom
              if (diff > 1.0e-5) then
c                print *,'ERR:',u(i,j,k,tm),testu(i,j,k,tm)
c     :                 ,i,j,k,tm
                incorrect = incorrect + 1
              else
C                print *,u(i,j,k,tm),testu(i,j,k,tm)
                correct = correct + 1
              endif

           endif

            END DO
          END DO
        END DO
      END DO
      print *,'KERNEL TEST: Correct:',correct,'Incorrect:',incorrect


!     END TEST  ######################################################



      print *,'-------------------------------------------------'


      DO k=2,nz-2   ! complete the u computations
        DO j=1,ny-1
          DO i=3,nx-2
            testu(i,j,k,1)= testu(i,j,k,1)
     :                     +testu(i,j,k,2)*rk_constant2(n)
          END DO
        END DO
      END DO


      print *,'-------------------------------------------------'


      DO k=2,nz-2   ! complete the u computations
        DO j=1,ny-1
          DO i=3,nx-2
            u(i,j,k,1)= u(i,j,k,1)
     :                     +u(i,j,k,2)*rk_constant2(n)
          END DO
        END DO
      END DO







      print *,'-------------------------------------------------'


      DO k=3,nz-2   ! limits 3,nz-2
        DO j=1,ny-1  ! scalar limits 2,ny-2
          DO i=1,nx-1 ! scalar limits 2,nx-2
            w(i,j,k,2)=-w(i,j,k,2)*rk_constant1(n)
c  vert adv fourth order
     : +tema4*((w(i,j,k,1)+w(i,j,k+2,1))*(w(i,j,k+2,1)-w(i,j,k,1))
     :       +(w(i,j,k-2,1)+w(i,j,k,1))*(w(i,j,k,1)-w(i,j,k-2,1)))
     : -temb4*((w(i,j,k-1,1)+w(i,j,k,1))*(w(i,j,k,1)-w(i,j,k-1,1))
     :       +(w(i,j,k+1,1)+w(i,j,k,1))*(w(i,j,k+1,1)-w(i,j,k,1)))
c compute the fourth order cmix z terms.
     :  -temg4*((((w(i,j,k+2,1)-w(i,j,k+1,1))-
     :            (w(i,j,k+1,1)-w(i,j,k,1)))-
     :           ((w(i,j,k+1,1)-w(i,j,k,1))-
     :            (w(i,j,k,1)-w(i,j,k-1,1))))-
     :          (((w(i,j,k+1,1)-w(i,j,k,1))-
     :            (w(i,j,k,1)-w(i,j,k-1,1)))-
     :           ((w(i,j,k,1)-w(i,j,k-1,1))-
     :            (w(i,j,k-1,1)-w(i,j,k-2,1)))))
          END DO   ! 36 calculations...
        END DO
      END DO


      print *,'-------------------------------------------------'

c     check for bogus solution
      amax = -99999.0
      amin =  99999.0
      bmax = -99999.0
      bmin =  99999.0
      do k=1,nz
        do j=1,ny
          do i=1,nx
            amax = max(u(i,j,k,1),amax)
            amin = min(u(i,j,k,1),amin)
            bmax = max(w(i,j,k,2),bmax)
            bmin = min(w(i,j,k,2),bmin)
          end do
        end do
      end do

c      print *,'umax= ',amax,'umin= ',amin,'wmax= ',bmax,
c     :        'wmin= ',bmin

      end do  !  end of the rk3 loop
      end do  !  end of the main time loop

      stop
      end
c     program end