Hi Balkrishna,
Below is some code that uses OpenMP+CUDA Fortran. I had originally written it for my article on Multi-GPU programming (Account Login | PGI) but decided to just focus on MPI+CUDA Fortran.
The biggest things to watch out for are your device allocatables since they need to be managed by separate threads. Be sure to put to put them in a “private” clause and don’t allocate them till after the GPU context has been created (i.e. after cudaSetDevice has been called). Also, unless you’re using a Kepler, each OpenMP thread should have their own GPU.
Let me know if you have questions,
Mat
% cat life.F90
#ifdef _CUDA
#define DEVICE device,
#else
#define DEVICE
#endif
module life_mod
integer :: NXSIZE, NYSIZE, SEED
parameter(NXSIZE=128, NYSIZE=128, SEED=123)
real :: initPercent=0.4
contains
#ifdef _CUDA
attributes(global) subroutine life_kernel(dOld, dNew, Nx, Ny)
implicit none
integer, dimension(Nx,Ny), device :: dOld, dNew
integer, value :: Nx,Ny
integer :: i, j, ix, iy, neighbors, state
ix = (blockIdx%x-1)*blockDim%x + threadIdx%x
iy = (blockIdx%y-1)*blockDim%y + threadIdx%y
if (ix .gt. 1 .and. iy.gt.1 .and. &
ix .lt. Nx .and. iy .lt. Ny) then
neighbors = dOld(ix,iy-1) + &
dOld(ix,iy+1) + &
dOld(ix+1,iy-1) + &
dOld(ix+1,iy+1) + &
dOld(ix-1,iy-1) + &
dOld(ix-1,iy+1) + &
dOld(ix-1,iy) + &
dOld(ix+1,iy)
state = dOld(ix,iy)
if (state .eq. 0 .and. neighbors .eq. 3) then
dNew(ix,iy) = 1 ! birth
else if (state.eq.1.and.neighbors.ne.2.and.neighbors.ne.3) then
dNew(ix,iy) = 0 ! death
else
dNew(ix,iy) = state ! no change
end if
end if
end subroutine life_kernel
#else
subroutine life(dOld, dNew, Nx, Ny)
implicit none
integer, dimension(Nx,Ny) :: dOld, dNew
integer, value :: Nx,Ny,tid
integer :: i, j,ix,iy, neighbors, state
do ix=2,Nx-1
do iy=2,Ny-1
neighbors = dOld(ix,iy-1) + &
dOld(ix,iy+1) + &
dOld(ix+1,iy-1) + &
dOld(ix+1,iy+1) + &
dOld(ix-1,iy-1) + &
dOld(ix-1,iy+1) + &
dOld(ix-1,iy) + &
dOld(ix+1,iy)
state = dOld(ix,iy)
if (state .eq. 0 .and. neighbors .eq. 3) then
dNew(ix,iy) = 1 ! birth
else if (state.gt.0.and.neighbors.ne.2.and.neighbors.ne.3) then
dNew(ix,iy) = 0 ! death 0
else
dNew(ix,iy) = state ! no change
end if
end do
end do
end subroutine life
#endif
subroutine lifeMain ()
#ifdef _CUDA
use cudafor
#endif
#ifdef _OMP
use omp_lib
#endif
implicit none
integer :: Nx, Ny, Nt, temp, count, steps, i, j, angle, nthds, tnum
integer :: numdev, devnum, istat, myseed, blocksize
integer, allocatable, DEVICE dimension(:,:) :: dOld, dNew
integer, allocatable, dimension(:,:) :: A, B
integer :: start, end
real, allocatable, dimension(:,:) :: rand
character(len=80) :: rfile
integer :: alive
#ifdef _CUDA
type(dim3) :: dimGrid, dimBlock
#else
integer,dimension(3) :: dimGrid,dimBlock
#endif
! account for the halo
Nx = NXSIZE+2
Ny = NYSIZE+2
myseed = SEED
allocate(A(Nx,Ny),B(Nx,Ny),rand(Nx,Ny))
call random_seed(myseed)
call random_number(rand)
A=0
do i=2,Nx-1
do j=2,Ny-1
if (rand(i,j) < initPercent) then
A(i,j) = 1
else
A(i,j) = 0
endif
enddo
enddo
B=A
! Set the inital conditions
count = 0
steps = 0
temp = 0
alive = 1
#ifdef _CUDA
istat = cudaGetDeviceCount(numdev)
#endif
!$omp parallel &
!$omp shared(A,B,count,Nx,Ny,numdev,alive,steps), &
!$omp private(tnum,dOld,dNew,dimGrid,dimBlock,devnum,istat,Nt, &
!$omp i,j,blocksize,start,end)
#ifdef _OMP
nthds = omp_get_num_threads()
tnum = omp_get_thread_num()
#else
nthds = 1
tnum=0
#endif
!$omp master
!$omp end master
!$omp barrier
blocksize = NXSIZE/nthds
Nt = blocksize+2
start = (tnum*blocksize)+1
end = start + blocksize + 1
#ifdef _CUDA
devnum = mod(tnum,numdev)
istat = cudaSetDevice(devnum)
#endif
allocate(dNew(Nt,Ny), dOld(Nt,Ny))
dOld=0
dNew=0
#ifdef _CUDA
dimBlock = dim3(16,16,1)
dimGrid = dim3((Nt+15)/16,(ny+15)/16,1)
#endif
do, while (steps.lt.16384.and.count.lt.10.and.alive.gt.0)
dOld(1:Nt,1:Ny) = A(start:end,1:Ny)
dNew=dOld
#ifdef _CUDA
call life_kernel<<<dimGrid,dimBlock>>>(dOld,dNew,Nt,Ny)
#else
call life(dOld,dNew,Nt,Ny)
#endif
B(start+1:end-1,1:Ny) = dNew(2:Nt-1,1:Ny)
!$omp barrier
!$omp master
A=B
alive = sum(A)
! if (mod(steps,100).eq.0) then
print *, 'Step:', steps, ' Alive:', alive
! endif
if (temp.lt.(alive+2).and.temp.gt.(alive-2)) then
count = count + 1
else
count = 0
endif
temp = alive
steps = steps + 1
!$omp end master
!$omp barrier
end do
deallocate(dOld)
deallocate(dNew)
!$omp end parallel
alive = sum(A)
print *, 'Step:', steps, ' Alive:',alive
print *, 'Init%:', initPercent, ' Actual%:', real(alive)/real(NXSIZE*NYSIZE)
close(21)
deallocate(A,B)
stop
end subroutine lifeMain
end module life_mod
program life
use life_mod
implicit none
call lifeMain()
end program life
% pgfortran -Mpreprocess -mp -fast -Minfo=accel -Mcuda life.F90 -o life.out