Hi yangmh,
Yes, OpenMP can be used with CUDA Fortran and works quite well. Actually my next PGInsider article will be a tutorial on it.
First you must understand that a CPU process/thread can only be attached to a single GPU so in order for a program to use multiple GPUs it must be done from OpenMP, MPI, or other higher parallel layer. Because of this, you need to have the CPU parallel region encompass the CUDA code. Have each process/thread attach to a device and then divide the problem between the devices. In other words, if you have two threads, you’d have two dA1’s each holding half of A.
Here’s some sample code from the OpenMP CUDA Fortran program I wrote for my next article:
subroutine lifeMain ()
#ifdef _CUDA
use cudafor
#endif
use omp_lib
implicit none
integer :: Nx, Ny, Nt, temp, count, steps, i, j, angle, nthds, tnum
integer :: numdev, devnum, istat, myseed, blocksize
integer, allocatable, DEVICE dimension(:,:) :: dOld, dNew
integer, allocatable, dimension(:,:) :: A, B
integer :: start, end
real, allocatable, dimension(:,:) :: rand
character(len=80) :: rfile
integer :: alive
#ifdef _CUDA
type(dim3) :: dimGrid, dimBlock
#else
integer,dimension(3) :: dimGrid,dimBlock
#endif
! account for the halo
Nx = NXSIZE+2
Ny = NYSIZE+2
myseed = SEED
allocate(A(Nx,Ny),B(Nx,Ny),rand(Nx,Ny))
call random_seed(myseed)
call random_number(rand)
A=0
do i=2,Nx-1
do j=2,Ny-1
if (rand(i,j) < initPercent) then
A(i,j) = 1
else
A(i,j) = 0
endif
enddo
enddo
B=A
! Set the initial conditions
count = 0
steps = 0
temp = 0
alive = 1
#ifdef _CUDA
istat = cudaGetDeviceCount(numdev)
#endif
!$omp parallel &
!$omp shared(A,B,count,Nx,Ny,numdev,alive,steps), &
!$omp private(tnum,dOld,dNew,dimGrid,dimBlock,devnum,istat,Nt, &
!$omp i,j,blocksize,start,end)
#ifdef _OMP
nthds = omp_get_num_threads()
tnum = omp_get_thread_num()
#else
nthds = 1
tnum=0
#endif
!$omp master
!$omp end master
!$omp barrier
blocksize = NXSIZE/nthds
Nt = blocksize+2
start = (tnum*blocksize)+1
end = start + blocksize + 1
#ifdef _CUDA
devnum = mod(tnum,numdev)
istat = cudaSetDevice(devnum)
#endif
allocate(dNew(Nt,Ny), dOld(Nt,Ny))
dOld=0
dNew=0
#ifdef _CUDA
dimBlock = dim3(16,16,1)
dimGrid = dim3((Nt+15)/16,(ny+15)/16,1)
#endif
do, while (steps.lt.16384.and.count.lt.10.and.alive.gt.0)
dOld(1:Nt,1:Ny) = A(start:end,1:Ny)
dNew=dOld
#ifdef _CUDA
call life_kernel<<<dimGrid,dimBlock>>>(dOld,dNew,Nt,Ny)
#else
call life(dOld,dNew,Nt,Ny)
#endif
B(start+1:end-1,1:Ny) = dNew(2:Nt-1,1:Ny)
!$omp barrier
!$omp master
A=B
alive = sum(A)
! if (mod(steps,100).eq.0) then
print *, 'Step:', steps, ' Alive:', alive
! endif
if (temp.lt.(alive+2).and.temp.gt.(alive-2)) then
count = count + 1
else
count = 0
endif
temp = alive
steps = steps + 1
! pause
!$omp end master
!$omp barrier
end do
deallocate(dOld)
deallocate(dNew)
#ifdef _CUDA
istat=cudaThreadExit()
#endif
!$omp end parallel
alive = sum(A)
print *, 'Step:', steps, ' Alive:',alive
print *, 'Init%:', initPercent, ' Actual%:', real(alive)/real(NXSIZE*NYSIZE)
close(21)
deallocate(A,B)
stop
end subroutine lifeMain
Hope this helps,
Mat