[1]
Actually, for the code below I was not supposed to add acc derectives outside.And the informational message also told me that"PGF90-W-0155-Accelerator region ignored;" However, I did get a speed up without any error compared with nothing added.(from 6.84 [GFlops] to 8.89 [GFlops] ). How come?
!$acc region
do icnt=1, 999999
if( mod(icnt,100) == 0 ) print 1000, icnt, time1+dt
call cpu_diff3d(f,fn,nx,ny,nz,ce,cw,cn,cs,ct,cb,cc)
flops = flops + dble(nxnynz)13.0
f = fn
time1 = time1 + dt
if( time1+0.5dt >= 0.1 ) exit
end do
!$acc end region
[2]
Well, am I allowed to go a little further? For this calculation, I`ve wrote the code with Gpu kernel and just acc derective both. While the speed is quite different. As I said above, the acc version got a 8.89 [GFlops] right now with a probable mistake. On the other hand, the Gpu version got a 26.57 [GFlops] easily.(the copyin part costs 3.99s while the d_f=f and d_fn=f only costs Time 1.51E-002s) Is that regular? Or does it mean that I still have a lot of rising space for my acc code. Maybe I should paste my test code below even though I know somepart looks strange. Please forgive me since I am just for test.
program test
use openacc
implicit none
integer, parameter :: nx = 128
integer, parameter :: ny = nx
integer, parameter :: nz = nx
integer, parameter :: n = nxnynz
integer :: i, j, k, stat, icnt
real(8) :: clock_start, clock_finish, &
time1, Lx, Ly, Lz, &
dx, dy, dz, dt, &
kx, ky, kz, pi, &
x, y, z, &
kappa, flops, ferr, f0, &
ce, cw, cn, cs, ct, cb, cc, &
ax, ay, az
real(8), dimension(nx,ny,nz) :: f, fn
time1 = 0.0
ferr = 0.0
pi = 4.0atan(1.0)
Lx = 1.0
Ly = 1.0
Lz = 1.0
dx = Lx/dble(nx)
dy = Ly/dble(ny)
dz = Lz/dble(nz)
kx = 2.0pi
ky = kx
kz = kx
kappa = 0.1
flops = 0.0
dt = 0.1dxdx/kappa
ce = kappadt/(dxdx)
cw = kappadt/(dxdx)
cn = kappadt/(dydy)
cs = kappadt/(dydy)
ct = kappadt/(dzdz)
cb = kappadt/(dzdz)
cc = 1.0 - (ce + cw + cn + cs + ct + cb)
call cpu_time(clock_start)
!$acc data copyin(f,fn),create(x,y,z,ce,cw,cn,cs,ct,cb,cc)
!$acc region
do k=1, nz
do j=1, ny
do i=1, nx
x = dx*(dble(i)-0.5)
y = dy*(dble(j)-0.5)
z = dz*(dble(k)-0.5)
f(i,j,k) = 0.125*(1.0 - cos(kxx))(1.0 - cos(kyy))(1.0 - cos(kzz))
end do
end do
end do
!$acc end region
call cpu_time(clock_finish)
write(,*) “1”, (clock_finish - clock_start)
call cpu_time(clock_start)
!!$acc region
do icnt=1, 999999
if( mod(icnt,100) == 0 ) print 1000, icnt, time1+dt
call cpu_diff3d(f,fn,nx,ny,nz,ce,cw,cn,cs,ct,cb,cc)
flops = flops + dble(nxnynz)13.0
f = fn
time1 = time1 + dt
if( time1+0.5dt >= 0.1 ) exit
end do
!!$acc end region
call cpu_time(clock_finish)
print 1002, (clock_finish - clock_start)
print 1003, flops/(clock_finish - clock_start)*1.0e-9
ax = exp(-kappatime1(kxkx))
ay = exp(-kappatime1*(kyky))
az = exp(-kappatime1*(kzkz))
call cpu_time(clock_start)
!$acc region
do k=1, nz
do j=1, ny
do i=1, nx
x = dx(dble(i)-0.5)
y = dy*(dble(j)-0.5)
z = dz*(dble(k)-0.5)
f0 = 0.125*(1.0 - axcos(kxx)) &
(1.0 - aycos(kyy)) &
(1.0 - azcos(kzz))
ferr = ferr + (f(i,j,k) - f0)(f(i,j,k) - f0)
end do
end do
end do
!$acc end region
!$acc end data
call cpu_time(clock_finish)
write(,) “2”, (clock_finish - clock_start)
ferr = sqrt(ferr/dble(nxny*nz))
print 1004, nx, ny, nz, ferr
1000 format(" ", "time(“i4”)=“f7.5)
1001 format(” “, “Elapsed Time= “1pe9.3” [sec] by GPU”)
1002 format(” “, “Elapsed Time= “1pe9.3” [sec] by CPU”)
1003 format(” “, “Performance=“f6.2” [GFlops]”)
1004 format(” ", "Error[“i4”][“i4”][“i4”]= "1pe12.6)
end
subroutine cpu_diff3d(f,fn,nx,ny,nz,ce,cw,cn,cs,ct,cb,cc)
use openacc
implicit none
integer :: nx, ny, nz
integer :: i, j, k, ie, iw, jn, js, kt, kb
real(8) :: ce, cw, cn, cs, ct, cb, cc
real(8), dimension(nx,ny,nz) :: f, fn
!$acc region
do k=1, nz
do j=1, ny
do i=1, nx
ie = i + 1
iw = i - 1
jn = j + 1
js = j - 1
kt = k + 1
kb = k - 1
if( i == nx ) ie = i
if( i == 1 ) iw = i
if( j == ny ) jn = j
if( j == 1 ) js = j
if( k == nz ) kt = k
if( k == 1 ) kb = k
fn(i,j,k) = cc*f(i,j,k) &
- cef(ie,j,k) + cwf(iw,j,k) &
- cnf(i,jn,k) + csf(i,js,k) &
- ctf(i,j,kt) + cbf(i,j,kb)
end do
end do
end do
!$acc end region
return
end subroutine
[3]
For the data swap code from my Gpu kernel version, it is possible to use the pointor just like the C language? How would it be?
attributes(global) subroutine gpu_swap(d_f,d_fn,nx,ny,nz)
implicit none
integer, value :: nx, ny, nz
integer :: i, j, k
real(8), dimension(nx,ny,nz), device :: d_f, d_fn
i = threadidx%x
j = blockidx%x
k = blockidx%y
if( i < nx+1 .and. j < ny+1 .and. k < nz+1 ) then
d_f(i,j,k) = d_fn(i,j,k)
end if
return
end subroutine