Helllo

I am dealing only since a short time with parallel programming. That’s why I might need some help.

I have a small program that is written for OpenMP parallelization:

```
program OMP
implicit none
real :: rinv, u, v, w
integer :: numThreads, in
integer, parameter :: knend = 2000000
integer :: kn, i, j
real, dimension(5,knend) :: F, G, H
real, dimension(6,knend) :: Q
integer :: count1, count_rate1, count_max1
integer :: count2, count_rate2, count_max2
integer :: comp
do i=1,6
do j=1,knend
Q(i,j) = sin(1.0*i)+cos(1.0*j)
end do
end do
call system_clock(count1, count_rate1, count_max1)
c$OMP PARALLEL
c$ numThreads = 4
c$ in = (knend/numThreads) + 1
c$OMP DO SCHEDULE(STATIC,in) PRIVATE(rinv, u, v, w)
DO KN = 1, KNEND
rinv = 1.0 / Q(1,KN)
u = Q(2,KN) * rinv
v = Q(3,KN) * rinv
w = Q(4,KN) * rinv
F(1,KN) = Q(2,KN)
F(2,KN) = Q(2,KN) * u + Q(6,KN)
F(3,KN) = Q(2,KN) * v
F(4,KN) = Q(2,KN) * w
F(5,KN) = (Q(5,KN) + Q(6,KN)) * u
G(1,KN) = Q(3,KN)
G(2,KN) = Q(3,KN) * u
G(3,KN) = Q(3,KN) * v + Q(6,KN)
G(4,KN) = Q(3,KN) * w
G(5,KN) = (Q(5,KN) + Q(6,KN)) * v
H(1,KN) = Q(4,KN)
H(2,KN) = Q(4,KN) * u
H(3,KN) = Q(4,KN) * v
H(4,KN) = Q(4,KN) * w + Q(6,KN)
H(5,KN) = (Q(5,KN) + Q(6,KN)) * w
ENDDO
c$OMP END DO
c$OMP END PARALLEL
call system_clock(count2, count_rate2, count_max2)
comp = count2 - count1
print *
print *, '***************************************'
print *, 'Last value of Matrix F', F(5,knend)
print *, 'Last value of Matrix G', G(5,knend)
print *, 'Last value of Matrix H', H(5,knend)
print *, 'Last value of Matrix Q', Q(6,knend)
print *
print *, comp, ' microseconds on ', numThreads, ' CPUs (OMP)'
print *, '***************************************'
print *
end program OMP
```

Now I want to parallelize this code witht the PGI accelerator. What I have done is the following:

```
program GPU
use accel_lib
implicit none
real :: rinv, u, v, w
integer, parameter :: knend = 2000000
integer :: kn, i, j
real, dimension(5,knend) :: F, G, H
real, dimension(6,knend) :: Q
integer :: count1, count_rate1, count_max1
integer :: count2, count_rate2, count_max2
integer :: cgpu
do i=1,6
do j=1,knend
Q(i,j) = sin(1.0*i)+cos(1.0*j)
end do
end do
call system_clock(count1, count_rate1, count_max1)
!$acc region
do kn = 1, knend
rinv = 1.0 / Q(1,KN)
u = Q(2,KN) * rinv
v = Q(3,KN) * rinv
w = Q(4,KN) * rinv
F(1,KN) = Q(2,KN)
F(2,KN) = Q(2,KN) * u + Q(6,KN)
F(3,KN) = Q(2,KN) * v
F(4,KN) = Q(2,KN) * w
F(5,KN) = (Q(5,KN) + Q(6,KN)) * u
G(1,KN) = Q(3,KN)
G(2,KN) = Q(3,KN) * u
G(3,KN) = Q(3,KN) * v + Q(6,KN)
G(4,KN) = Q(3,KN) * w
G(5,KN) = (Q(5,KN) + Q(6,KN)) * v
H(1,KN) = Q(4,KN)
H(2,KN) = Q(4,KN) * u
H(3,KN) = Q(4,KN) * v
H(4,KN) = Q(4,KN) * w + Q(6,KN)
H(5,KN) = (Q(5,KN) + Q(6,KN)) * w
end do
!$acc end region
call system_clock(count2, count_rate2, count_max2)
cgpu = count2 - count1
print *
print *, '***************************************'
print *, 'Last value of Matrix F', F(5,knend)
print *, 'Last value of Matrix G', G(5,knend)
print *, 'Last value of Matrix H', H(5,knend)
print *, 'Last value of Matrix Q', Q(6,knend)
print *
print *, cgpu, ' microseconds on GPU'
print *, '***************************************'
print *
end program GPU
```

And finally the compiler message tells me that the loop is parallelizable. I read that private scalars don’t have to be treated specially, is this correct:

```
23, Generating copyin(q(1:6,1:2000000))
Generating copyout(f(1:5,1:2000000))
Generating copyout(g(1:5,1:2000000))
Generating copyout(h(1:5,1:2000000))
Generating compute capability 1.0 binary
Generating compute capability 1.3 binary
25, Loop is parallelizable
Accelerator kernel generated
25, !$acc do parallel, vector(32)
Non-stride-1 accesses for array 'f'
Non-stride-1 accesses for array 'q'
Non-stride-1 accesses for array 'g'
Non-stride-1 accesses for array 'h'
CC 1.0 : 14 registers; 20 shared, 88 constant, 0 local memory bytes; 33 occupancy
CC 1.3 : 15 registers; 20 shared, 88 constant, 0 local memory bytes; 25 occupancy
```

Regarding the tutorial (Part2) on the Portland Group webpage the statement: Non-stride-1 accesses fo array…" refers to that i might want to reorganize the program. Now my question is if anybody could show me how the existing OMP program has to be rewritten so it takes full advantage of the GPU (best possible acceleration). I think I could really learn a lot from this example since I have to port many other subroutines etc. from OpenMP to PGI accelerator.

Thank you very much!!!