Hi,
Trying to run OpenACC code compiled with PGI 12.6, I am getting wrong results. After investigation it seems related to the private directive. I have been able to reproduce the problem in the following test code:
program main
implicit none
integer*4, parameter :: ireals=8
integer*4 :: N,nlev,ip,k,kspec
real*8, allocatable :: sohr_ref(:,:),thhr_ref(:,:)
real*8, allocatable :: sohr(:,:),thhr(:,:),zfls(:,:),zflt(:,:),zsmu0(:),dp0(:,:)
real*8 :: cp_d,zfac,g,zepemu
N=1E3
nlev=4
allocate(sohr_ref(N,nlev),thhr_ref(N,nlev))
allocate(sohr(N,nlev),thhr(N,nlev),zfls(N,nlev+1),zflt(N,nlev+1),zsmu0(N),dp0(N,nlev))
!----------------------------------
!init
cp_d = 1005.0_ireals
g = 9.80665_ireals
zepemu = 1.0E-9_ireals !
DO k = 1, nlev
DO ip = 1,N
zfls(ip,k)=8.0_ireals*cos(6.28_ireals*REAL(ip+k,ireals)/REAL(N+nlev,ireals))
zflt(ip,k)=6.0_ireals*cos(6.28_ireals*REAL(ip+k,ireals)/REAL(N+nlev,ireals))
dp0(ip,k) = 1015.0_ireals*cos(6.28_ireals*REAL(ip+k,ireals)/REAL(N+nlev,ireals))
END DO
END DO
DO ip = 1,N
zfls(ip,nlev+1)=8.0_ireals*cos(6.28_ireals*REAL(ip+nlev+1,ireals)/REAL(N+nlev+1,ireals))
zflt(ip,nlev+1)=6.0_ireals*cos(6.28_ireals*REAL(ip+nlev+1,ireals)/REAL(N+nlev+1,ireals))
zsmu0(ip)=cos(6.28_ireals*REAL(ip,ireals)/REAL(N,ireals))
END DO
!----------------------------------
!1: compute on cpu
DO k = 1, nlev
DO ip = 1,N
zfac = g/(cp_d*dp0 (ip,k))
sohr(ip,k) = 0.0
IF (zsmu0(ip) > zepemu) THEN
sohr_ref(ip,k) = zfac * (zfls(ip,k)-zfls(ip,k+1))
ENDIF
thhr_ref(ip,k) = zfac * (zflt(ip,k)-zflt(ip,k+1))
ENDDO
END DO
!----------------------------------
!2: compute on gpu without private
!$acc data create(sohr,thhr,dp0,zfls,zflt,zsmu0)
!$acc update device(dp0,zfls,zflt,zsmu0)
!$acc parallel
DO k = 1, nlev
!$acc loop gang vector
DO ip = 1,N
zfac = g/(cp_d*dp0 (ip,k))
sohr(ip,k) = 0.0
IF (zsmu0(ip) > zepemu) THEN
sohr(ip,k) = zfac * (zfls(ip,k)-zfls(ip,k+1))
ENDIF
thhr(ip,k) = zfac * (zflt(ip,k)-zflt(ip,k+1))
ENDDO
END DO
!$acc end parallel
!$acc update host(sohr,thhr)
!$acc end data
print*, 'Max Diff without private CPU/GPU: sohr', maxval(abs(sohr_ref-sohr))
print*, 'Max Diff without private CPU/GPU: thhr', maxval(abs(thhr_ref-thhr))
!----------------------------------
!3: compute on gpu using private
!$acc data create(sohr,thhr,dp0,zfls,zflt,zsmu0)
!$acc update device(dp0,zfls,zflt,zsmu0)
!$acc parallel private(ip,k,zfac)
DO k = 1, nlev
!$acc loop gang vector
DO ip = 1,N
zfac = g/(cp_d*dp0 (ip,k))
sohr(ip,k) = 0.0
IF (zsmu0(ip) > zepemu) THEN
sohr(ip,k) = zfac * (zfls(ip,k)-zfls(ip,k+1))
ENDIF
thhr(ip,k) = zfac * (zflt(ip,k)-zflt(ip,k+1))
ENDDO
END DO
!$acc end parallel
!$acc update host(sohr,thhr)
!$acc end data
print*, 'Max Diff with private CPU/GPU: sohr', maxval(abs(sohr_ref-sohr))
print*, 'Max Diff with private CPU/GPU: thhr', maxval(abs(thhr_ref-thhr))
end program main
Compiling and running shows different results when using the private directive:
> pgf90 -acc -o test_private_12_6 test_private_12_6.f90
> ./test_private_12_6
Max Diff without private CPU/GPU: sohr 0.000000000000000
Max Diff without private CPU/GPU: thhr 0.000000000000000
Max Diff with private CPU/GPU: sohr 6.1529741284087325E-004
Max Diff with private CPU/GPU: thhr 4.6147305963065491E-004
Xavier