Hi Mat!
One more question. Here is the full testcase.
! Multi-GPU test case: one OpenMP thread is started per NVIDIA device, each
! thread selects its own device and creates device copies of its Z-slab of
! PSI1/PSI2 with an unstructured OpenACC "enter data" region.
PROGRAM VORTEX
use openacc
use omp_lib
IMPLICIT NONE
INTEGER :: NX,NY,NZ
PARAMETER(NX=500,NY=500,NZ=500)
REAL*8 :: EPSPH,TH,TWOPI
REAL*8, DIMENSION(:,:,:,:), ALLOCATABLE:: psi1,psi2
INTEGER :: ix1,iy1,iz1
integer(kind=4) :: ierr, ngpus,zs,ze,iam
! Stop on allocation failure: continuing would touch unallocated arrays.
allocate(PSI1(NX,NY,NZ,1:2),stat=ierr)
if(ierr /= 0) then
write(*,*)"allocation error PSI"
stop
endif
allocate(PSI2(NX,NY,NZ,1:2),stat=ierr)
if(ierr /= 0) then
write(*,*)"allocation error PSI2"
stop
endif
! Evaluate in double precision so the REAL*8 target is not fed a
! single-precision value.
TWOPI = 2.0d0*ACOS(-1.0d0)
ngpus = acc_get_num_devices(acc_device_nvidia)
print *, "ngpus", ngpus
! ngpus is used both as the thread count and as a divisor below.
if(ngpus < 1) then
print *, "no NVIDIA devices found"
stop
endif
call omp_set_num_threads(ngpus)
!$OMP PARALLEL shared (ngpus) private(iam,zs,ze)
iam = omp_get_thread_num()
! Slab bounds along Z: chunk size is NZ/ngpus rounded up.
zs = (iam+0)*((NZ+ngpus-1)/ngpus)
ze = (iam+1)*((NZ+ngpus-1)/ngpus)
! Clamp the last slab to the Z extent (the original clamped to NX, which
! only gave the right answer because NX == NZ).
if(ze.gt.nz) ze = nz
if(zs.eq.0) zs = 1
! Widen the slab by one plane towards each neighbouring slab.
if(zs.ne.1) zs = zs-1
if(ze.ne.nz) ze = ze+1
print *, iam, zs, ze
! iam = iam + 1
! Device numbers are passed 0-based here (thread number == device number).
call acc_set_device_num(iam,acc_device_nvidia)
!$acc enter data create( psi1(:,:,zs:ze,:), psi2(:,:,zs:ze,:))
!$OMP END PARALLEL
! Early exit: everything below is unreachable while this statement is in
! place. STOP is used because RETURN is not allowed in a main program.
stop
!$acc data create(PSI1,PSI2)
EPSPH = 0.02d0
TH = TWOPI
!$acc kernels
!!$OMP PARALLEL DO
DO IZ1=1,NZ
DO IY1=1,NY
DO IX1=1,NX
PSI1(IX1,IY1,IZ1,2) = EPSPH*COS(TH)
PSI2(IX1,IY1,IZ1,2) = EPSPH*SIN(TH)
PSI1(IX1,IY1,IZ1,1) = EPSPH*COS(TH)
PSI2(IX1,IY1,IZ1,1) = EPSPH*SIN(TH)
ENDDO
ENDDO
ENDDO
!$acc end kernels
!$acc end data
!!$OMP END PARALLEL DO
deallocate(PSI1,PSI2)
STOP
END
compilation
pgfortran -m64 -ta=nvidia,cc3.5,nodebug,cuda7.0,pin -mcmodel=medium test1.f90 -i8 -Mlarge_arrays -O3 -mp -acc -o test1
launching:
./test1
ngpus 3
0 1 168
1 166 335
2 333 500
FATAL ERROR: variable in data clause was already present on device 3: name=psi1
file:/home-2/..../test1.f90 vortex line:36
psi1 lives at 0x2b58019a5020 size 2000000000 present
Present table dump for device[3]: NVIDIA Tesla GPU 3, compute capability 3.5
host:0x2b58019a5020 device:0x230d9e0000 size:2000000000 presentcount:1 line:36 name:psi1
host:0x2b58019a5020 device:0x2384d40000 size:2000000000 presentcount:1 line:36 name:psi1
call to cuMemAlloc returned error 4: Deinitialized
Failing in Thread:2
next run:
$ PGI_ACC_DEBUG=1 ./test1 2>&1 | grep devid
pgi_uacc_set_device_num(devnum=0,devtype=4,threadid=1) cuda devid=1 dindex=1
pgi_uacc_dataenterstart( file=/home-2/..../test1.f90, function=vortex, line=2:2, line=36, devid=0 )
pgi_uacc_set_device_num(devnum=2,devtype=4,threadid=3) cuda devid=3 dindex=3
pgi_uacc_set_device_num(devnum=1,devtype=4,threadid=2) cuda devid=2 dindex=2
pgi_uacc_dataenterstart( file=/home-2/..../test1.f90, function=vortex, line=2:2, line=36, devid=0 )
pgi_uacc_dataenterstart( file=/home-2/..../test1.f90, function=vortex, line=2:2, line=36, devid=0 )
pgi_uacc_alloc(size=2000000000,devid=3,threadid=2)
pgi_uacc_alloc(size=2000000000,devid=3,threadid=1)
pgi_uacc_alloc(size=2000000000,devid=3,threadid=2) returns 0x230d9e0000
pgi_uacc_alloc(size=2000000000,devid=3,threadid=1) returns 0x2384d40000
pgi_uacc_alloc(size=2000000000,devid=3,threadid=2)
I’m confused by the fact that a different device is set active in each thread, yet memory allocation is performed on device 3 only. Could you explain this, please? Is it a PGI issue or mine?
Alexey