Hi Mat,
You can try the following simple code:
program main
use cudafor
use omp_lib
Implicit None
Integer*4 :: myid,istat,nGPU
Real*8,Device,Target,Allocatable :: ADev(:)
istat = cudaGetDeviceCount(nGPU)
write(6,'(a,i3,a)') 'You have ',nGPU,' devices'
call omp_set_num_threads(nGPU)
c$omp parallel private(myid)
myid = omp_get_thread_num()
istat = cudaSetDevice(myid)
istat = cudaDeviceReset()
Allocate(ADev(1024))
Write(6,'(a7,i3,a11,z20)') 'Device ',myid,
& ', address: ',loc(ADev)
c$omp end parallel
Write(6,'(a)') 'After allocation'
c$omp parallel private(myid)
myid = omp_get_thread_num()
istat = cudaSetDevice(myid)
Write(6,'(a7,i3,a11,z20)') 'Device ',myid,
& ', address: ',loc(ADev)
c$omp end parallel
end program main
compile options: pgfortran test.cuf -o test -Mcuda=ptxinfo,cuda7.5,cc35 -Mfixed -mcmodel=medium -O2 -mp
output:
You have 2 devices
Device 1, address: 1303EE0000
Device 0, address: 1307EC0000
After allocation
Device 0, address: 1307EC0000
Device 1, address: 1307EC0000
It’s strange. After the allocation, the address is the same. How can I fix this?