Hello.
I have a memory access problem when running a multi-GPU calculation with CUDA and MPI.
At first, I briefly write my code.
program main
  use cudafor
  use cuda_kernel
  use mpi
  implicit none

  ! Kernel launch configuration.
  ! BUG FIX: was declared "block" but every use below says "blocks",
  ! which is an undeclared name under implicit none.
  type(dim3) :: blocks, threads

  ! Global grid size and per-rank local extents.
  integer :: nx, ny, nz
  integer :: ilen, jlen, klen, iprocs, jprocs, kprocs

  ! BUG FIX: these were used but never declared (illegal with implicit none).
  integer :: state, timestep
  integer, parameter :: timestep_max = 1000   ! TODO: set the real iteration count

  ! Managed (unified) memory: one allocation visible to host and device.
  real(8), managed, allocatable, dimension(:,:,:) :: a, b, a_new, b_new, c

  ! MPI bookkeeping.
  integer :: myrank, nprocs, tag, ierr, localRank
  integer :: kup, kdown                           ! neighbour ranks along k
  integer :: sendijp, sendijn, recvijn, recvijp   ! nonblocking request handles
  integer :: istatus(MPI_STATUS_SIZE)
  character(len=10) :: localRankStr

  ! Bind each MPI rank to one GPU before any CUDA activity, using the
  ! local-rank variable that Open MPI exports into the environment.
  call get_environment_variable('OMPI_COMM_WORLD_LOCAL_RANK', localRankStr)
  read (localRankStr, '(i10)') localRank
  state = cudaSetDevice(localRank)

  call MPI_Init(ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, myrank, ierr)
  call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)

  nx = 1024
  ny = 1024
  nz = 1024
  iprocs = 1     ! 1-D domain decomposition: split along k only
  jprocs = 1
  kprocs = 4
  ilen = nx / iprocs
  jlen = ny / jprocs
  klen = nz / kprocs

  blocks  = dim3(ilen/16, jlen/16, 1)
  threads = dim3(16, 16, 1)

  ! Local sub-domains with a one-cell halo in every direction.
  allocate(a    (0:ilen+1, 0:jlen+1, 0:klen+1))
  allocate(b    (0:ilen+1, 0:jlen+1, 0:klen+1))
  allocate(c    (0:ilen+1, 0:jlen+1, 0:klen+1))
  allocate(a_new(0:ilen+1, 0:jlen+1, 0:klen+1))
  allocate(b_new(0:ilen+1, 0:jlen+1, 0:klen+1))

  ! TODO: set neighbour ranks kup/kdown (elided in the original post).
  ! Initial condition of a and b (elided in the original post):
  ! a(i,j,k) = ...
  ! b(i,j,k) = ...

  ! BUG FIX: cudaThreadSynchronize is deprecated; use cudaDeviceSynchronize.
  state = cudaDeviceSynchronize()

  ! Time-iterative computation.
  do timestep = 1, timestep_max

     ! ---------------- halo exchange of a (likewise for b) ----------------
     ! NOTE(review): these MPI calls pass MANAGED buffers. Unless the MPI
     ! library is a CUDA-aware build, handing managed memory to MPI is a
     ! known source of illegal-memory-access failures -- either stage the
     ! halo planes through plain host buffers or use a CUDA-aware Open MPI.
     ! TODO: confirm how the MPI library was built.
     call MPI_Barrier(MPI_COMM_WORLD, ierr)
     call MPI_Isend(a(0,0,klen),   (ilen+2)*(jlen+2), MPI_REAL8, kup,   1, &
                    MPI_COMM_WORLD, sendijp, ierr)
     call MPI_Isend(a(0,0,1),      (ilen+2)*(jlen+2), MPI_REAL8, kdown, 1, &
                    MPI_COMM_WORLD, sendijn, ierr)
     call MPI_Irecv(a(0,0,0),      (ilen+2)*(jlen+2), MPI_REAL8, kdown, 1, &
                    MPI_COMM_WORLD, recvijn, ierr)
     call MPI_Irecv(a(0,0,klen+1), (ilen+2)*(jlen+2), MPI_REAL8, kup,   1, &
                    MPI_COMM_WORLD, recvijp, ierr)
     call MPI_Wait(sendijp, istatus, ierr)
     call MPI_Wait(sendijn, istatus, ierr)
     call MPI_Wait(recvijn, istatus, ierr)
     call MPI_Wait(recvijp, istatus, ierr)
     ! ... same exchange for array b (elided in the original post).

     ! Kernel computes a_new and c from a and b (arguments elided in post).
     call calanew<<<blocks,threads>>>( some parameters )
     state = cudaDeviceSynchronize()

     ! ---------------- halo exchange of c ----------------
     call MPI_Barrier(MPI_COMM_WORLD, ierr)
     call MPI_Isend(c(0,0,klen),   (ilen+2)*(jlen+2), MPI_REAL8, kup,   1, &
                    MPI_COMM_WORLD, sendijp, ierr)
     call MPI_Isend(c(0,0,1),      (ilen+2)*(jlen+2), MPI_REAL8, kdown, 1, &
                    MPI_COMM_WORLD, sendijn, ierr)
     call MPI_Irecv(c(0,0,0),      (ilen+2)*(jlen+2), MPI_REAL8, kdown, 1, &
                    MPI_COMM_WORLD, recvijn, ierr)
     call MPI_Irecv(c(0,0,klen+1), (ilen+2)*(jlen+2), MPI_REAL8, kup,   1, &
                    MPI_COMM_WORLD, recvijp, ierr)
     call MPI_Wait(sendijp, istatus, ierr)
     call MPI_Wait(sendijn, istatus, ierr)
     call MPI_Wait(recvijn, istatus, ierr)
     call MPI_Wait(recvijp, istatus, ierr)

     ! Kernel computes b_new from a, b and c (arguments elided in post).
     call calbnew<<<blocks,threads>>>( some parameters )
     state = cudaDeviceSynchronize()

     ! Whole-array update: with managed arrays this is a bulk device/host
     ! copy (~2.2 GB here), which is where the reported failure occurred.
     ! NOTE(review): an illegal access detected here usually originates in
     ! an earlier kernel or in MPI touching managed memory -- run with
     ! cuda-memcheck to find the first faulting operation. TODO confirm.
     a = a_new
     b = b_new
     state = cudaDeviceSynchronize()

  end do
  ! End of time-iterative computation.

  ! BUG FIX: deallocate takes only the array name -- no bounds spec.
  ! BUG FIX: do NOT also call cudaFree on allocatable arrays; deallocate
  ! already releases the managed allocation, so the original
  ! deallocate(...) + cudaFree(...) pair was a double free.
  deallocate(a, b, c, a_new, b_new)

  call MPI_Finalize(ierr)
end program main
When I run this code with
mpif90 -o3 -ta=tesla,cuda8.0 test.cuf
mpirun -np 4 ./3
the following error is returned:
0: copyover Memcpy (dst=0x0xa00200000, src=0x0xa80c00000, size=2155880448) FAILED: 77(an illegal memory access was encountered)
0: copyover Memcpy (dst=0x0xa00200000, src=0x0xa80c00000, size=2155880448) FAILED: 77(an illegal memory access was encountered)Primary job terminated normally, but 1 process returned
a non-zero exit code… Per user-direction, the job has been aborted.
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:Process name: [[30510,1],0]
Exit code: 127
And the error occurs in ‘update array part’ of the code.
Why does this kind of error occur?
The array size per GPU is 1024×1024×256, which doesn’t exceed the memory of the GPU (Tesla P100, 16GB).
Also, I tested a single-GPU version of this code with a 1024×1024×256 array, and it ran successfully.
(single version code is almost same except mpi part)
Something wrong in update part?
Thanks all.