Memory access error when using CUDA + MPI

Hello.

I have a memory access problem when running a multi-GPU calculation with CUDA and MPI.
First, here is a brief outline of my code.

program main
use cudafor
use cuda_kernel
use mpi

implicit none

type(dim3):: blocks, threads
integer:: nx, ny, nz
integer:: ilen, jlen, klen, iprocs, jprocs, kprocs
integer:: i, j, k, timestep, timestep_max, state, kup, kdown
real(8),managed,allocatable,dimension(:,:,:):: a, b, a_new, b_new, c

! MPI stuff
integer:: myrank, nprocs, tag, ierr, localRank
integer:: sendijp, sendijn, recvijp, recvijn
integer:: istatus(MPI_STATUS_SIZE)
character(len=10):: localRankStr

call get_environment_variable &
('OMPI_COMM_WORLD_LOCAL_RANK', localRankStr)
read(localRankStr, '(i10)') localRank
ierr = cudaSetDevice(localRank)

call MPI_init(ierr)
call MPI_comm_rank( MPI_COMM_WORLD , myrank , ierr )
call MPI_comm_size( MPI_COMM_WORLD , nprocs, ierr )

nx = 1024
ny = 1024
nz = 1024

iprocs = 1
jprocs = 1
kprocs = 4

ilen = nx / iprocs
jlen = ny / jprocs
klen = nz / kprocs

blocks = dim3(ilen/16,jlen/16,1)
threads = dim3(16,16,1)

allocate(a(0:ilen+1, 0:jlen+1, 0:klen+1))
allocate(b(0:ilen+1, 0:jlen+1, 0:klen+1))
allocate(c(0:ilen+1, 0:jlen+1, 0:klen+1))
allocate(a_new(0:ilen+1, 0:jlen+1, 0:klen+1))
allocate(b_new(0:ilen+1, 0:jlen+1, 0:klen+1))

! Initial condition of array a and b
a(i,j,k) = ---
b(i,j,k) = ---
state = cudaThreadSynchronize()

! time iterative computation
do timestep = 1, timestep_max

! - - - - - - - - - - - - - Transfer a and b - - - - - - - - - - - - -
call MPI_BARRIER(MPI_COMM_WORLD,ierr)

call mpi_isend(a(0,0,klen),(ilen+2)*(jlen+2),MPI_REAL8,kup,1,&
							MPI_COMM_WORLD,sendijp,ierr)
call mpi_isend(a(0,0,1),(ilen+2)*(jlen+2),MPI_REAL8,kdown,1,&
							MPI_COMM_WORLD,sendijn,ierr)
call mpi_irecv(a(0,0,0),(ilen+2)*(jlen+2),MPI_REAL8,kdown,1,&
							MPI_COMM_WORLD,recvijn,ierr)
call mpi_irecv(a(0,0,klen+1),(ilen+2)*(jlen+2),MPI_REAL8,kup,1,&
							MPI_COMM_WORLD,recvijp,ierr)
call MPI_WAIT(sendijp,istatus,ierr)
call MPI_WAIT(sendijn,istatus,ierr)
call MPI_WAIT(recvijn,istatus,ierr)
call MPI_WAIT(recvijp,istatus,ierr)

! kup and kdown indicate the rank in k-direction I skipped that part
! and same as array b
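! (for illustration only: for a non-periodic 1-D decomposition along k,
!  kup and kdown might be set as
!    kup   = myrank + 1
!    kdown = myrank - 1
!    if (myrank == nprocs-1) kup   = MPI_PROC_NULL
!    if (myrank == 0)        kdown = MPI_PROC_NULL )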

! this subroutine calculate a_new and c by using a and b
call calanew<<<blocks,threads>>>( some parameters )
state = cudaThreadSynchronize()


! - - - - - - - - - - - - - Transfer c - - - - - - - - - - - - -
call MPI_BARRIER(MPI_COMM_WORLD,ierr)

call mpi_isend(c(0,0,klen),(ilen+2)*(jlen+2),MPI_REAL8,kup,1,&
							MPI_COMM_WORLD,sendijp,ierr)
call mpi_isend(c(0,0,1),(ilen+2)*(jlen+2),MPI_REAL8,kdown,1,&
							MPI_COMM_WORLD,sendijn,ierr)
call mpi_irecv(c(0,0,0),(ilen+2)*(jlen+2),MPI_REAL8,kdown,1,&
							MPI_COMM_WORLD,recvijn,ierr)
call mpi_irecv(c(0,0,klen+1),(ilen+2)*(jlen+2),MPI_REAL8,kup,1,&
							MPI_COMM_WORLD,recvijp,ierr)
call MPI_WAIT(sendijp,istatus,ierr)
call MPI_WAIT(sendijn,istatus,ierr)
call MPI_WAIT(recvijn,istatus,ierr)
call MPI_WAIT(recvijp,istatus,ierr)

! this subroutine calculate b_new by using a, b and c
call calbnew<<<blocks,threads>>>( some parameters )
state = cudaThreadSynchronize()

! update array
a = a_new
b = b_new
state = cudaThreadSynchronize()

end do
! end of time iterative computation

deallocate(a)
deallocate(b)
deallocate(c)
deallocate(a_new)
deallocate(b_new)

state = cudaFree(a)
state = cudaFree(b)
state = cudaFree(c)
state = cudaFree(a_new)
state = cudaFree(b_new)

call MPI_Finalize(ierr)
stop
end program main

When I run this code with
mpif90 -o3 -ta=tesla,cuda8.0 test.cuf
mpirun -np 4 ./3

the following error is returned:

0: copyover Memcpy (dst=0x0xa00200000, src=0x0xa80c00000, size=2155880448) FAILED: 77(an illegal memory access was encountered)
0: copyover Memcpy (dst=0x0xa00200000, src=0x0xa80c00000, size=2155880448) FAILED: 77(an illegal memory access was encountered)

Primary job terminated normally, but 1 process returned
a non-zero exit code… Per user-direction, the job has been aborted.


mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:

Process name: [[30510,1],0]
Exit code: 127

The error occurs in the ‘update array’ part of the code.
Why does this kind of error occur?
The array size per GPU is 1024 x 1024 x 256, which doesn’t exceed the memory of the GPU (Tesla P100, 16 GB).
Also, I tested a single-GPU version of this code with a 1024 x 1024 x 256 array, and it ran successfully.
(The single-GPU version is almost the same code except for the MPI part.)
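Rough per-GPU memory footprint, counting the halo layers (real(8) = 8 bytes):

(1024+2) x (1024+2) x (256+2) x 8 bytes ≈ 2.17 GB per array
5 arrays x 2.17 GB ≈ 10.9 GB, well under the 16 GB of a P100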
Is something wrong in the update part?

Thanks all.

Hi Geunwoo,

Typically these types of errors occur in the kernel before the call to the memcpy. Can you try adding something like the following code after each of the kernel launches to see which one is erroring?

call calanew<<<blocks,threads>>>( some parameters ) 
istat = cudaGetLastError()
print *, cudaGetErrorString(istat)

Exactly why the kernels are erroring, I don’t know. But given it’s an illegal address error, I’d first check that you’re not accessing memory out-of-bounds.
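For example, here’s a minimal sketch of the kind of guard I mean (the kernel name, arguments, and body here are just placeholders, not your actual code), assuming one thread per (i,j) point and a loop over k inside the kernel:

module kernel_sketch
  use cudafor
contains
  attributes(global) subroutine calanew(a, a_new, ilen, jlen, klen)
    implicit none
    integer, value :: ilen, jlen, klen
    real(8) :: a(0:ilen+1, 0:jlen+1, 0:klen+1)
    real(8) :: a_new(0:ilen+1, 0:jlen+1, 0:klen+1)
    integer :: i, j, k

    i = (blockIdx%x - 1) * blockDim%x + threadIdx%x
    j = (blockIdx%y - 1) * blockDim%y + threadIdx%y

    ! guard so threads outside the interior never touch the arrays
    if (i >= 1 .and. i <= ilen .and. j >= 1 .and. j <= jlen) then
       do k = 1, klen
          a_new(i,j,k) = a(i,j,k)   ! stand-in for the real stencil update
       end do
    end if
  end subroutine calanew
end module kernel_sketch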

If you either post or send a reproducing example to PGI Customer Service (trs@pgroup.com), I can take a look and see if I can find the problem.

-Mat

Thank you Mat.

I tested with the CUDA error-checking code:

call calanew<<<blocks,threads>>>( some parameters )
istat = cudaGetLastError()
print *, cudaGetErrorString(istat)

Indeed, the error occurred in the kernel!
The following error messages were returned when using 4 GPUs.

an illegal memory access was encountered
an illegal memory access was encountered
an illegal memory access was encountered
an illegal memory access was encountered
0: copyover Memcpy (dst=0x0xa00200000, src=0x0xa81c00000, size=2172723264) FAILED: 77(an illegal memory access was encountered)
0: copyover Memcpy (dst=0x0xa00200000, src=0x0xa81c00000, size=2172723264) FAILED: 77(an illegal memory access was encountered)
0: copyover Memcpy (dst=0x0xa00200000, src=0x0xa81c00000, size=2172723264) FAILED: 77(an illegal memory access was encountered)
0: copyover Memcpy (dst=0x0xa00200000, src=0x0xa81c00000, size=2172723264) FAILED: 77(an illegal memory access was encountered)

Primary job terminated normally, but 1 process returned
a non-zero exit code… Per user-direction, the job has been aborted.


mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:

Process name: [[32053,1],2]
Exit code: 127

And when I divided the computation domain across 8 GPUs, ‘No error’ was returned.


I sent a reproducing sample code to PGI Customer Service.
Please take a look at my sample if you have time.
Thank you!

It looks like your arrays are getting large. In this case, please try compiling with the flag “-Mlarge_arrays” to enable large array support (64-bit indexing for data objects larger than 2 GB).
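For example, added to your original compile line:

mpif90 -o3 -ta=tesla,cuda8.0 -Mlarge_arrays test.cuf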

When I try this with your code, I no longer see the illegal memory address errors.

-Mat

Thank you Mat!

It worked successfully!!
Your comments are always helpful.
I really appreciate it!

-Geunwoo