Hi,
I am working on a code with several subroutines calls and I am therefore trying to use the reflected and mirror directives. When I run my code I am getting the following error:
call to cuMemAlloc returned error 2: Out of memory
I have managed to reproduce the error in a test program which somehow reproduces the strucutre of the more complexe code (see below). Here I am using at most 13 double arrays (I am compiling with -r8), of size 4800*60, which should not exceed the memory on the GPU (Fermi C2070). Do you have any idea of what I am doing wrong ? Do you know any tools I could use to try to diagnose my memory problem ?
Thanks for your help,
Xavier
! test programme
module data_field
real, allocatable, dimension(:,:) :: a
END module data_field
module computation
implicit none
contains
subroutine gpu_routine(nvec,nlev,a,b,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
real, intent(inout) :: a(nvec,nlev)
real, intent(in) :: b(nvec,nlev)
real, intent(in), dimension(nvec,nlev) :: &
M1,M2,M3,M4,M5,M6,M7,M8,M9,M10
integer, intent(in) :: nvec,nlev
real :: pp(nvec,nlev)
integer :: i,k, idummy
!$acc reflected(a,b)
!$acc reflected(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
!$acc local(pp)
!$acc region do kernel
do i=1,nvec
do k=1,nlev
pp(i,k)=0.1
end do
end do
!$acc end region
!$acc region
!$acc do kernel, parallel, vector(128)
do i=1,nvec
do k=2,nlev !level
do idummy=1,10 ! just to imitate many operations
a(i,k)=a(i,k-1)+b(i,k)*(log(1+0.1*idummy+b(i,k)+pp(i,k)))
end do
a(i,k)=a(i,k)*( 1+M1(i,k)*M2(i,k)*M3(i,k)*M4(i,k) &
*M5(i,k)*M6(i,k)*M7(i,k)*M8(i,k)*M9(i,k)*M10(i,k))
end do !level
end do !i
!$acc end region
end subroutine gpu_routine
end module computation
program main
USE data_field, only: a
USE computation, only: gpu_routine
implicit none
real, allocatable :: b(:,:)
real, allocatable, dimension(:,:) :: M1,M2,M3,M4,M5,M6, &
M7,M8,M9,M10
integer, parameter :: nlev=60
integer :: n1,n2,nargs,i,k,nt
character*10 arg
integer :: nvec
real :: rt
!$acc mirror(b)
!$acc mirror(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
!$acc mirror(a)
nargs = command_argument_count()
if( nargs == 2 ) then
call getarg( 1, arg )
read(arg,'(i)') n1
call getarg( 2, arg )
read(arg,'(i)') n2
else
write(*,*) 'usage ./Tgpu5 n1 n2'
stop
endif
nvec=n1*n2
allocate(a(nvec,nlev),b(nvec,nlev))
allocate(M1(nvec,nlev),M2(nvec,nlev),M3(nvec,nlev)&
,M4(nvec,nlev),M5(nvec,nlev),M6(nvec,nlev)&
,M7(nvec,nlev),M8(nvec,nlev),M9(nvec,nlev),M10(nvec,nlev))
a=0.0;b=0.1
M1=0.1;M2=0.2;M3=0.3;M4=0.4
M5=0.5;M6=0.6;M7=0.7;M8=0.8
M9=0.9;M10=1.0
! Update data
!$acc update device(a,b)
!$acc update device(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
do nt=1,100
call gpu_routine(nvec,nlev,a,b,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
end do
!$acc update host(a)
print*, 'n1 =', n1, 'n2=', n2, sum(a)/(n1*n2)
write(*,*) 'real kind :', KIND(1.0)
deallocate(a,b)
deallocate(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
end program main
I have been compiling with:
pgf90 -Kieee -r8 -ta=nvidia,cuda3.2,nofma -O3 -Minfo=accel -o Tgpu5 Tgpu5.f90
and I am using pgi 11.3 version
This test code is run with n1=80 and n2=60