cuMemAlloc error 2 with mirror and reflected directives

Hi,

I am working on a code with several subroutines calls and I am therefore trying to use the reflected and mirror directives. When I run my code I am getting the following error:

call to cuMemAlloc returned error 2: Out of memory

I have managed to reproduce the error in a test program which somehow reproduces the strucutre of the more complexe code (see below). Here I am using at most 13 double arrays (I am compiling with -r8), of size 4800*60, which should not exceed the memory on the GPU (Fermi C2070). Do you have any idea of what I am doing wrong ? Do you know any tools I could use to try to diagnose my memory problem ?

Thanks for your help,

Xavier

! test programme 

module data_field
  real, allocatable, dimension(:,:) :: a
END module data_field

module computation
  implicit none
  
contains
subroutine gpu_routine(nvec,nlev,a,b,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
  real, intent(inout) :: a(nvec,nlev)
  real, intent(in) :: b(nvec,nlev)
  real, intent(in), dimension(nvec,nlev) :: &
       M1,M2,M3,M4,M5,M6,M7,M8,M9,M10
  integer, intent(in) :: nvec,nlev
  real :: pp(nvec,nlev)
  integer :: i,k, idummy
  !$acc reflected(a,b)
  !$acc reflected(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
  !$acc local(pp)

!$acc region do kernel
do i=1,nvec
     do k=1,nlev        
           pp(i,k)=0.1
        end do
  end do
  !$acc end region

 !$acc region      
    !$acc do kernel, parallel, vector(128)
    do i=1,nvec       
       do k=2,nlev !level
              
             do idummy=1,10 ! just to imitate many operations
             a(i,k)=a(i,k-1)+b(i,k)*(log(1+0.1*idummy+b(i,k)+pp(i,k)))
             end do
             a(i,k)=a(i,k)*( 1+M1(i,k)*M2(i,k)*M3(i,k)*M4(i,k) &
                  *M5(i,k)*M6(i,k)*M7(i,k)*M8(i,k)*M9(i,k)*M10(i,k))

       end do !level
    end do !i
 !$acc end region
end subroutine gpu_routine

end module computation
  
program main
  USE data_field, only: a
  USE computation, only: gpu_routine
  implicit none
  real, allocatable :: b(:,:)
  real, allocatable, dimension(:,:) :: M1,M2,M3,M4,M5,M6, &
       M7,M8,M9,M10
  integer, parameter :: nlev=60
  integer :: n1,n2,nargs,i,k,nt
  character*10 arg
  integer :: nvec
  real :: rt
  !$acc mirror(b)
  !$acc mirror(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
  !$acc mirror(a)
 
  nargs = command_argument_count()

  if( nargs == 2 ) then
     call getarg( 1, arg )
     read(arg,'(i)') n1
     call getarg( 2, arg )
     read(arg,'(i)') n2
  else
     write(*,*) 'usage ./Tgpu5 n1 n2'
     stop
  endif
 
 nvec=n1*n2
 
 allocate(a(nvec,nlev),b(nvec,nlev))
 allocate(M1(nvec,nlev),M2(nvec,nlev),M3(nvec,nlev)&
      ,M4(nvec,nlev),M5(nvec,nlev),M6(nvec,nlev)&
      ,M7(nvec,nlev),M8(nvec,nlev),M9(nvec,nlev),M10(nvec,nlev))

a=0.0;b=0.1
M1=0.1;M2=0.2;M3=0.3;M4=0.4
M5=0.5;M6=0.6;M7=0.7;M8=0.8
M9=0.9;M10=1.0

! Update data
!$acc update device(a,b)
!$acc update device(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)

 do nt=1,100
    call gpu_routine(nvec,nlev,a,b,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
 end do
 
 !$acc update host(a)

 print*, 'n1 =', n1, 'n2=', n2, sum(a)/(n1*n2)
 write(*,*) 'real kind :', KIND(1.0)

deallocate(a,b)
deallocate(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)

end program main

I have been compiling with:

pgf90 -Kieee -r8 -ta=nvidia,cuda3.2,nofma -O3 -Minfo=accel -o Tgpu5 Tgpu5.f90

and I am using pgi 11.3 version

This test code is run with n1=80 and n2=60

Hi Xavier,

It looks to me like the compiler it trying to allocate the GPU data before the allocate routine is being called. If’ send in a report (TPR#17774) and ask our engineers to investigate.

The work arounds would be to either move the mirrored directive and the GPU arrays into the data_field module. The second is to create a data region using the mirrored directive just after the variables are allocated.

Thanks!
Mat

% cat test1.f90
! test programme

module data_field
  real, allocatable, dimension(:,:) :: a
  real, allocatable :: b(:,:)
  real, allocatable, dimension(:,:) :: M1,M2,M3,M4,M5,M6, &
       M7,M8,M9,M10
  !$acc mirror(b)
  !$acc mirror(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
  !$acc mirror(a)
END module data_field

module computation
  implicit none
 
contains
subroutine gpu_routine(nvec,nlev,a,b,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
  real, intent(inout) :: a(nvec,nlev)
  real, intent(in) :: b(nvec,nlev)
  real, intent(in), dimension(nvec,nlev) :: &
       M1,M2,M3,M4,M5,M6,M7,M8,M9,M10
  integer, intent(in) :: nvec,nlev
  real :: pp(nvec,nlev)
  integer :: i,k, idummy
  !$acc reflected(a,b)
  !$acc reflected(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
  !$acc local(pp)

!$acc region do kernel
do i=1,nvec
     do k=1,nlev       
           pp(i,k)=0.1
        end do
  end do
  !$acc end region

 !$acc region     
    !$acc do kernel, parallel, vector(128)
    do i=1,nvec       
       do k=2,nlev !level
             
             do idummy=1,10 ! just to imitate many operations
             a(i,k)=a(i,k-1)+b(i,k)*(log(1+0.1*idummy+b(i,k)+pp(i,k)))
             end do
             a(i,k)=a(i,k)*( 1+M1(i,k)*M2(i,k)*M3(i,k)*M4(i,k) &
                  *M5(i,k)*M6(i,k)*M7(i,k)*M8(i,k)*M9(i,k)*M10(i,k))

       end do !level
    end do !i
 !$acc end region
end subroutine gpu_routine

end module computation
 
program main
  USE data_field
  USE computation, only: gpu_routine
  implicit none
  integer, parameter :: nlev=60
  integer :: n1,n2,nargs,i,k,nt
  character*10 arg
  integer :: nvec
  real :: rt
 
  nargs = command_argument_count()

  if( nargs == 2 ) then
     call getarg( 1, arg )
     read(arg,'(i)') n1
     call getarg( 2, arg )
     read(arg,'(i)') n2
  else
     write(*,*) 'usage ./Tgpu5 n1 n2'
     stop
  endif
 
 nvec=n1*n2
 
 allocate(a(nvec,nlev),b(nvec,nlev))
 allocate(M1(nvec,nlev),M2(nvec,nlev),M3(nvec,nlev)&
      ,M4(nvec,nlev),M5(nvec,nlev),M6(nvec,nlev)&
      ,M7(nvec,nlev),M8(nvec,nlev),M9(nvec,nlev),M10(nvec,nlev))

a=0.0;b=0.1
M1=0.1;M2=0.2;M3=0.3;M4=0.4
M5=0.5;M6=0.6;M7=0.7;M8=0.8
M9=0.9;M10=1.0

! Update data
!$acc update device(a,b)
!$acc update device(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)

 do nt=1,100
    call gpu_routine(nvec,nlev,a,b,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
 end do
 
 !$acc update host(a)

 print*, 'n1 =', n1, 'n2=', n2, sum(a)/(n1*n2)
 write(*,*) 'real kind :', KIND(1.0)

deallocate(a,b)
deallocate(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)

end program main

% pgf90 -Kieee -r8 -ta=nvidia,cuda3.2,nofma -O3 test1.f90 -V11.4 ; a.out 80 60
 n1 =           80 n2=           60    140.5921228128760     
 real kind :            8



% cat test2.f90 

! test programme

module data_field
  real, allocatable, dimension(:,:) :: a
END module data_field

module computation
  implicit none
 
contains
subroutine gpu_routine(nvec,nlev,a,b,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
  real, intent(inout) :: a(nvec,nlev)
  real, intent(in) :: b(nvec,nlev)
  real, intent(in), dimension(nvec,nlev) :: &
       M1,M2,M3,M4,M5,M6,M7,M8,M9,M10
  integer, intent(in) :: nvec,nlev
  real :: pp(nvec,nlev)
  integer :: i,k, idummy
  !$acc reflected(a,b)
  !$acc reflected(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
  !$acc local(pp)

!$acc region do kernel
do i=1,nvec
     do k=1,nlev       
           pp(i,k)=0.1
        end do
  end do
  !$acc end region

 !$acc region     
    !$acc do kernel, parallel, vector(128)
    do i=1,nvec       
       do k=2,nlev !level
             
             do idummy=1,10 ! just to imitate many operations
             a(i,k)=a(i,k-1)+b(i,k)*(log(1+0.1*idummy+b(i,k)+pp(i,k)))
             end do
             a(i,k)=a(i,k)*( 1+M1(i,k)*M2(i,k)*M3(i,k)*M4(i,k) &
                  *M5(i,k)*M6(i,k)*M7(i,k)*M8(i,k)*M9(i,k)*M10(i,k))

       end do !level
    end do !i
 !$acc end region
end subroutine gpu_routine

end module computation
 
program main
  USE data_field, only: a
  USE computation, only: gpu_routine
  implicit none
  real, allocatable :: b(:,:)
  real, allocatable, dimension(:,:) :: M1,M2,M3,M4,M5,M6, &
       M7,M8,M9,M10
  integer, parameter :: nlev=60
  integer :: n1,n2,nargs,i,k,nt
  character*10 arg
  integer :: nvec
  real :: rt
 
  nargs = command_argument_count()

  if( nargs == 2 ) then
     call getarg( 1, arg )
     read(arg,'(i)') n1
     call getarg( 2, arg )
     read(arg,'(i)') n2
  else
     write(*,*) 'usage ./Tgpu5 n1 n2'
     stop
  endif
 
 nvec=n1*n2
 
 allocate(a(nvec,nlev),b(nvec,nlev))
 allocate(M1(nvec,nlev),M2(nvec,nlev),M3(nvec,nlev)&
      ,M4(nvec,nlev),M5(nvec,nlev),M6(nvec,nlev)&
      ,M7(nvec,nlev),M8(nvec,nlev),M9(nvec,nlev),M10(nvec,nlev))

!$acc data region mirror(b) &
!$acc mirror(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10) &
!$acc mirror(a) 


a=0.0;b=0.1
M1=0.1;M2=0.2;M3=0.3;M4=0.4
M5=0.5;M6=0.6;M7=0.7;M8=0.8
M9=0.9;M10=1.0

! Update data
!$acc update device(a,b)
!$acc update device(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)

 do nt=1,100
    call gpu_routine(nvec,nlev,a,b,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)
 end do
 
 !$acc update host(a)

 print*, 'n1 =', n1, 'n2=', n2, sum(a)/(n1*n2)
 write(*,*) 'real kind :', KIND(1.0)

deallocate(a,b)
deallocate(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10)

!$ACC END DATA REGION

end program main 

% pgf90 -Kieee -r8 -ta=nvidia,cuda3.2,nofma -O3 test2.f90 -V11.4 ; a.out 80 60
 n1 =           80 n2=           60    140.5921228128760     
 real kind :            8

Hi,

Thanks for your reply. The work around works fine for the test case but unfortunatly not for my more complex code. I have just send the full code to the user support.

Xavier

Final comment: The problem I have encountered with mirror directive has been fixed in PGI 11.5, my code is now working.

Xavier