Constant coefficient matrices defined in an external module

Hi,

I have some constant coefficient matrices that are defined in modules. I would like to copy them to the GPU in the main program and then use them in subroutines via a use statement.

Here is an example:

! test programme 
! Module holding the constant coefficient table that other program units
! pick up through "use data_par".
! There is no "implicit none" in this module, so the implied-do index i in
! the DATA statement is implicitly typed.
module data_par
  ! Five coefficients; their values are set by the DATA statement below.
  real :: coef(5)
  ! NOTE(review): per the compiler messages quoted in this thread, with this
  ! directive a copyin of coef is still generated inside gpu_routine and the
  ! "update device(coef)" in main is rejected -- confirm the intended usage.
  !$acc local(coef)
  DATA (coef(i),i=1,5)/ .1 , .4, .8, 1.0, 1.2/

end module data_par

! Module providing gpu_routine, which uses the coefficient table coef
! obtained from module data_par by use association.
module computation
  USE data_par
  implicit none
contains

! Scale-and-accumulate sweep over a, performed in place.
!   nvec : upper bound of the i loop (first index of a)
!   nlev : upper bound of the k loop (second index of a)
!   a    : 2-D array, read and overwritten for k = 2..nlev; column k = 1 is
!          only read, never written
!   ic   : index of the entry of coef used as the multiplier
subroutine gpu_routine(nvec,nlev,a,ic)
  real, intent(inout) :: a(:,:)
  integer, intent(in) :: nvec,nlev,ic
  integer :: i,k
  ! NOTE(review): PGI Accelerator directive; presumably states that the
  ! caller already holds a device copy of a -- confirm in the PGI docs.
  !$acc reflected(a)  

     !$acc region do kernel 
     do i=1,nvec       
        ! a(i,k-1) was overwritten by the previous k iteration, so the
        ! levels have to be processed in increasing k order.
        do k=2,nlev 
           a(i,k)=coef(ic)*(a(i,k)+a(i,k-1))
        end do
     end do
     !$acc end region

end subroutine gpu_routine

end module computation
  
! Driver: fills a with 0.1, calls gpu_routine once for each coefficient
! index ic = 1..5, then prints sum(a).
program main
  USE data_par
  USE computation, only: gpu_routine
  implicit none
  real, allocatable :: a(:,:)
  !$acc mirror(a)
  ! n1 and nlev give the extents of a and the loop bounds passed down.
  integer, parameter :: n1=10000, nlev=60
  integer :: ic
     
  allocate(a(n1,nlev))
  !init a
  a=0.1
  ! Send the host contents of a and coef to the accelerator before the calls.
  ! NOTE(review): the compiler output quoted in this thread reports that the
  ! update of coef "is not within a data region for this array".
  !$acc update device(a) 
  !$acc update device(coef)   

  ! Each pass uses a different entry of coef as the multiplier.
  do ic=1,5 
     call gpu_routine(n1,nlev,a,ic)
  end do

  ! Bring the result back to the host before it is summed and printed.
  !$acc update host(a)
  print*, sum(a)

end program main

Where the coefficient matrix coef is defined in module data_par.

This approach does not do what I want; I get the following messages from the compiler:

...
gpu_routine:
     19, Generating reflected(a(:,:))
     21, Generating copyin(coef(ic))
..
 46, update device(coef) is not within a data region for this array

showing that a copyin of coef is generated inside the subroutine gpu_routine and that the update in the main code cannot be generated.

I then tried to replace the

 !$acc local(coef)

with

 !$acc mirror(coef)

This is probably not valid, since coef is not allocatable, but the compiler output looks good: there is no copyin, and it generates an update device(coef(:)). However, when I try to run the code I get the following error:

call to cuMemcpyHtoD returned error 1: Invalid value
CUDA driver version: 4000

I guess the array coef was not allocated on the device in this case.

Any idea on how I should proceed ?

Thanks,
Xavier

Hi Xavier,

Any idea on how I should proceed ?

The ‘mirror’ directive would be best, but since it’s only available with allocatable arrays, you need to change coef to an allocatable array and initialize it at runtime.

% cat data_par1.f90 
! test programme
! Module holding the coefficient table as an allocatable array; it is
! allocated and filled at run time by the program that uses this module.
module data_par
  real,allocatable,dimension(:) :: coef
  ! mirror is applied to the allocatable array coef (the accompanying reply
  ! states that mirror is only available for allocatable arrays).
  !$acc mirror(coef)

end module data_par

! Module providing gpu_routine; coef is made visible here by "use data_par".
module computation
  USE data_par
  implicit none
contains

! In-place update of a using one entry of coef as the multiplier.
!   nvec : upper bound of the i loop (first index of a)
!   nlev : upper bound of the k loop (second index of a)
!   a    : 2-D array, overwritten for k = 2..nlev; column k = 1 is only read
!   ic   : index into coef selecting the multiplier
subroutine gpu_routine(nvec,nlev,a,ic)
  real, intent(inout) :: a(:,:)
  integer, intent(in) :: nvec,nlev,ic
  integer :: i,k
  ! NOTE(review): PGI Accelerator directive; presumably relies on the caller
  ! already holding a device copy of a -- confirm in the PGI docs.
  !$acc reflected(a) 

     !$acc region do kernel
     do i=1,nvec       
        ! Each k iteration reads a(i,k-1) written by the previous one; the
        ! compiler output below reports this as a loop-carried dependence.
        do k=2,nlev
           a(i,k)=coef(ic)*(a(i,k)+a(i,k-1))
        end do
     end do
     !$acc end region

end subroutine gpu_routine

end module computation
 
! Driver: allocates and fills coef at run time, fills a with 0.1, calls
! gpu_routine once for each ic = 1..5, then prints sum(a).
program main
  USE data_par
  USE computation, only: gpu_routine
  implicit none
  real, allocatable :: a(:,:)
  !$acc mirror(a)
  ! n1 and nlev give the extents of a and the loop bounds passed down.
  integer, parameter :: n1=10000, nlev=60
  integer :: ic
  
  ! Run-time initialisation of the coefficient table (coef is allocatable
  ! in data_par, so it cannot be filled by a DATA statement here).
  allocate(coef(5))   
  coef(1)=.1 
  coef(2)=.4
  coef(3)=.8 
  coef(4)=1.0
  coef(5)=1.2
! Send the freshly assigned coefficients to the accelerator.
!$acc update device(coef)
  
  allocate(a(n1,nlev))
  !init a
  a=0.1
  !$acc update device(a)
  ! NOTE(review): coef is not modified between the update above and this
  ! one, so this second update looks redundant -- confirm before removing.
  !$acc update device(coef)   

  ! Each pass uses a different entry of coef as the multiplier.
  do ic=1,5
     call gpu_routine(n1,nlev,a,ic)
  end do

  ! Bring the result back to the host before it is summed and printed.
  !$acc update host(a)
  print*, sum(a)

end program main 
% pgf90 data_par1.f90 -Minfo=accel -V11.9 -ta=nvidia
gpu_routine:
     17, Generating reflected(a(:,:))
     19, Generating compute capability 1.0 binary
         Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
     20, Loop is parallelizable
         Accelerator kernel generated
         20, !$acc do parallel, vector(256) ! blockidx%x threadidx%x
             CC 1.0 : 14 registers; 76 shared, 8 constant, 0 local memory bytes; 66% occupancy
             CC 1.3 : 14 registers; 76 shared, 8 constant, 0 local memory bytes; 100% occupancy
             CC 2.0 : 22 registers; 4 shared, 88 constant, 0 local memory bytes; 83% occupancy
     21, Loop carried dependence of 'a' prevents parallelization
         Loop carried backward dependence of 'a' prevents vectorization
main:
     36, Generating local(a(:,:))
     46, Generating !$acc update device(coef(:))
     51, Generating !$acc update device(a(:,:))
     52, Generating !$acc update device(coef(:))
     58, Generating !$acc update host(a(:,:))
% a.out
launch kernel  file=/local/home/colgrove/data_par1.f90 function=gpu_routine line=20 device=0 grid=40 block=256
launch kernel  file=/local/home/colgrove/data_par1.f90 function=gpu_routine line=20 device=0 grid=40 block=256
launch kernel  file=/local/home/colgrove/data_par1.f90 function=gpu_routine line=20 device=0 grid=40 block=256
launch kernel  file=/local/home/colgrove/data_par1.f90 function=gpu_routine line=20 device=0 grid=40 block=256
launch kernel  file=/local/home/colgrove/data_par1.f90 function=gpu_routine line=20 device=0 grid=40 block=256
   1.0310200E+10

Hope this helps,
Mat

Thanks for your reply. The solution with allocatable is fine for testing.
However, we have quite a lot of those matrices defined with DATA statements, and it would be really great if we could keep this part of the code as it is.

I was wondering whether this would be possible using the “device present” directive (mentioned in the v1.3 documentation) and a data region around the call to gpu_routine:

! Module holding the constant coefficient table, initialised by a DATA
! statement and carrying no accelerator directive of its own.
! There is no "implicit none" here, so the implied-do index i is
! implicitly typed.
module data_par
  real :: coef(5)
  DATA (coef(i),i=1,5)/ .1 , .4, .8, 1.0, 1.2/

end module data_par

! Module providing gpu_routine; coef is made visible here by "use data_par".
module computation
  USE data_par
  implicit none
contains

! In-place update of a using one entry of coef as the multiplier.
!   nvec : upper bound of the i loop (first index of a)
!   nlev : upper bound of the k loop (second index of a)
!   a    : 2-D array, overwritten for k = 2..nlev; column k = 1 is only read
!   ic   : index into coef selecting the multiplier
subroutine gpu_routine(nvec,nlev,a,ic)
  real, intent(inout) :: a(:,:)
  integer, intent(in) :: nvec,nlev,ic
  integer :: i,k
  !$acc reflected(a)  

  ! Proposed usage: tell the compiler that coef is already on the device
  ! (placed there by the data region in the caller).
  ! NOTE(review): the compiler used in this thread rejects this directive
  ! with "Syntax error at or near DEVICE".
  !$acc device present(coef)

     !$acc region do kernel 
     do i=1,nvec       
        ! a(i,k-1) was overwritten by the previous k iteration, so the
        ! levels have to be processed in increasing k order.
        do k=2,nlev 
           a(i,k)=coef(ic)*(a(i,k)+a(i,k-1))
        end do
     end do
     !$acc end region

end subroutine gpu_routine

end module computation
  
! Driver for the proposed variant: the calls to gpu_routine are wrapped in a
! data region that names coef, and the result sum(a) is printed afterwards.
program main
  USE data_par
  USE computation, only: gpu_routine
  implicit none
  real, allocatable :: a(:,:)
  !$acc mirror(a)
  ! n1 and nlev give the extents of a and the loop bounds passed down.
  integer, parameter :: n1=10000, nlev=60
  integer :: ic
     
  allocate(a(n1,nlev))
  !init a
  a=0.1

  ! The data region encloses both updates and all calls to gpu_routine.
  !$acc data region local(coef)
  
  !$acc update device(a) 
  !$acc update device(coef)   

  ! Each pass uses a different entry of coef as the multiplier.
  do ic=1,5 
     call gpu_routine(n1,nlev,a,ic)
  end do

  ! Bring the result back to the host before the region ends.
  !$acc update host(a)
  
  !$acc end data region
print*, sum(a)

end program main

This feature is not yet supported (I got a “PGF90-S-0034-Syntax error at or near DEVICE (Tgpu7.f90: 20)” error), but would this be a valid usage of this directive sometime in the future ?

Xavier