I have simplified the code and prepared three examples. The first one does not compile, and the error message is now clear:
PGF90-S-0155-acc routine cannot be used for contained subprograms that refer to host subprogram data: n (prova.f90)
0 inform, 0 warnings, 1 severes, 0 fatal for norm_comp
Thus it seems it cannot be done at present.
I include three examples of the SAME algorithm (a poor implementation of a matrix-vector product): the first is written in f90 style with `contains`, and the second passes all variables explicitly as subroutine arguments, as in f77 (the third case is described further below). If I cannot use OpenACC with this f90 style, that is a problem.
The two codes differ only in programming style, and they should be equivalent for a reasonable compiler/paradigm (pgf90/OpenACC).
If this is not possible, I cannot port my application code to the GPU in a reasonable time, because going back to f77 is a real pain.
Notice also that if I use `contains` in the main program, the code also works (third case). So, in my opinion, it would be worthwhile for PGI/OpenACC to make the first code work as well.
Many thanks for any help (maybe there is a workaround?).
This is the sample case that cannot be compiled:
! Example 1 driver: builds an N x N matrix A and a vector b, calls the
! external subroutine matvec to form c = A*b, normalises c by its Euclidean
! norm into b, and prints the result. When compiled with OpenACC it first
! selects NVIDIA device 0 and stops if that device cannot be used.
program prova
#ifdef _OPENACC
  use openacc
#endif
  implicit none
  integer n,i,j
  real*8, dimension(:,:), allocatable:: a
  real*8, dimension(:), allocatable:: b,c
  real*8 csum
#ifdef _OPENACC
  integer mygpu, myrealgpu, num_devices, my_device_type
  ! NOTE(review): the main program never calls norm_comp directly; the
  ! directive is kept exactly as in the original example.
!$acc routine(norm_comp) seq
  my_device_type = acc_device_nvidia
  mygpu = 0
  call acc_set_device_type(my_device_type)
  num_devices = acc_get_num_devices(my_device_type)
  write(6,*) ' Number of devices available: ',num_devices
  call acc_set_device_num(mygpu,my_device_type)
  write(6,*) 'Trying to use GPU:',mygpu
  ! Read the device number back to check that the request was honoured.
  myrealgpu = acc_get_device_num(my_device_type)
  write(6,*) 'Actually I am using GPU: ',myrealgpu
  if(mygpu.ne.myrealgpu) then
    write(6,*) 'I cannot use the requested GPU:',mygpu
    stop
  endif
#endif
  write(6,*) 'Input N leading dimension square matrix A'
  read(*,*) n
  allocate(b(n),c(n))
  allocate(a(n,n))
  ! Fill A with sin((i-j)/N) off the diagonal and 1 on the diagonal;
  ! b gets an arbitrary, deterministic initial value.
  do i=1,n
    do j=1,n
      a(i,j)=dsin(dble(i-j)/n)
    enddo
    a(i,i)=1.d0
    b(i)=cos(dble(i)**2-3*i+1) ! random init
  enddo
  call matvec(n,a,b,c)
  ! Euclidean norm of c.
  csum=0.d0
!$acc parallel loop reduction(+:csum)
  do i=1,n
    csum=csum+c(i)**2
  enddo
!$acc end parallel loop
  csum=sqrt(csum)
  ! b <- c / ||c||
!$acc parallel loop
  do i=1,n
    b(i)=c(i)/csum
  enddo
!$acc end parallel loop
  write(6,*) ' Final b -> A b '
  do i=1,n
    write(6,*) i,b(i)
  enddo
  stop
end program prova
! Example 1 (f90 style): c = A*b, one row per iteration of the parallel loop.
! The row dot product lives in the internal procedure norm_comp, which takes
! no arguments and reads n, a, b and i (and writes csum) through host
! association. This host-data reference is intentional: it is the construct
! that pgf90 rejects with PGF90-S-0155 ("acc routine cannot be used for
! contained subprograms that refer to host subprogram data").
!
!   n : order of the square matrix
!   a : n x n matrix
!   b : input vector of length n
!   c : result vector, c(i) = sum_j a(i,j)*b(j)
subroutine matvec(n,a,b,c)
  implicit none
  integer, intent(in) :: n
  real*8, intent(in) :: a(n,n), b(n)
  real*8, intent(out) :: c(n)
  integer i
  real*8 csum
!$acc routine(norm_comp) seq
!$acc parallel loop
  do i=1,n
    ! Inline equivalent of the call below:
    !   csum=0.d0
    !   do j=1,n
    !     csum=csum+a(i,j)*b(j)
    !   enddo
    call norm_comp
    c(i)=csum
  enddo
!$acc end parallel loop
contains
  ! Dot product of row i of a with b, left in the host's csum.
  ! csum must NOT be redeclared here: a local csum would shadow the host
  ! variable and the result would never reach c(i).
  subroutine norm_comp
    implicit none
    integer j
!$acc routine seq
    csum=0.d0
    do j=1,n
      csum=csum+a(i,j)*b(j)
    enddo
  end subroutine norm_comp
end subroutine matvec
By contrast, the following version can be compiled: all variables are passed as subroutine arguments, as in a standard f77 code:
! Example 2 driver: builds an N x N matrix A and a vector b, calls the
! external subroutine matvec to form c = A*b, normalises c by its Euclidean
! norm into b, and prints the result. When compiled with OpenACC it first
! selects NVIDIA device 0 and stops if that device cannot be used.
program prova
#ifdef _OPENACC
  use openacc
#endif
  implicit none
  integer n,i,j
  real*8, dimension(:,:), allocatable:: a
  real*8, dimension(:), allocatable:: b,c
  real*8 csum
#ifdef _OPENACC
  integer mygpu, myrealgpu, num_devices, my_device_type
  ! NOTE(review): the main program never calls norm_comp directly; the
  ! directive is kept exactly as in the original example.
!$acc routine(norm_comp) seq
  my_device_type = acc_device_nvidia
  mygpu = 0
  call acc_set_device_type(my_device_type)
  num_devices = acc_get_num_devices(my_device_type)
  write(6,*) ' Number of devices available: ',num_devices
  call acc_set_device_num(mygpu,my_device_type)
  write(6,*) 'Trying to use GPU:',mygpu
  ! Read the device number back to check that the request was honoured.
  myrealgpu = acc_get_device_num(my_device_type)
  write(6,*) 'Actually I am using GPU: ',myrealgpu
  if(mygpu.ne.myrealgpu) then
    write(6,*) 'I cannot use the requested GPU:',mygpu
    stop
  endif
#endif
  write(6,*) 'Input N leading dimension square matrix A'
  read(*,*) n
  allocate(b(n),c(n))
  allocate(a(n,n))
  ! Fill A with sin((i-j)/N) off the diagonal and 1 on the diagonal;
  ! b gets an arbitrary, deterministic initial value.
  do i=1,n
    do j=1,n
      a(i,j)=dsin(dble(i-j)/n)
    enddo
    a(i,i)=1.d0
    b(i)=cos(dble(i)**2-3*i+1) ! random init
  enddo
  call matvec(n,a,b,c)
  ! Euclidean norm of c.
  csum=0.d0
!$acc parallel loop reduction(+:csum)
  do i=1,n
    csum=csum+c(i)**2
  enddo
!$acc end parallel loop
  csum=sqrt(csum)
  ! b <- c / ||c||
!$acc parallel loop
  do i=1,n
    b(i)=c(i)/csum
  enddo
!$acc end parallel loop
  write(6,*) ' Final b -> A b '
  do i=1,n
    write(6,*) i,b(i)
  enddo
  stop
end program prova
! Example 2 (f77 style): c = A*b, one row per iteration of the parallel loop.
! Everything the helper needs is passed explicitly as arguments.
!
!   n : order of the square matrix
!   a : n x n matrix
!   b : input vector of length n
!   c : result vector, c(i) = sum_j a(i,j)*b(j)
subroutine matvec(n,a,b,c)
  implicit none
  integer, intent(in) :: n
  real*8, intent(in) :: a(n,n), b(n)
  real*8, intent(out) :: c(n)
  integer i
  real*8 csum
  ! Must agree with the "routine seq" declared inside norm_comp itself.
!$acc routine(norm_comp) seq
  ! csum is a per-iteration temporary, so every iteration gets its own copy.
!$acc parallel loop private(csum)
  do i=1,n
    ! Inline equivalent of the call below:
    !   csum=0.d0
    !   do j=1,n
    !     csum=csum+a(i,j)*b(j)
    !   enddo
    call norm_comp(i,n,a,b,csum)
    c(i)=csum
  enddo
!$acc end parallel loop
end subroutine matvec

! Dot product of row i of a with b, returned in csum.
! csum is an output argument: accumulating into a local variable would
! discard the result and leave the caller's csum undefined.
!
!   i    : row index into a
!   n    : order of the square matrix
!   a, b : matrix and vector operands
!   csum : on return, sum_j a(i,j)*b(j)
subroutine norm_comp(i,n,a,b,csum)
  implicit none
  integer, intent(in) :: i, n
  real*8, intent(in) :: a(n,n), b(n)
  real*8, intent(out) :: csum
  integer j
!$acc routine seq
  csum=0.d0
  do j=1,n
    csum=csum+a(i,j)*b(j)
  enddo
end subroutine norm_comp
The third case, in which the main program uses `contains` to include a subroutine
that is meant to run on the accelerator:
! Example 3 driver: same computation as the other examples, but matvec is an
! internal procedure of the main program (after "contains") and uses the
! program's n, a, b, c and csum through host association. The row dot
! product is done by the external subroutine norm_comp, which receives all
! of its data as arguments. When compiled with OpenACC the program first
! selects NVIDIA device 0 and stops if that device cannot be used.
program prova
#ifdef _OPENACC
  use openacc
#endif
  implicit none
  integer n,i,j
  real*8, dimension(:,:), allocatable:: a
  real*8, dimension(:), allocatable:: b,c
  real*8 csum
#ifdef _OPENACC
  integer mygpu, myrealgpu, num_devices, my_device_type
!$acc routine(norm_comp) seq
  my_device_type = acc_device_nvidia
  mygpu = 0
  call acc_set_device_type(my_device_type)
  num_devices = acc_get_num_devices(my_device_type)
  write(6,*) ' Number of devices available: ',num_devices
  call acc_set_device_num(mygpu,my_device_type)
  write(6,*) 'Trying to use GPU:',mygpu
  ! Read the device number back to check that the request was honoured.
  myrealgpu = acc_get_device_num(my_device_type)
  write(6,*) 'Actually I am using GPU: ',myrealgpu
  if(mygpu.ne.myrealgpu) then
    write(6,*) 'I cannot use the requested GPU:',mygpu
    stop
  endif
#endif
  write(6,*) 'Input N leading dimension square matrix A'
  read(*,*) n
  allocate(b(n),c(n))
  allocate(a(n,n))
  ! Fill A with sin((i-j)/N) off the diagonal and 1 on the diagonal;
  ! b gets an arbitrary, deterministic initial value.
  do i=1,n
    do j=1,n
      a(i,j)=dsin(dble(i-j)/n)
    enddo
    a(i,i)=1.d0
    b(i)=cos(dble(i)**2-3*i+1) ! random init
  enddo
  call matvec
  ! Euclidean norm of c.
  csum=0.d0
!$acc parallel loop reduction(+:csum)
  do i=1,n
    csum=csum+c(i)**2
  enddo
!$acc end parallel loop
  csum=sqrt(csum)
  ! b <- c / ||c||
!$acc parallel loop
  do i=1,n
    b(i)=c(i)/csum
  enddo
!$acc end parallel loop
  write(6,*) ' Final b -> A b '
  do i=1,n
    write(6,*) i,b(i)
  enddo
  stop
contains
  ! c = A*b using the host program's n, a, b, c; csum is used as a
  ! per-row temporary. The loop index i is local and hides the host's i.
  subroutine matvec
    implicit none
    integer i
    ! Must agree with the "routine seq" declared inside norm_comp itself.
!$acc routine(norm_comp) seq
    ! csum is a per-iteration temporary, so every iteration gets its own copy.
!$acc parallel loop private(csum)
    do i=1,n
      ! Inline equivalent of the call below:
      !   csum=0.d0
      !   do j=1,n
      !     csum=csum+a(i,j)*b(j)
      !   enddo
      call norm_comp(n,i,csum,a,b)
      c(i)=csum
    enddo
!$acc end parallel loop
  end subroutine matvec
end program prova
! Dot product of row i of A with b, returned in csum. Declared as a
! sequential OpenACC routine.
!
!   n    : order of the square matrix
!   i    : row index into A
!   csum : on return, sum_j A(i,j)*b(j)
!   a, b : matrix and vector operands
subroutine norm_comp(n,i,csum,a,b)
  implicit none
  integer, intent(in) :: n
  integer, intent(in) :: i
  real*8, intent(out) :: csum
  real*8, intent(in) :: b(n)
  real*8, intent(in) :: a(n,n)
  integer j
!$acc routine seq
  csum=0.d0
  do j=1,n
    csum=csum+a(i,j)*b(j)
  enddo
end subroutine norm_comp