I have simplified the code and prepared three examples. The first one does not compile, and the error message is now clear:

PGF90-S-0155-acc routine cannot be used for contained subprograms that refer to host subprogram data: n (prova.f90)

0 inform, 0 warnings, 1 severes, 0 fatal for norm_comp

Thus it seems it cannot be done at present.

I include three examples of the SAME algorithm (a naive implementation of a matrix-vector product). The first one is written in F90 style, with a subroutine placed after `contains` that accesses the variables of its host. The second one passes all the variables explicitly as arguments, as one would do in F77. If I cannot use OpenACC with F90 features, that is a problem.

The two codes differ only in the language style used, and they should be equivalent for a reasonable compiler/paradigm (pgf90/OpenACC).

If this is not possible, I cannot port my application code to the GPU in a reasonable time, because going back to F77 is a real pain.

Notice also that if I use `contains` in the main program, the program also works (third case). So, in my opinion, it would be worthwhile for PGI/OpenACC to allow the first code to work as well.

Many thanks for any help (maybe there is a workaround?).

This is the sample case that cannot be compiled:

program prova
! Example 1 driver.
! Fills an n x n matrix a and a vector b, calls the external subroutine
! matvec(n,a,b,c), then overwrites b with c divided by its Euclidean norm
! and prints b.  Must be compiled with preprocessing enabled because of the
! #ifdef _OPENACC sections.
#ifdef _OPENACC
  use openacc
#endif
  implicit none

  integer :: n, i, j
  real*8, dimension(:,:), allocatable :: a
  real*8, dimension(:), allocatable :: b, c
  real*8 :: csum
#ifdef _OPENACC
  integer :: mygpu, myrealgpu, num_devices, my_device_type
  ! Kept from the original post; norm_comp is not called from the main program.
!$acc routine(norm_comp) seq

  ! Select NVIDIA device number 0 and check that it was actually selected.
  my_device_type = acc_device_nvidia
  mygpu = 0
  call acc_set_device_type(my_device_type)
  num_devices = acc_get_num_devices(my_device_type)
  write(6,*) ' Number of devices available: ', num_devices
  call acc_set_device_num(mygpu, my_device_type)
  write(6,*) 'Trying to use GPU:', mygpu
  myrealgpu = acc_get_device_num(my_device_type)
  write(6,*) 'Actually I am using GPU: ', myrealgpu
  if (mygpu .ne. myrealgpu) then
    write(6,*) 'I cannot use the requested GPU:', mygpu
    stop
  endif
#endif

  write(6,*) 'Input N leading dimension square matrix A'
  read(*,*) n

  allocate(b(n), c(n))
  allocate(a(n,n))

  ! Deterministic test data: unit diagonal, sin((i-j)/n) elsewhere.
  do i = 1, n
    do j = 1, n
      a(i,j) = dsin(dble(i-j)/n)
    enddo
    a(i,i) = 1.d0
    b(i) = cos(dble(i)**2 - 3*i + 1)   ! random init
  enddo

  call matvec(n, a, b, c)

  ! Euclidean norm of c.
  csum = 0.d0
!$acc parallel loop reduction(+:csum)
  do i = 1, n
    csum = csum + c(i)**2
  enddo
!$acc end parallel loop
  csum = sqrt(csum)

  ! b <- c / |c|
!$acc parallel loop
  do i = 1, n
    b(i) = c(i)/csum
  enddo
!$acc end parallel loop

  write(6,*) ' Final b --> A b '
  do i = 1, n
    write(6,*) i, b(i)
  enddo

  stop
end program prova

subroutine matvec(n,a,b,c)
! Example 1: computes c = a*b, one row per iteration of the parallel loop.
!
! The row sum is delegated to the CONTAINED subroutine norm_comp, which
! takes no arguments and reads n, a, b and the loop index i (and writes
! csum) through host association.  This use of host data inside an
! "acc routine" is exactly what the compiler rejects with PGF90-S-0155.
!
! Arguments:
!   n  (in)  order of the square matrix
!   a  (in)  n x n matrix
!   b  (in)  input vector of length n
!   c  (out) result vector of length n
  implicit none
  integer, intent(in) :: n
  real*8, intent(in) :: a(n,n), b(n)
  real*8, intent(out) :: c(n)
  integer :: i
  real*8 :: csum
!$acc routine(norm_comp) seq

!$acc parallel loop
  do i = 1, n
    call norm_comp
    ! Inlined equivalent of the call above (kept from the original post):
    !   csum=0.d0
    !   !$acc loop reduction(+:csum)
    !   do j=1,N
    !     csum=csum+A(i,j)*b(j)
    !   enddo
    c(i) = csum
  enddo
!$acc end parallel loop

contains

  ! Sets the host variable csum to the dot product of row i of a with b.
  ! csum is deliberately NOT redeclared here: a local csum would shadow the
  ! host's variable and the result would never reach c(i).
  subroutine norm_comp
    implicit none
    integer :: j
!$acc routine seq
    csum = 0.d0
    do j = 1, n
      csum = csum + a(i,j)*b(j)
    enddo
  end subroutine norm_comp

end subroutine matvec

In contrast, the following version compiles: here all the data needed by the subroutine are passed explicitly as arguments, as in standard F77:

program prova
! Example 2 driver.
! Fills an n x n matrix a and a vector b, calls the external subroutine
! matvec(n,a,b,c), then overwrites b with c divided by its Euclidean norm
! and prints b.  Must be compiled with preprocessing enabled because of the
! #ifdef _OPENACC sections.
#ifdef _OPENACC
  use openacc
#endif
  implicit none

  integer :: n, i, j
  real*8, dimension(:,:), allocatable :: a
  real*8, dimension(:), allocatable :: b, c
  real*8 :: csum
#ifdef _OPENACC
  integer :: mygpu, myrealgpu, num_devices, my_device_type
  ! Kept from the original post; norm_comp is not called from the main program.
!$acc routine(norm_comp) seq

  ! Select NVIDIA device number 0 and check that it was actually selected.
  my_device_type = acc_device_nvidia
  mygpu = 0
  call acc_set_device_type(my_device_type)
  num_devices = acc_get_num_devices(my_device_type)
  write(6,*) ' Number of devices available: ', num_devices
  call acc_set_device_num(mygpu, my_device_type)
  write(6,*) 'Trying to use GPU:', mygpu
  myrealgpu = acc_get_device_num(my_device_type)
  write(6,*) 'Actually I am using GPU: ', myrealgpu
  if (mygpu .ne. myrealgpu) then
    write(6,*) 'I cannot use the requested GPU:', mygpu
    stop
  endif
#endif

  write(6,*) 'Input N leading dimension square matrix A'
  read(*,*) n

  allocate(b(n), c(n))
  allocate(a(n,n))

  ! Deterministic test data: unit diagonal, sin((i-j)/n) elsewhere.
  do i = 1, n
    do j = 1, n
      a(i,j) = dsin(dble(i-j)/n)
    enddo
    a(i,i) = 1.d0
    b(i) = cos(dble(i)**2 - 3*i + 1)   ! random init
  enddo

  call matvec(n, a, b, c)

  ! Euclidean norm of c.
  csum = 0.d0
!$acc parallel loop reduction(+:csum)
  do i = 1, n
    csum = csum + c(i)**2
  enddo
!$acc end parallel loop
  csum = sqrt(csum)

  ! b <- c / |c|
!$acc parallel loop
  do i = 1, n
    b(i) = c(i)/csum
  enddo
!$acc end parallel loop

  write(6,*) ' Final b --> A b '
  do i = 1, n
    write(6,*) i, b(i)
  enddo

  stop
end program prova

subroutine matvec(n,a,b,c)
! Example 2: computes c = a*b, one row per iteration of the parallel loop.
! The row sum is delegated to the EXTERNAL subroutine norm_comp, to which
! every piece of data is passed explicitly (F77 style).
!
! Arguments:
!   n  (in)  order of the square matrix
!   a  (in)  n x n matrix
!   b  (in)  input vector of length n
!   c  (out) result vector of length n
  implicit none
  integer, intent(in) :: n
  real*8, intent(in) :: a(n,n), b(n)
  real*8, intent(out) :: c(n)
  integer :: i
  real*8 :: csum
  ! Must state the same level of parallelism (seq) as the directive inside
  ! norm_comp itself.
!$acc routine(norm_comp) seq

  ! csum is written by every iteration through the call, so each iteration
  ! needs its own copy.
!$acc parallel loop private(csum)
  do i = 1, n
    call norm_comp(i, n, a, b, csum)
    ! Inlined equivalent of the call above (kept from the original post):
    !   csum=0.d0
    !   !$acc loop reduction(+:csum)
    !   do j=1,N
    !     csum=csum+A(i,j)*b(j)
    !   enddo
    c(i) = csum
  enddo
!$acc end parallel loop

end subroutine matvec

subroutine norm_comp(i,n,a,b,csum)
! Returns in csum the dot product of row i of a with b.
!
! Arguments:
!   i     (in)  row index into a
!   n     (in)  order of the square matrix
!   a     (in)  n x n matrix
!   b     (in)  vector of length n
!   csum  (out) sum over j of a(i,j)*b(j); returned to the caller (the
!               original kept it local, so the caller never saw the result)
  implicit none
  integer, intent(in) :: i, n
  real*8, intent(in) :: a(n,n), b(n)
  real*8, intent(out) :: csum
  integer :: j
!$acc routine seq

  csum = 0.d0
  do j = 1, n
    csum = csum + a(i,j)*b(j)
  enddo

end subroutine norm_comp

The third case, in which the main program itself holds (after `contains`) a subroutine that is meant to run on the accelerator:

program prova
! Example 3 driver.
! Same computation as the other examples, but matvec is an internal
! procedure of the main program: it reads n, a, b and writes c and csum
! through host association, and calls the external subroutine
! norm_comp(n,i,csum,a,b) with all data passed explicitly.
! Must be compiled with preprocessing enabled because of the
! #ifdef _OPENACC sections.
#ifdef _OPENACC
  use openacc
#endif
  implicit none

  integer :: n, i, j
  real*8, dimension(:,:), allocatable :: a
  real*8, dimension(:), allocatable :: b, c
  real*8 :: csum
#ifdef _OPENACC
  integer :: mygpu, myrealgpu, num_devices, my_device_type
!$acc routine(norm_comp) seq

  ! Select NVIDIA device number 0 and check that it was actually selected.
  my_device_type = acc_device_nvidia
  mygpu = 0
  call acc_set_device_type(my_device_type)
  num_devices = acc_get_num_devices(my_device_type)
  write(6,*) ' Number of devices available: ', num_devices
  call acc_set_device_num(mygpu, my_device_type)
  write(6,*) 'Trying to use GPU:', mygpu
  myrealgpu = acc_get_device_num(my_device_type)
  write(6,*) 'Actually I am using GPU: ', myrealgpu
  if (mygpu .ne. myrealgpu) then
    write(6,*) 'I cannot use the requested GPU:', mygpu
    stop
  endif
#endif

  write(6,*) 'Input N leading dimension square matrix A'
  read(*,*) n

  allocate(b(n), c(n))
  allocate(a(n,n))

  ! Deterministic test data: unit diagonal, sin((i-j)/n) elsewhere.
  do i = 1, n
    do j = 1, n
      a(i,j) = dsin(dble(i-j)/n)
    enddo
    a(i,i) = 1.d0
    b(i) = cos(dble(i)**2 - 3*i + 1)   ! random init
  enddo

  call matvec

  ! Euclidean norm of c.
  csum = 0.d0
!$acc parallel loop reduction(+:csum)
  do i = 1, n
    csum = csum + c(i)**2
  enddo
!$acc end parallel loop
  csum = sqrt(csum)

  ! b <- c / |c|
!$acc parallel loop
  do i = 1, n
    b(i) = c(i)/csum
  enddo
!$acc end parallel loop

  write(6,*) ' Final b --> A b '
  do i = 1, n
    write(6,*) i, b(i)
  enddo

  stop

contains

  ! Computes c = a*b using the host program's n, a, b, c and csum.
  subroutine matvec
    implicit none
    integer :: i
    ! Must state the same level of parallelism (seq) as the directive inside
    ! norm_comp itself.
!$acc routine(norm_comp) seq

    ! csum is written by every iteration through the call, so each
    ! iteration needs its own copy.
!$acc parallel loop private(csum)
    do i = 1, n
      call norm_comp(n, i, csum, a, b)
      ! Inlined equivalent of the call above (kept from the original post):
      !   csum=0.d0
      !   !$acc loop reduction(+:csum)
      !   do j=1,N
      !     csum=csum+A(i,j)*b(j)
      !   enddo
      c(i) = csum
    enddo
!$acc end parallel loop

  end subroutine matvec

end program prova

subroutine norm_comp(n,i,csum,a,b)
! Returns in csum the dot product of row i of a with b.
!
! Arguments:
!   n     (in)  order of the square matrix
!   i     (in)  row index into a
!   csum  (out) sum over j of a(i,j)*b(j)
!   a     (in)  n x n matrix
!   b     (in)  vector of length n
  implicit none
  integer, intent(in) :: n
  integer, intent(in) :: i
  real*8, intent(out) :: csum
  real*8, intent(in) :: b(n)
  real*8, intent(in) :: a(n,n)
  integer :: j
!$acc routine seq

  csum = 0.d0
  do j = 1, n
    csum = csum + a(i,j)*b(j)
  enddo

end subroutine norm_comp