Hello,
I have tried creating a minimal working example using exactly the same pragmas that is there in the original code. Here is the compilation message for cc13 and cc20:
CC 1.3 : 21 registers; 2168 shared, 12 constant, 0 local memory bytes; 50% occupancy
CC 2.0 : 38 registers; 2064 shared, 120 constant, 0 local memory bytes; 33% occupancy
I can give the actual code if needed, just that I need to extract it and make it individually compilable, if this example does not serve the purpose, then on Monday I will do it. Thanks for all the help. I have used CUDA 4.2 and PGI 12.3. Here is the entire code:
PROGRAM simpleFD25
IMPLICIT NONE
INTEGER :: nx, ny, nz !grid points and stencil order
REAL, DIMENSION(5) :: c
REAL :: time1, time2
INTEGER :: i,j,k,l
REAL, ALLOCATABLE :: u(:,:,:)
REAL, ALLOCATABLE :: r(:,:,:)
!$acc mirror(r)
!prompt user to enter input
WRITE(*,'(A)',ADVANCE="NO") "Enter NX NY NZ: "
READ(*,*) nx, ny, nz
!init
ALLOCATE (u(0:nx,0:ny,0:nz), r(0:nx,0:ny,0:nz))
u = 0.; r = 0.
c = (/1.,1.,1.,1.,1./)
FORALL (i=1:nx, j=1:ny, k=1:nz)
u(i,j,k) = float(i+j+k)/(nx+nz+ny)
END FORALL
!compute
CALL cpu_time(time1)
!$acc data region copyin(c,u)
!$acc region
DO l=1,4
!$acc do parallel(32) unroll(2)
DO i=4,nx-4
!$acc do parallel(64)
DO j=4,ny-4
!$acc do vector(512)
DO k=4,nz-4
r(i,j,k) = c(5) * u(i,j,k) + ( c(l) * u(i+l,j,k) + c(l) * u(i-l,j,k) ) &
+ ( c(l) * u(i,j+l,k) + c(l) * u(i,j-l,k) ) &
+ ( c(l) * u(i,j,k+l) + c(l) * u(i,j,k-l) )
END DO
END DO
END DO
END DO
!$acc end region
!$acc update host(r)
!$acc end data region
CALL cpu_time(time2)
WRITE(*,*) "Time taken = ", (time2 - time1), "secs"
!deallocate
DEALLOCATE(u, r)
END PROGRAM simpleFD2