Could you tell me how to get profiling information with Nsight Systems on a Linux system?
I compiled the following source code and ran the command shown below in order to collect a profile.
As you can see, the "loop1" range is executed many times. However, Nsight Systems seems to collect profiling information for it only once.
I don't understand why Nsight Systems collects the profiling information only once.
Could you let me know why, and how to get profiling information for "loop1" without modifying the source code?
% nsys profile -t openacc,nvtx --capture-range=nvtx --nvtx-capture="loop1" --env-var=NSYS_NVTX_PROFILER_REGISTER_ONLY=0 ./laplace2
** NVTX Range Summary (nvtx_sum):
+----------+------------------+-----------+-----------+-----------+-----------+-----------+--------------+---------+--------+
| Time (%) | Total Time (sec) | Instances | Avg (sec) | Med (sec) | Min (sec) | Max (sec) | StdDev (sec) | Style | Range |
+----------+------------------+-----------+-----------+-----------+-----------+-----------+--------------+---------+--------+
| 100.0 | 0.0116 | 1 | 0.0116 | 0.0116 | 0.0116 | 0.0116 | 0.0000 | PushPop | loop1 |
+----------+------------------+-----------+-----------+-----------+-----------+-----------+--------------+---------+--------+
program laplace
!
! Copyright 2013 SofTek Kato
!
! Jacobi relaxation on an nn x nm grid. Each iteration runs two OpenACC
! "kernels" regions: "loop1" computes the 4-point average into Anew and
! the maximum change (error); "loop2" copies the interior of Anew back
! into A. Iteration stops after iter_max sweeps or once error <= tol.
!
! When the preprocessor symbol _NVTX_ is defined, the whole iteration is
! wrapped in an NVTX range "main_loop" and each sweep in its own range
! ("loop1" / "loop2"), pushed and popped once per iteration.
!
#if defined(_NVTX_)
  use nvtx
#endif
  implicit none

  ! Grid extents (compile-time constants).
  integer(4), parameter :: nn = 16384
  integer(4), parameter :: nm = 16384

  ! Convergence threshold on the largest per-point change.
  real(kind=8), parameter :: tol = 1.0d-6

#if defined(_NVTX_)
  ! Return value of the NVTX push/pop calls (not inspected).
  integer(4) :: rc
#endif
  integer(4) :: n, m, iter, iter_max, i, j
  integer    :: alloc_stat
  real(kind=8) :: error

  ! Each grid holds 16384*16384 real(8) values (2 GiB), so both are
  ! allocated at run time instead of being fixed-size program data.
  real(kind=8), allocatable :: A(:,:), Anew(:,:)

  ! Elapsed time function initialized -- SofTek
  ! second() returns the raw SYSTEM_CLOCK count; t_ac is the length of
  ! one clock tick in seconds, used to convert count differences.
  integer :: icr
  real(kind=8) :: cpu0, cpu1, elapsed, t_ac
  real(kind=8), external :: second

  call system_clock(count_rate=icr)
  t_ac = 1.0d0 / real(icr, 8)
  print '(1x,a,e10.5)','Time measurement accuracy : ',t_ac
  !
  n = nn
  m = nm
  iter_max = 1000

  allocate(A(nn,nm), Anew(nn,nm), stat=alloc_stat)
  if (alloc_stat /= 0) then
    print *, "laplace: unable to allocate the A/Anew grids"
    stop 1
  end if

  A    = 0.0d0
  Anew = 0.0d0

  ! Boundary condition: hold the first grid row at 1.0 in both arrays.
  ! The arrays are indexed from 1, so the boundary row is i = 1; index 0
  ! would lie outside the declared bounds. One array-section assignment
  ! covers every column, so no loop is needed.
  cpu0 = second()
  A(1, :)    = 1.0d0
  Anew(1, :) = 1.0d0
  cpu1 = second()
  elapsed = (cpu1 - cpu0) * t_ac
  print *, "Jacobi relaxation Calculation:", n," x ", m, "mesh"
  print *, "Initial : ",elapsed, "sec"

  cpu0 = second()
  iter  = 0
  error = 1.0d0   ! start above tol so the first sweep always runs
#if defined(_NVTX_)
  rc = nvtxRangePush("main_loop")
#endif
  ! Keep both grids resident on the device for the whole iteration:
  ! A is copied in on entry and back out on exit, Anew is device scratch.
!$acc data copy(A), create(Anew)
  do while ( iter < iter_max .and. error > tol )
    error = 0.d0

    ! Sweep 1: 4-point stencil into Anew, tracking the largest change.
#if defined(_NVTX_)
    rc = nvtxRangePush("loop1")
#endif
!$acc kernels
    do j = 2, m-1
      do i = 2, n-1
        Anew(i,j) = 0.25 * ( A(i,j+1) + A(i,j-1) &
                           + A(i-1,j) + A(i+1,j) )
        error = max(error, abs(Anew(i,j) - A(i,j)))
      end do
    end do
!$acc end kernels
#if defined(_NVTX_)
    rc = nvtxRangePop()
#endif

    ! Sweep 2: copy the updated interior back into A.
#if defined(_NVTX_)
    rc = nvtxRangePush("loop2")
#endif
!$acc kernels
    do j = 2, m-1
      do i = 2, n-1
        A(i,j) = Anew(i,j)
      end do
    end do
!$acc end kernels
#if defined(_NVTX_)
    rc = nvtxRangePop()
#endif

    if ( mod(iter,100) == 0 ) print *, iter, error
    iter = iter + 1
  end do
!$acc end data
#if defined(_NVTX_)
  rc = nvtxRangePop()
#endif
  cpu1 = second()

  ! Printout Elapsed time (SofTek)
  elapsed = (cpu1 - cpu0) * t_ac
  print *, "total : ",elapsed, "sec"

  deallocate(A, Anew)
end program laplace
function second() result(rtime)
  ! Return the current SYSTEM_CLOCK count as a real(8) value.
  !
  ! The result is in raw clock ticks, not seconds: the caller is
  ! responsible for scaling a difference of two readings by the clock
  ! period (1 / count_rate).
  implicit none
  real(8) :: rtime
  integer :: ticks

  ! Only the tick count is needed; the rate and maximum are not queried.
  call system_clock(count=ticks)
  rtime = real(ticks, kind=8)
end function second