Hi Mat,
Here is the code, which is very simple:
lucas@lucas-desktop ~ $ cat test.f90
module add
use cudafor
implicit none
contains
!> Kernel: each thread stores its own 1-based global thread index into the
!> matching element of `a`.
!>
!> a : integer array (assumed shape); element i receives the value i for
!>     every global thread index i that lies within the array.
!>
!> NOTE(review): because `a` is assumed-shape, the compiler presumably has to
!> pass an array descriptor to the device on every launch, which would show up
!> as a small host-to-device copy in front of each kernel call in a profile.
!> An explicit-shape dummy with the extent passed by value should avoid that
!> copy, but it changes the call signature - confirm against the compiler
!> documentation before switching.
attributes(global) subroutine add_test(a)
  integer, intent(inout) :: a(:)
  integer :: i
  ! Global 1-based index built from the thread's position inside its block
  ! and the block's position inside the grid.
  i = threadIdx%x + (blockIdx%x - 1) * blockDim%x
  ! Guard against launch configurations that create more threads than there
  ! are elements; without it the surplus threads write out of bounds.
  if (i <= size(a)) a(i) = i
end subroutine add_test
end module add
!> Driver: launches the add_test kernel several times on a device array,
!> copies the result back to the host and prints the first element.
program test
use cudafor
use add, only: add_test
implicit none
! Launch geometry; the array extent is derived from it so the two can never
! drift apart.
integer, parameter :: n_blocks = 2
integer, parameter :: block_size = 128
integer, parameter :: n = n_blocks * block_size
! Number of back-to-back kernel launches (repeated so that each launch shows
! up as its own entry in a profile).
integer, parameter :: n_launches = 10
integer, device :: d_a(n)
integer :: a(n)
integer :: i, istat
do i = 1, n_launches
  call add_test<<<n_blocks, block_size>>>(d_a)
end do
! Kernel launches do not report failures by themselves: first pick up any
! launch error, then wait for the kernels to finish to catch execution errors.
istat = cudaGetLastError()
if (istat == cudaSuccess) istat = cudaDeviceSynchronize()
if (istat /= cudaSuccess) then
  print *, "add_test failed: ", trim(cudaGetErrorString(istat))
  stop 1
end if
! Device-to-host copy of the whole array.
a = d_a
print*,"a(1)=",a(1)
end program test
I compile it with:
pgfortran -o test test.f90 -Mcuda=cc3.5
I also set the environment variable COMPUTE_PROFILE to 1.
Here is the output of the log file:
# CUDA_PROFILE_LOG_VERSION 2.0
# CUDA_DEVICE 0 GeForce GTX 750
# CUDA_CONTEXT 1
# TIMESTAMPFACTOR 138512dc017456cd
method,gputime,cputime,occupancy
method=[ memcpyHtoD ] gputime=[ 0.960 ] cputime=[ 5.718 ]
method=[ add_add_test_ ] gputime=[ 3.200 ] cputime=[ 14.544 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.727 ]
method=[ add_add_test_ ] gputime=[ 1.536 ] cputime=[ 4.605 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.706 ]
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.314 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.864 ] cputime=[ 3.619 ]
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.208 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.589 ]
method=[ add_add_test_ ] gputime=[ 1.536 ] cputime=[ 4.188 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.864 ] cputime=[ 3.515 ]
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.244 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.588 ]
method=[ add_add_test_ ] gputime=[ 1.536 ] cputime=[ 4.154 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.567 ]
method=[ add_add_test_ ] gputime=[ 1.536 ] cputime=[ 3.979 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.486 ]
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.182 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.516 ]
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.092 ] occupancy=[ 1.000 ]
method=[ memcpyDtoH ] gputime=[ 2.080 ] cputime=[ 12.546 ]
I’m very confused about why there is a data copy from host to device before every kernel launch. In fact, there is no need for that.