CUDA Fortran profile problem

Hi Mat,
Here is the code, which is very simple:

    lucas@lucas-desktop ~ $ cat test.f90 
module add
	use cudafor
contains
	! Kernel: every thread writes its own linear index into the matching
	! element of a. The dummy argument is assumed-shape, a(:).
	attributes(global) subroutine add_test(a)
		integer :: a(:)
		integer :: tid

		! Linear index built from the block index and the thread index.
		tid = (blockIdx%x - 1) * blockDim%x + threadIdx%x
		a(tid) = tid
	end subroutine add_test
end module add

program test
	use add
	implicit none
	integer, parameter :: n_elems = 256      ! length of host and device arrays
	integer, parameter :: n_blocks = 2       ! blocks per kernel launch
	integer, parameter :: block_size = 128   ! threads per block
	integer, parameter :: n_launches = 10    ! how many times the kernel is launched
	integer :: a(n_elems)
	integer, device :: d_a(n_elems)
	integer :: launch_no

	! Launch the same kernel repeatedly; 2 blocks x 128 threads = 256 threads,
	! matching the 256-element array.
	do launch_no = 1, n_launches
		call add_test<<<n_blocks, block_size>>>(d_a)
	end do

	! Copy the device array back to the host and show the first element.
	a = d_a
	print *, "a(1)=", a(1)

end program test

and I compile it with:

 pgfortran -o test test.f90 -Mcuda=cc3.5

I also set the value of COMPUTE_PROFILE to 1.
Here is the output of the log file:

# CUDA_PROFILE_LOG_VERSION 2.0
# CUDA_DEVICE 0 GeForce GTX 750
# CUDA_CONTEXT 1
# TIMESTAMPFACTOR 138512dc017456cd
method,gputime,cputime,occupancy
method=[ memcpyHtoD ] gputime=[ 0.960 ] cputime=[ 5.718 ] 
method=[ add_add_test_ ] gputime=[ 3.200 ] cputime=[ 14.544 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.727 ] 
method=[ add_add_test_ ] gputime=[ 1.536 ] cputime=[ 4.605 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.706 ] 
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.314 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.864 ] cputime=[ 3.619 ] 
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.208 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.589 ] 
method=[ add_add_test_ ] gputime=[ 1.536 ] cputime=[ 4.188 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.864 ] cputime=[ 3.515 ] 
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.244 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.588 ] 
method=[ add_add_test_ ] gputime=[ 1.536 ] cputime=[ 4.154 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.567 ] 
method=[ add_add_test_ ] gputime=[ 1.536 ] cputime=[ 3.979 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.486 ] 
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.182 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.516 ] 
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.092 ] occupancy=[ 1.000 ] 
method=[ memcpyDtoH ] gputime=[ 2.080 ] cputime=[ 12.546 ]

I’m very confused about why there is a data copy from host to device before every kernel launch. In fact, there should be no need for that.

Hi uestc0626,

It’s the F90 descriptor. The descriptor needs to be built, copied to the device, and then torn down every time you call the subroutine.

The workarounds are to either pass “a” as an assumed-size array, or declare “a” in the module and not pass it as an argument at all.

For example:

module add
    use cudafor
contains
    ! Kernel: every thread writes its own linear index into the matching
    ! element of a. Here a is declared assumed-size, a(*), rather than
    ! assumed-shape.
    attributes(global) subroutine add_test(a)
       integer :: a(*)
       integer :: tid

       ! Linear index built from the block index and the thread index.
       tid = (blockIdx%x - 1) * blockDim%x + threadIdx%x
       a(tid) = tid
    end subroutine add_test
 end module add

 program test
    use add
    implicit none
    integer, parameter :: n_elems = 256      ! length of host and device arrays
    integer, parameter :: n_blocks = 2       ! blocks per kernel launch
    integer, parameter :: block_size = 128   ! threads per block
    integer, parameter :: n_launches = 10    ! how many times the kernel is launched
    integer :: a(n_elems)
    integer, device :: d_a(n_elems)
    integer :: launch_no

    ! Launch the same kernel repeatedly on the device array.
    do launch_no = 1, n_launches
       call add_test<<<n_blocks, block_size>>>(d_a)
    end do

    ! Copy the device array back to the host and show the first element.
    a = d_a
    print *, "a(1)=", a(1)

 end program test

or

module add
    use cudafor
    ! Host array and its device counterpart are module variables, so the
    ! kernel below takes no arguments.
    integer :: a(256)
    integer, device :: d_a(256)
contains
    ! Kernel: every thread writes its own linear index into the matching
    ! element of the module-level device array d_a.
    attributes(global) subroutine add_test()
       integer :: tid

       ! Linear index built from the block index and the thread index.
       tid = (blockIdx%x - 1) * blockDim%x + threadIdx%x
       d_a(tid) = tid
    end subroutine add_test
 end module add

 program test
    use add
    implicit none
    integer, parameter :: n_blocks = 2       ! blocks per kernel launch
    integer, parameter :: block_size = 128   ! threads per block
    integer, parameter :: n_launches = 10    ! how many times the kernel is launched
    integer :: launch_no

    ! The kernel works on the module variable d_a, so nothing is passed.
    do launch_no = 1, n_launches
       call add_test<<<n_blocks, block_size>>>()
    end do

    ! a and d_a both come from module add; copy device data back to the host.
    a = d_a
    print *, "a(1)=", a(1)

 end program test

Hope this helps,
Mat