CUDA Fortran profile problem

linux0626 · July 28, 2014, 4:12pm

Hi mat.
Here is the code, which is very simple:

    lucas@lucas-desktop ~ $ cat test.f90 
! Minimal CUDA Fortran module holding a single kernel that receives its
! array through an assumed-shape dummy argument.
module add
	use cudafor
	implicit none
contains
	! Kernel: every thread writes its own global index into a.
	! There is no bounds check, so the launch configuration must not
	! create more threads than a has elements.
	attributes(global) subroutine add_test(a)
		integer :: a(:)   ! assumed-shape dummy argument
		integer :: tid

		! CUDA Fortran thread and block indices start at 1, hence the "- 1"
		! when turning the block index into an offset.
		tid = (blockIdx%x - 1) * blockDim%x + threadIdx%x
		a(tid) = tid
	end subroutine add_test
end module add

! Driver: launches add_test several times in a row, copies the result back
! to the host and prints the first element.
program test
	use add
	implicit none
	integer, parameter :: n = 256              ! array length
	integer, parameter :: n_blocks = 2         ! n_blocks * n_threads == n
	integer, parameter :: n_threads = 128
	integer, parameter :: n_launches = 10
	integer, device :: d_a(n)
	integer :: a(n)
	integer :: k

	do k = 1, n_launches
		call add_test<<<n_blocks, n_threads>>>(d_a)
	end do

	! Assignment from a device array to a host array copies device -> host.
	a = d_a
	print *, "a(1)=", a(1)
end program test

and I compile it with:

 pgfortran -o test test.f90 -Mcuda=cc3.5

I set the value of COMPUTE_PROFILE to 1.
Here is the output of the log file:

# CUDA_PROFILE_LOG_VERSION 2.0
# CUDA_DEVICE 0 GeForce GTX 750
# CUDA_CONTEXT 1
# TIMESTAMPFACTOR 138512dc017456cd
method,gputime,cputime,occupancy
method=[ memcpyHtoD ] gputime=[ 0.960 ] cputime=[ 5.718 ] 
method=[ add_add_test_ ] gputime=[ 3.200 ] cputime=[ 14.544 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.727 ] 
method=[ add_add_test_ ] gputime=[ 1.536 ] cputime=[ 4.605 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.706 ] 
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.314 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.864 ] cputime=[ 3.619 ] 
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.208 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.589 ] 
method=[ add_add_test_ ] gputime=[ 1.536 ] cputime=[ 4.188 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.864 ] cputime=[ 3.515 ] 
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.244 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.588 ] 
method=[ add_add_test_ ] gputime=[ 1.536 ] cputime=[ 4.154 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.567 ] 
method=[ add_add_test_ ] gputime=[ 1.536 ] cputime=[ 3.979 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.486 ] 
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.182 ] occupancy=[ 1.000 ] 
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.516 ] 
method=[ add_add_test_ ] gputime=[ 1.568 ] cputime=[ 4.092 ] occupancy=[ 1.000 ] 
method=[ memcpyDtoH ] gputime=[ 2.080 ] cputime=[ 12.546 ]

I’m very confused about why there is a data copy from host to device before every kernel launch. In fact, there should be no need for it.

MatColgrove · August 1, 2014, 5:11pm

Hi uestc0626,

It’s the F90 descriptor. The descriptor needs to be built, copied to the device, and then torn down every time you call the subroutine.

The workarounds are to either pass “a” as an assumed-size array, or declare “a” in the module and not pass it as an argument at all.

For example:

! Workaround 1: the kernel takes its array as an assumed-size dummy, a(*),
! instead of an assumed-shape one.
module add
    use cudafor
    implicit none
contains
    ! Kernel: every thread writes its own global index into a.
    ! An assumed-size array carries no extent information, so the caller
    ! must launch no more threads than a has elements.
    attributes(global) subroutine add_test(a)
        integer :: a(*)
        integer :: tid

        ! Indices start at 1 in CUDA Fortran, hence the "- 1" on the block index.
        tid = (blockIdx%x - 1) * blockDim%x + threadIdx%x
        a(tid) = tid
    end subroutine add_test
end module add

 ! Driver: launches add_test several times, then copies the device array
 ! back to the host and prints its first element.
 program test
    use add
    implicit none
    integer, parameter :: n = 256              ! array length
    integer, parameter :: n_blocks = 2         ! n_blocks * n_threads == n
    integer, parameter :: n_threads = 128
    integer, parameter :: n_launches = 10
    integer, device :: d_a(n)
    integer :: a(n)
    integer :: k

    do k = 1, n_launches
       call add_test<<<n_blocks, n_threads>>>(d_a)
    end do

    ! Device-to-host copy via array assignment.
    a = d_a
    print *, "a(1)=", a(1)
 end program test

or

! Workaround 2: the device array lives at module scope, so the kernel
! accesses it directly and takes no arguments.
module add
    use cudafor
    implicit none

    integer, device :: d_a(256)   ! device array written by add_test
    integer         :: a(256)     ! host array, available to users of this module
contains
    ! Kernel: every thread writes its own global index into d_a.
    ! There is no bounds check; the launch must not exceed 256 threads in total.
    attributes(global) subroutine add_test()
        integer :: tid

        ! Indices start at 1 in CUDA Fortran, hence the "- 1" on the block index.
        tid = (blockIdx%x - 1) * blockDim%x + threadIdx%x
        d_a(tid) = tid
    end subroutine add_test
end module add

 ! Driver: launches the argument-less add_test several times; a and d_a
 ! are the host and device arrays made available by "use add".
 program test
    use add
    implicit none
    integer, parameter :: n_blocks = 2
    integer, parameter :: n_threads = 128
    integer, parameter :: n_launches = 10
    integer :: k

    do k = 1, n_launches
       call add_test<<<n_blocks, n_threads>>>()
    end do

    ! Device-to-host copy via array assignment.
    a = d_a
    print *, "a(1)=", a(1)
 end program test

Hope this helps,
Mat

Topic		Replies	Views
Oddity in declaring constant memory in host subprograms Legacy PGI Compilers	3	3190	December 28, 2015
type mismatch error Legacy PGI Compilers	5	6444	February 15, 2010
how does this newbie fix his code? Legacy PGI Compilers	8	3415	March 27, 2012
copyout Memcpy failed, unspecified launch failure Legacy PGI Compilers	3	5291	March 9, 2012
cuda fortran module data Legacy PGI Compilers	6	8139	September 9, 2010
Error running simple CUDA Fortran program Legacy PGI Compilers	9	21316	February 26, 2010
CUDA - Call device subroutine Legacy PGI Compilers	4	3324	March 29, 2010
issue with 2d array copy back to host Legacy PGI Compilers	3	5915	September 19, 2014
Cuda mpi fortran Legacy PGI Compilers	5	687	October 12, 2021
Interface block is needed for device routines Legacy PGI Compilers	3	1862	September 17, 2019

CUDA Fortran profile problem

Related topics