I am writing a program in which I need to copy a sub-array from the host to the device, but the performance is poor. So I wrote the following test to measure the memory copy performance.
!> Holds the placeholder device kernel used by the copy benchmark.
module test
  use cudafor
  implicit none
contains

  !> Device kernel that takes two 90x90x90x3 real arrays and does no work.
  !! The body is intentionally empty: the kernel performs no reads or writes
  !! on its arguments.
  attributes(global) subroutine cal_I(a, b)
    real :: a(90,90,90,3)
    real :: b(90,90,90,3)
  end subroutine cal_I

end module test
!> Times four ways of copying host data into the device arrays dev_a1/dev_b1:
!!   copy1 - sub-array section of a/b, one assignment per array
!!   copy2 - sub-array section of a/b, one assignment per last-dimension slice
!!   copy3 - whole arrays a1/b1, one assignment per array
!!   copy4 - whole arrays a1/b1, one assignment per last-dimension slice
!! Elapsed times are measured with CUDA events and printed in milliseconds
!! (the unit reported by cudaEventElapsedTime).
program main
  use cudafor
  use test
  implicit none

  ! Array extents: the full host arrays, the sub-block that is transferred,
  ! and the number of components in the last dimension.
  integer, parameter :: n_full = 100
  integer, parameter :: n_sub  = 90
  integer, parameter :: n_comp = 3

  type(cudaEvent) :: startEvent, stopEvent
  real(4) :: time
  integer :: istat
  real :: a(n_full,n_full,n_full,n_comp), b(n_full,n_full,n_full,n_comp)
  real :: a1(n_sub,n_sub,n_sub,n_comp), b1(n_sub,n_sub,n_sub,n_comp)
  real, device :: dev_a1(n_sub,n_sub,n_sub,n_comp), dev_b1(n_sub,n_sub,n_sub,n_comp)
  integer :: i

  a = 1.0
  b = 2.0
  ! The original left a1/b1 undefined before copying them to the device.
  ! Give them the same values as the corresponding section of a/b so that all
  ! four tests move identical, well-defined data.
  a1 = 1.0
  b1 = 2.0

  istat = cudaEventCreate(startEvent)
  istat = cudaEventCreate(stopEvent)

  ! Test 1: non-contiguous section of the large arrays, single assignment each.
  call start_timer()
  dev_a1(:,:,:,:) = a(1:n_sub,1:n_sub,1:n_sub,:)
  dev_b1(:,:,:,:) = b(1:n_sub,1:n_sub,1:n_sub,:)
  call report_elapsed(' Time for copy1: ')

  ! Test 2: same section, copied one last-dimension slice at a time.
  call start_timer()
  do i = 1, n_comp
    dev_a1(:,:,:,i) = a(1:n_sub,1:n_sub,1:n_sub,i)
    dev_b1(:,:,:,i) = b(1:n_sub,1:n_sub,1:n_sub,i)
  end do
  call report_elapsed(' Time for copy2: ')

  ! Test 3: whole host arrays of matching shape, single assignment each.
  call start_timer()
  dev_a1 = a1
  dev_b1 = b1
  call report_elapsed(' Time for copy3: ')

  ! Test 4: whole host arrays, copied one last-dimension slice at a time.
  call start_timer()
  do i = 1, n_comp
    dev_a1(:,:,:,i) = a1(:,:,:,i)
    dev_b1(:,:,:,i) = b1(:,:,:,i)
  end do
  call report_elapsed(' Time for copy4: ')

  ! The execution configuration takes both a grid size and a block size;
  ! the original supplied only one value.
  call cal_I<<<1, 1>>>(dev_a1, dev_b1)
  ! Kernel launches are asynchronous: wait for completion before exiting.
  istat = cudaDeviceSynchronize()

  istat = cudaEventDestroy(startEvent)
  istat = cudaEventDestroy(stopEvent)

contains

  !> Record the start event on the default stream (stream 0).
  subroutine start_timer()
    istat = cudaEventRecord(startEvent, 0)
  end subroutine start_timer

  !> Record the stop event, wait for it, and print the elapsed time after label.
  subroutine report_elapsed(label)
    character(len=*), intent(in) :: label
    istat = cudaEventRecord(stopEvent, 0)
    istat = cudaEventSynchronize(stopEvent)
    istat = cudaEventElapsedTime(time, startEvent, stopEvent)
    write (*,*) label, time
  end subroutine report_elapsed

end program main
In the first case, I copy the sub-arrays without a loop, while in the second case, I use a loop to copy the sub-arrays.
In the third case, I copy the entire arrays without a loop, while in the fourth case, I use a loop to copy the arrays. I ran the program on an M2050 and got the following result.
Time for copy1: 28.94928
Time for copy2: 44.00723
Time for copy3: 4.118048
Time for copy4: 43.05971
According to the result, copying the entire array takes the least time. So should I first copy the sub-array into a separate array on the host, and then copy that entire array to the device?