Lower speed with accelerator directives

This is my code; I used accelerator directives.


program ex
implicit none
real :: a(256,256), b(256,256), c(256,256), t1, t2
integer :: i, j, k, sm
sm = 0
   do j = 1,256
      do i = 1,256
         a(i,j) = 1
         b(i,j) = 1
         c(i,j) = 0.0
      enddo
   enddo
   call cpu_time(t1)
!$acc region
   do i = 1,256
      do j = 1,256
         sm = 0
         do k = 1,256
            sm = sm + a(i,k)*b(k,j)
            c(i,j) = sm
         end do
      end do
   end do
!$acc end region
   call cpu_time(t2)
   print*, "cpu time=", t2-t1
   !print*, c
end program ex

The execution time is 75 milliseconds, but when I run the same computation with a CUDA Fortran implementation the execution time is only 5 milliseconds.
How can I get more speed with the directives?

Hi kuldeep gupta,

How are you doing your timing for both programs? Can you post your CUDA Fortran code?

My best guess is that the CUDA Fortran code is taking longer than you think. Users often make the mistake of timing CUDA Fortran with cpu_time instead of with CUDA event counters. This can lead to incorrect results, since CUDA kernels are launched asynchronously.
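For example, a minimal sketch of event-based timing in CUDA Fortran (the variable names and the kernel launch here are placeholders) looks something like this:

use cudafor
type(cudaEvent) :: start, finish
real :: ktime            ! elapsed time in milliseconds
integer :: istat

istat = cudaEventCreate(start)
istat = cudaEventCreate(finish)

istat = cudaEventRecord(start, 0)
! ... launch your kernel here ...
istat = cudaEventRecord(finish, 0)
istat = cudaEventSynchronize(finish)

istat = cudaEventElapsedTime(ktime, start, finish)
print *, 'kernel time (ms) =', ktime

istat = cudaEventDestroy(start)
istat = cudaEventDestroy(finish)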

  • Mat

Here is the CUDA Fortran code; it is the same as the example given in the PGI manual.


! start the module containing the matrix multiply kernel
module mmul_mod
    use cudafor
    contains

! mmul_kernel computes A*B into C where A is NxM, B is MxL, C is then NxL

    attributes(global) subroutine mmul_kernel( A, B, C, N, M, L )
       real,device :: A(N,M), B(M,L), C(N,L)
       integer, value :: N, M, L
       integer :: i, j, kb, k, tx, ty

! submatrices are declared to be in CUDA shared memory

       real, shared :: Asub(16,16), Bsub(16,16)

! the value of C(i,j) being computed, a temporary scalar

       real :: Cij

! Start execution, first get my thread indices

       tx = threadidx%x
       ty = threadidx%y

! This thread computes C(i,j) = sum(A(i,:) * B(:,j))

       i = (blockidx%x-1) * 16 + tx
       j = (blockidx%y-1) * 16 + ty

       Cij = 0.0

! Do the k loop in chunks of 16, the block size

       do kb = 1, M, 16

! Fill the submatrices; each of 16x16 threads in the thread block
! loads one element of Asub and Bsub

          Asub(tx,ty) = A(i,kb+ty-1)
          Bsub(tx,ty) = B(kb+tx-1,j)

! Wait until all elements are filled

          call syncthreads()

! Multiply the two submatrices; each of the 16x16 threads accumulates the
! dot product for its element of C(i,j)

          do k = 1,16
             Cij = Cij + Asub(tx,k) * Bsub(k,ty)
          enddo

! Synchronize to make sure all threads are done reading the submatrices before 
! overwriting them in the next iteration of the kb loop

          call syncthreads()

       enddo

! Each of the 16x16 threads stores its element to the global C array

       C(i,j) = Cij

    end subroutine mmul_kernel

! The host routine to drive the matrix multiplication

    subroutine mmul( A, B, C )

! assumed shape input arrays

       real, dimension(:,:) :: A, B, C

! Array dimensions

       integer :: N, M, L

! allocatable device arrays

       real, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev

! dim3 variables to define the grid and block shapes

       type(dim3) :: dimGrid, dimBlock
       integer :: r

! Timing and performance variables

       real ctimeall, ctimekernel, flops, mflopskernel, mflopsall
       integer c1, c2, c3, c4

! Begin execution, first determine the sizes of the input arrays

       N = size( A, 1 )
       M = size( A, 2 )
       L = size( B, 2 )

! Start data xfer-inclusive timer and allocate the device arrays using 
! F90 ALLOCATE

       call system_clock( count=c1 )
       allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )

! Copy A and B to the device using F90 array assignments

       Adev = A(1:N,1:M)
       Bdev = B(1:M,1:L)

! Create the grid and block dimensions

       dimGrid = dim3( N/16, L/16, 1 )
       dimBlock = dim3( 16, 16, 1 )
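
! Note: dimGrid assumes N and L are exact multiples of 16; mmul_kernel
! performs no bounds checking, so other sizes would need padding or checks.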

! Start data xfer-exclusive timer, launch the GPU kernel, wait for completion

       call system_clock( count=c2 )
       call mmul_kernel<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
       r = cudathreadsynchronize()

! Stop data xfer-exclusive timer, copy the results back, stop data xfer-
! inclusive timer

       call system_clock( count=c3 )
       C(1:N,1:L) = Cdev
       call system_clock( count=c4 )

! Calculate inclusive/exclusive execution times, and report MFLOPS

       flops = float(N) * float(M) * float(L)
       ctimekernel = c3 - c2
       mflopskernel = flops / ctimekernel
       ctimeall = c4 - c1
       mflopsall = flops / ctimeall

!  Print out results

       print *, 'Kernel time excluding data xfer:', ctimekernel, ' microseconds'
       

! Deallocate device arrays and exit

       deallocate( Adev, Bdev, Cdev )

    end subroutine mmul
end module mmul_mod

! Main program to initialize arrays, invoke mmul, check results

program matmul
   use mmul_mod
   real,dimension(:,:),allocatable :: A,B,C,CC
   integer N, M, L
   integer idevice, istat

! Begin execution

   N = 256
   M = 256
   L = 256
   idevice = 0
   print *,' arrays sized ', N, ' by ', M, ' by ', L
   allocate(A(N,M),B(M,L),C(N,L),CC(N,L))

! Initialize the A and B arrays;  zero out the C array to be computed
! on the GPU, and the CC array to be computed on the host

   do j = 1,M
      do i = 1,N
         A(i,j) = 1
      enddo
   enddo
   do j = 1,L
      do i = 1,M
         B(i,j) = 1
      enddo
   enddo
   do j = 1,L
      do i = 1,N
         CC(i,j) = 0.0
         C(i,j) = 0.0
      enddo
   enddo

! Initialize the GPU device

  istat = cudaSetDevice(idevice)  

! Call matrix multiply subroutine to execute on the GPU to compute C

  
   call mmul( A, B, C )
  
  !print*,C

  
end program

Hi kuldeep gupta,

How are you doing your timings?

Setting CUDA_PROFILE to 1 in my environment, I see the CUDA Fortran kernel taking 205 ms, with 134 ms to transfer data. The PGI Accelerator version takes 344 ms in the kernel and 134 ms to transfer data.
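
For reference, that is all I do to enable the profiler (csh syntax; on my system the log is written to cuda_profile_0.log by default, though the file name can vary by CUDA version):

% setenv CUDA_PROFILE 1
% ./a.out
% cat cuda_profile_0.log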

Note that in both cases I uncommented the print of “c”; otherwise, the compiler will optimise away some of your code.

  • Mat

OK.

One more thing I observed: even if I remove the “!$acc region” accelerator directives but still compile with the -ta=nvidia flag, I get the same execution time as when the directives are present.

So I am confused about whether the directives are working or not.

Hi kuldeep gupta,

Again, I’m guessing here since you don’t provide details as to your execution times or how you are measuring your results.

Going back to your first example, I updated it so that at least a part of “C” is printed. The compiler's optimiser can eliminate dead code like this, giving you the false impression that the CPU code is faster, since nothing is actually being computed.

Next, I initialised the GPU before your timers. At least on Linux, the GPU takes ~1 second per device to warm up, which can skew timings when running these very small programs.

Finally, I set the environment variable “ACC_NOTIFY” to show when a kernel is launched.

% cat ex.F90
program ex
#ifdef _ACCEL
  use accel_lib
#endif
implicit none
real :: a(256,256),b(256,256),c(256,256),t1,t2
integer i,j,k,sm
sm=0
#ifdef _ACCEL
 call acc_init(acc_device_nvidia)
#endif
   do j = 1,256
      do i = 1,256
         a(i,j) = 1
         b(i,j) = 1
         c(i,j) = 0.0
      enddo
   enddo
   call cpu_time(t1)
!$acc region
   do i = 1,256

      do j = 1,256

         sm = 0
         do k = 1,256

            sm = sm + a(i,k)*b(k,j)
            c(i,j) = sm
         end do
      end do
   end do
!$acc end region
   call cpu_time(t2)
   print*,"cpu time=",t2-t1
   print*,c(12,12)
   end program ex

% pgf90 ex.F90 -fast -Mpreprocess -o ex_cpu.out
% pgf90 ex.F90 -ta=nvidia,time -fast -Mpreprocess -o ex_gpu.out
% setenv ACC_NOTIFY 1
% ./ex_cpu.out
 cpu time=   6.2897921E-02
    256.0000
% ./ex_gpu.out
launch kernel  file=/tmp/qa/ex.F90 function=ex line=26 device=0 grid=16x16 block=16x16
 cpu time=   1.1019707E-03
    256.0000

Accelerator Kernel Timing data
/tmp/qa/ex.F90
  ex
    20: region entered 1 time
        time(us): total=1097
                  kernels=352 data=337
        26: kernel launched 1 times
            grid: [16x16]  block: [16x16]
            time(us): total=352 max=352 min=352 avg=352
acc_init.c
  acc_init
    42: region entered 1 time
        time(us): init=114125

So, the GPU version is roughly 50 times faster.

“so i am confused whether directives are working or not??”

This shows you what I did; hopefully you can figure it out on your side.

  • Mat

Also, if you fix your code so that the “c(i,j)=sm” assignment is placed after the “k” loop, the PGI Accelerator kernel time drops to 123 ms. The corrected inner loops are shown below.
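
That is:

           sm = 0
           do k = 1,256
              sm = sm + a(i,k)*b(k,j)
           end do
           c(i,j) = sm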

  • Mat