copy data from device

Hi, all:
I compiled the following code using PGI Fortran 12.2 but the results were wrong. It was modified from the example on the website computing the product C of two matrices A and B.
I found the data copy instruction doing something wrong because the results were right while compiled by PGI Visual Fortran 10.9.

C( (1+(Mn-1)*NB):min(N,Mn*NB), 1:L)= Cdev(1:min(NB,N-(Mn-1)*NB), 1:L)

Once the data copy instruction is replaced by the other one, everything is fine…

 C( (1+(Mn-1)*NB):min(N,Mn*NB), 1:L)= Cdev(:,:)

Is it a bug in the 12.2 version?

module mmul_mod
    use cudafor

    attributes(global) subroutine mmul_kernel( Adev, Bdev, Cdev, NB, M, L, Nn)
       integer, value :: NB, M, L, Nn
       real*8,device :: Adev(NB,M), Bdev(M,L), Cdev(NB,L)
       integer :: i, j, kb, k, tx, ty
       real*8 :: Cij

       tx = threadidx%x
       ty = threadidx%y

       i = (blockidx%x-1) * 16 + tx
       j = (blockidx%y-1) * 16 + ty

       Cij = 0.d0
       if ( i<=NB .and. j <= L)then
         do k=1, M
           Cij = Cij + Adev(i,k) * Bdev(k, j)
         end do
         Cdev(i,j) = Cij
         call syncthreads()
       end if

    end subroutine mmul_kernel
! The host routine to drive the matrix multiplication

    subroutine mmul( A, B, C )
       real*8, dimension(:,:) :: A, B, C
       integer :: N, M, L, NB
       integer :: Nn, Mm, Ll
       real*8, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev
       real*8, device, dimension(16,16) :: Asub,Bsub
       real*8, dimension(16,16) :: Ahsub,Bhsub
       type(dim3) :: dimGrid, dimBlock
       integer :: r,i,j
       real ctimeall, ctimekernel, flops, mflopskernel, mflopsall
       integer c1, c2, c3, c4

       N = size( A, 1 )
       M = size( A, 2 )
       L = size( B, 2 )

	   Nn= (N-1)/NB +1
	   print*,N, M,L,NB,Nn
	   allocate( Adev(NB,M), Bdev(M,L), Cdev(NB,L) )

       dimGrid = dim3( NB/16, (L-1)/16+1, 1 )
       dimBlock = dim3( 16, 16, 1 )

	   call system_clock( count=c1 )

	   Bdev(1:M,1:L) = B(1:M,1:L)
	   do Mn=1, Nn 

         print*, "Using GPU without share memory", min(NB,N-(Mn-1)*NB), 1+(Mn-1)*NB, min(N,Mn*NB)

         Adev(1:min(NB,N-(Mn-1)*NB),1:M) = A((1+(Mn-1)*NB):min(N,Mn*NB), 1:M)
         call mmul_kernel<<<dimGrid>>>( Adev, Bdev, Cdev, NB, M, L, Mn )
         r = cudathreadsynchronize()

!        C( (1+(Mn-1)*NB):min(N,Mn*NB), 1:L)= Cdev(1:min(NB,N-(Mn-1)*NB), 1:L)
        C( (1+(Mn-1)*NB):min(N,Mn*NB), 1:L)= Cdev(:,:)

       end do
       call system_clock( count=c4 )

	   ! Calculate inclusive/exclusive execution times, and report MFLOPS

       flops = float(N) * float(M) * float(L)
       ctimeall = c4 - c1
       mflopsall = flops / ctimeall

!  Print out results

       print *, 'Total time including data xfer: ', ctimeall, ' microseconds' , ctimeall/1.E6, ' seconds'
       print *, 'Megaflops including data xfer:  ', mflopsall
       print *, ' '
       deallocate( Adev, Bdev, Cdev)
    end subroutine mmul
end module mmul_mod

! Main program to initialize arrays, invoke mmul, check results

program matmul
   use mmul_mod
   real*8,dimension(:,:),allocatable :: A,B,C,CC
   integer N, M, L, i,j
   integer idevice, istat
   real:: s1, s2

! Begin execution
   print*,"input N, M,L"
   print*,"Ex. 1025, 1024, 512"
   read(*,*)N, M, L
   idevice = 0
   print *,' arrays sized ', N, ' by ', M, ' by ', L

! Initialize the A and B arrays;  zero out the C array to be computed
! on the GPU, and the CC array to be computed on the host

   do j = 1,M
      do i = 1,N
         A(i,j) = dble(i*10 + j*100)
   do j = 1,L
      do i = 1,M
         B(i,j) = dble(i-j)

   do j = 1,L
      do i = 1,N
         CC(i,j) = 0.d0
         C(i,j) = 0.d0

! Perform matrix multiply on host to compute CC
print *,'OpenMP 2 thread proceeding'
call cpu_time(s1)
!$omp parallel do num_threads(2)
   do j = 1,L
      do i = 1,N
         do k = 1,M
            CC(i,j) = CC(i,j) + A(i,k)*B(k,j)
!$omp end parallel do
call cpu_time(s2)
print *,'OpenMP 2 threads:', s2-s1, 'seconds'
print*,'  '

! Initialize CPU device
  istat = cudaSetDevice(idevice)  

! Call matrix multiply subroutine to execute on the GPU to compute C

   print *,'PGI CUDA proceeding'
   call mmul( A, B, C )

! Check for errors
   print*, " "
   print*, "Check for errors between PGI CUDA Fortran and CPU"
   ierr = 0
   do j = 1,L
      do i = 1,N
         diff = abs(C(i,j) - CC(i,j))
         denom = CC(i,j)
         if ( denom == 0.0 ) denom = 1.0
         error = diff / denom
         if ( error > 2.d-5 ) then
            ierr = ierr + 1
            if ( ierr <= 10 ) then
               print *, 'C(',i,',',j,') = ',C(i,j), ' should be ', CC(i,j), ' error=', error

   if( ierr == 0 )then
      print *, ' No errors found'
      print *, ierr, ' ERRORS FOUND!!!'
end program

Hi cyfengMIT,

Yes, this does look like a compiler issue so I wrote up a report (TPR#18544) and sent it to our compiler engineers. For the 2012 release, they did make many improvements on optimising sub-array data movement but obviously made a mistake with your code. Thank you for bringing it to our attention and we’ll work on getting it fixed as soon as we can.

  • Mat


TPR 18544 was fixed in the 12.5 release.

Thanks again for your report.