Hi, all:

I compiled the following code using PGI Fortran 12.2 but the results were wrong. It was modified from the example on the website computing the product C of two matrices A and B. http://www.pgroup.com/lit/articles/insider/v1n3a2.htm

I found the data copy instruction doing something wrong because the results were right while compiled by PGI Visual Fortran 10.9.

```
C( (1+(Mn-1)*NB):min(N,Mn*NB), 1:L)= Cdev(1:min(NB,N-(Mn-1)*NB), 1:L)
```

Once the data copy instruction is replaced by the other one, everything is fine…

```
C( (1+(Mn-1)*NB):min(N,Mn*NB), 1:L)= Cdev(:,:)
```

Is it a bug in the 12.2 version?

```
module mmul_mod
use cudafor
contains
attributes(global) subroutine mmul_kernel( Adev, Bdev, Cdev, NB, M, L, Nn)
integer, value :: NB, M, L, Nn
real*8,device :: Adev(NB,M), Bdev(M,L), Cdev(NB,L)
integer :: i, j, kb, k, tx, ty
real*8 :: Cij
tx = threadidx%x
ty = threadidx%y
i = (blockidx%x-1) * 16 + tx
j = (blockidx%y-1) * 16 + ty
Cij = 0.d0
if ( i<=NB .and. j <= L)then
do k=1, M
Cij = Cij + Adev(i,k) * Bdev(k, j)
end do
Cdev(i,j) = Cij
else
call syncthreads()
end if
end subroutine mmul_kernel
!------------------------------------------------------------------------------------
! The host routine to drive the matrix multiplication
subroutine mmul( A, B, C )
real*8, dimension(:,:) :: A, B, C
integer :: N, M, L, NB
integer :: Nn, Mm, Ll
real*8, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev
real*8, device, dimension(16,16) :: Asub,Bsub
real*8, dimension(16,16) :: Ahsub,Bhsub
type(dim3) :: dimGrid, dimBlock
integer :: r,i,j
real ctimeall, ctimekernel, flops, mflopskernel, mflopsall
integer c1, c2, c3, c4
N = size( A, 1 )
M = size( A, 2 )
L = size( B, 2 )
NB=1024
Nn= (N-1)/NB +1
print*,N, M,L,NB,Nn
allocate( Adev(NB,M), Bdev(M,L), Cdev(NB,L) )
dimGrid = dim3( NB/16, (L-1)/16+1, 1 )
dimBlock = dim3( 16, 16, 1 )
call system_clock( count=c1 )
Bdev(1:M,1:L) = B(1:M,1:L)
do Mn=1, Nn
print*, "Using GPU without share memory", min(NB,N-(Mn-1)*NB), 1+(Mn-1)*NB, min(N,Mn*NB)
Adev(1:min(NB,N-(Mn-1)*NB),1:M) = A((1+(Mn-1)*NB):min(N,Mn*NB), 1:M)
call mmul_kernel<<<dimGrid>>>( Adev, Bdev, Cdev, NB, M, L, Mn )
r = cudathreadsynchronize()
! C( (1+(Mn-1)*NB):min(N,Mn*NB), 1:L)= Cdev(1:min(NB,N-(Mn-1)*NB), 1:L)
C( (1+(Mn-1)*NB):min(N,Mn*NB), 1:L)= Cdev(:,:)
end do
call system_clock( count=c4 )
! Calculate inclusive/exclusive execution times, and report MFLOPS
flops = float(N) * float(M) * float(L)
ctimeall = c4 - c1
mflopsall = flops / ctimeall
! Print out results
print *, 'Total time including data xfer: ', ctimeall, ' microseconds' , ctimeall/1.E6, ' seconds'
print *, 'Megaflops including data xfer: ', mflopsall
print *, ' '
deallocate( Adev, Bdev, Cdev)
end subroutine mmul
end module mmul_mod
! Main program to initialize arrays, invoke mmul, check results
program matmul
use mmul_mod
real*8,dimension(:,:),allocatable :: A,B,C,CC
integer N, M, L, i,j
integer idevice, istat
real:: s1, s2
! Begin execution
print*,"input N, M,L"
print*,"Ex. 1025, 1024, 512"
read(*,*)N, M, L
idevice = 0
print *,' arrays sized ', N, ' by ', M, ' by ', L
allocate(A(N,M),B(M,L),C(N,L),CC(N,L))
! Initialize the A and B arrays; zero out the C array to be computed
! on the GPU, and the CC array to be computed on the host
do j = 1,M
do i = 1,N
A(i,j) = dble(i*10 + j*100)
enddo
enddo
do j = 1,L
do i = 1,M
B(i,j) = dble(i-j)
enddo
enddo
do j = 1,L
do i = 1,N
CC(i,j) = 0.d0
C(i,j) = 0.d0
enddo
enddo
CC=0.d0
! Perform matrix multiply on host to compute CC
print *,'OpenMP 2 thread proceeding'
call cpu_time(s1)
!$omp parallel do num_threads(2)
do j = 1,L
do i = 1,N
do k = 1,M
CC(i,j) = CC(i,j) + A(i,k)*B(k,j)
enddo
enddo
enddo
!$omp end parallel do
call cpu_time(s2)
print *,'OpenMP 2 threads:', s2-s1, 'seconds'
print*,' '
! Initialize CPU device
istat = cudaSetDevice(idevice)
! Call matrix multiply subroutine to execute on the GPU to compute C
print *,'PGI CUDA proceeding'
call mmul( A, B, C )
! Check for errors
print*, " "
print*, "Check for errors between PGI CUDA Fortran and CPU"
ierr = 0
do j = 1,L
do i = 1,N
diff = abs(C(i,j) - CC(i,j))
denom = CC(i,j)
if ( denom == 0.0 ) denom = 1.0
error = diff / denom
if ( error > 2.d-5 ) then
ierr = ierr + 1
if ( ierr <= 10 ) then
print *, 'C(',i,',',j,') = ',C(i,j), ' should be ', CC(i,j), ' error=', error
endif
endif
enddo
enddo
if( ierr == 0 )then
print *, ' No errors found'
else
print *, ierr, ' ERRORS FOUND!!!'
endif
end program
```