here is “CUDA FORTRAN” code,it is same as given in PGI manual.

```
! start the module containing the matrix multiply kernel
module mmul_mod
use cudafor
contains
! mmul_kernel computes A*B into C where A is NxM, B is MxL, C is then NxL
attributes(global) subroutine mmul_kernel( A, B, C, N, M, L )
real,device :: A(N,M), B(M,L), C(N,L)
integer, value :: N, M, L
integer :: i, j, kb, k, tx, ty
! submatrices are declared to be in CUDA shared memory
real, shared :: Asub(16,16), Bsub(16,16)
! the value of C(i,j) being computed, a temporary scalar
real :: Cij
! Start execution, first get my thread indices
tx = threadidx%x
ty = threadidx%y
! This thread computes C(i,j) = sum(A(i,:) * B(:,j))
i = (blockidx%x-1) * 16 + tx
j = (blockidx%y-1) * 16 + ty
Cij = 0.0
! Do the k loop in chunks of 16, the block size
do kb = 1, M, 16
! Fill the submatrices; each of 16x16 threads in the thread block
! loads one element of Asub and Bsub
Asub(tx,ty) = A(i,kb+ty-1)
Bsub(tx,ty) = B(kb+tx-1,j)
! Wait until all elements are filled
call syncthreads()
! Multiply the two submatrices; ! Each of the 16x16 threads accumulates the
! dot product for its element of C(i,j)
do k = 1,16
Cij = Cij + Asub(tx,k) * Bsub(k,ty)
enddo
! Synchronize to make sure all threads are done reading the submatrices before
! overwriting them in the next iteration of the kb loop
call syncthreads()
enddo
! Each of the 16x16 threads stores its element to the global C array
C(i,j) = Cij
end subroutine mmul_kernel
! The host routine to drive the matrix multiplication
subroutine mmul( A, B, C )
! assumed shape input arrays
real, dimension(:,:) :: A, B, C
! Array dimensions
integer :: N, M, L
! allocatable device arrays
real, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev
! dim3 variables to define the grid and block shapes
type(dim3) :: dimGrid, dimBlock
integer :: r
! Get the array sizes
real ctimeall, ctimekernel, flops, mflopskernel, mflopsall
integer c1, c2, c3, c4
! Begin execution, first determine the sizes of the input arrays
N = size( A, 1 )
M = size( A, 2 )
L = size( B, 2 )
! Start data xfer-inclusive timer and allocate the device arrays using
! F90 ALLOCATE
call system_clock( count=c1 )
allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )
! Copy A and B to the device using F90 array assignments
Adev = A(1:N,1:M)
Bdev = B(1:M,1:L)
! Create the grid and block dimensions
dimGrid = dim3( N/16, L/16, 1 )
dimBlock = dim3( 16, 16, 1 )
! Start data xfer-exclusive timer, launch the GPU kernel, wait for completion
call system_clock( count=c2 )
call mmul_kernel<<<dimGrid>>>( Adev, Bdev, Cdev, N, M, L )
r = cudathreadsynchronize()
! Stop data xfer-exlusive timer, copy the results back, stop data xfer-
! inclusive timer
call system_clock( count=c3 )
C(1:N,1:L) = Cdev
call system_clock( count=c4 )
! Calculate inclusive/exclusive execution times, and report MFLOPS
flops = float(N) * float(M) * float(L)
ctimekernel = c3 - c2
mflopskernel = flops / ctimekernel
ctimeall = c4 - c1
mflopsall = flops / ctimeall
! Print out results
print *, 'Kernel time excluding data xfer:', ctimekernel, ' microseconds'
! Deallocate device arrays and exit
deallocate( Adev, Bdev, Cdev )
end subroutine mmul
end module mmul_mod
! Main program to initialize arrays, invoke mmul, check results
program matmul
use mmul_mod
real,dimension(:,:),allocatable :: A,B,C,CC
integer N, M, L
integer idevice, istat
! Begin execution
N = 256
M = 256
L = 256
idevice = 0
print *,' arrays sized ', N, ' by ', M, ' by ', L
allocate(A(N,M),B(M,L),C(N,L),CC(N,L))
! Initialize the A and B arrays; zero out the C array to be computed
! on the GPU, and the CC array to be computed on the host
do j = 1,M
do i = 1,N
A(i,j) = 1
enddo
enddo
do j = 1,L
do i = 1,M
B(i,j) = 1
enddo
enddo
do j = 1,L
do i = 1,N
CC(i,j) = 0.0
C(i,j) = 0.0
enddo
enddo
! Initialize CPU device
istat = cudaSetDevice(idevice)
! Call matrix multiply subroutine to execute on the GPU to compute C
call mmul( A, B, C )
!print*,C
end program
```