Device code can only access a module’s device data if it’s contained in the same module. So in this case, you just need to make mult a contained subroutine within “anymodule”

It seems to be more than that. Even with NO device data, it seems that a global ( kernel ) routine must be in a module.

Is this the case? This is not documented as far as I can see. CUDA/fortran sec. 3.1.4 does state a device&host routine must be in a module.

Putting the global routines in a module will unfortunately enforce parameter type checking, and I am having problems porting ( for performance ) a fortran-77 style code which aliases arrays as formal parameters.

For example, deleting the module definition module/end-module and use statements from the example code gives the same sort of unsat. external message.

/tmp/pgfortran_tOg63owqMhb.o: In function `mmul_': /home/users/saraha/./matmul.CUF:121: undefined reference to `

mmul_kernel_’

```
! start the module containing the matrix multiply kernel
!module mmul_mod
! use cudafor
! contains
! mmul_kernel computes A*B into C where A is NxM, B is MxL, C is then NxL
attributes(global) subroutine mmul_kernel( A, B, C, N, M, L )
use cudafor
real,device :: A(N,M), B(M,L), C(N,L)
integer, value :: N, M, L
integer :: i, j, kb, k, tx, ty
! submatrices are declared to be in CUDA shared memory
real, shared :: Asub(16,16), Bsub(16,16)
! the value of C(i,j) being computed, a temporary scalar
real :: Cij
! Start execution, first get my thread indices
tx = threadidx%x
ty = threadidx%y
! This thread computes C(i,j) = sum(A(i,:) * B(:,j))
i = (blockidx%x-1) * 16 + tx
j = (blockidx%y-1) * 16 + ty
Cij = 0.0
! Do the k loop in chunks of 16, the block size
do kb = 1, M, 16
! Fill the submatrices; each of 16x16 threads in the thread block
! loads one element of Asub and Bsub
Asub(tx,ty) = A(i,kb+ty-1)
Bsub(tx,ty) = B(kb+tx-1,j)
! Wait until all elements are filled
call syncthreads()
! Multiply the two submatrices; ! Each of the 16x16 threads accumulates the
! dot product for its element of C(i,j)
do k = 1,16
Cij = Cij + Asub(tx,k) * Bsub(k,ty)
enddo
! Synchronize to make sure all threads are done reading the submatrices before
! overwriting them in the next iteration of the kb loop
call syncthreads()
enddo
! Each of the 16x16 threads stores its element to the global C array
C(i,j) = Cij
end subroutine mmul_kernel
! The host routine to drive the matrix multiplication
subroutine mmul( A, B, C )
use cudafor
! assumed shape input arrays
real, dimension(:,:) :: A, B, C
! Array dimensions
integer :: N, M, L
! allocatable device arrays
real, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev
! dim3 variables to define the grid and block shapes
type(dim3) :: dimGrid, dimBlock
integer :: r
! Get the array sizes
real ctimeall, ctimekernel, flops, mflopskernel, mflopsall
integer c1, c2, c3, c4
! Begin execution, first determine the sizes of the input arrays
N = size( A, 1 )
M = size( A, 2 )
L = size( B, 2 )
! Start data xfer-inclusive timer and allocate the device arrays using
! F90 ALLOCATE
call system_clock( count=c1 )
allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )
! Copy A and B to the device using F90 array assignments
Adev = A(1:N,1:M)
Bdev = B(1:M,1:L)
! Create the grid and block dimensions
dimGrid = dim3( N/16, L/16, 1 )
dimBlock = dim3( 16, 16, 1 )
! Start data xfer-exclusive timer, launch the GPU kernel, wait for completion
call system_clock( count=c2 )
call mmul_kernel<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
r = cudathreadsynchronize()
! Stop data xfer-exlusive timer, copy the results back, stop data xfer-
! inclusive timer
call system_clock( count=c3 )
C(1:N,1:L) = Cdev
call system_clock( count=c4 )
! Calculate inclusive/exclusive execution times, and report MFLOPS
flops = float(N) * float(M) * float(L)
ctimekernel = c3 - c2
mflopskernel = flops / ctimekernel
ctimeall = c4 - c1
mflopsall = flops / ctimeall
! Print out results
print *, 'Kernel time excluding data xfer:', ctimekernel, ' microseconds'
print *, 'Megaflops excluding data xfer: ', mflopskernel
print *, 'Total time including data xfer: ', ctimeall, ' microseconds'
print *, 'Megaflops including data xfer: ', mflopsall
! Deallocate device arrays and exit
deallocate( Adev, Bdev, Cdev )
end subroutine mmul
!end module mmul_mod
! Main program to initialize arrays, invoke mmul, check results
program matmul
! use mmul_mod
use cudafor
real,dimension(:,:),allocatable :: A,B,C,CC
integer N, M, L
integer idevice, istat
! Begin execution
N = 512
M = 1024
L = 512
idevice = 0
print *,' arrays sized ', N, ' by ', M, ' by ', L
allocate(A(N,M),B(M,L),C(N,L),CC(N,L))
! Initialize the A and B arrays; zero out the C array to be computed
! on the GPU, and the CC array to be computed on the host
do j = 1,M
do i = 1,N
A(i,j) = i*10 + j*1000
enddo
enddo
do j = 1,L
do i = 1,M
B(i,j) = i-j
enddo
enddo
do j = 1,L
do i = 1,N
CC(i,j) = 0.0
C(i,j) = 0.0
enddo
enddo
! Initialize CPU device
istat = cudaSetDevice(idevice)
! Call matrix multiply subroutine to execute on the GPU to compute C
print *,'calling mmul'
call mmul( A, B, C )
print *,' C(1,1) = ', C(1,1)
print *,' C(2,2) = ', C(2,2)
! Perform matrix multiply on host to compute CC
do i = 1,N
do j = 1,L
do k = 1,M
CC(i,j) = CC(i,j) + A(i,k)*B(k,j)
enddo
enddo
enddo
! Check for errors
ierr = 0
do j = 1,L
do i = 1,N
diff = abs(C(i,j) - CC(i,j))
denom = CC(i,j)
if ( denom == 0.0 ) denom = 1.0
error = diff / denom
if ( error > 2.0e-5 ) then
ierr = ierr + 1
if ( ierr <= 10 ) then
print *, 'C(',i,',',j,') = ',C(i,j), ' should be ', CC(i,j), ' error=', error
endif
endif
enddo
enddo
if( ierr == 0 )then
print *, ' No errors found'
else
print *, ierr, ' ERRORS FOUND!!!'
endif
end program
```