I am working on my thesis in CUDA Fortran and recently bumped into an unusual problem, which I reproduced using the matrix multiplication example code from PGI (https://www.pgroup.com/lit/samples/matmul.CUF). I created a global memory version of the matrix multiply alongside the shared memory one. In *single precision* the shared memory version is, as expected, faster than the global memory version. **However**, in *double precision* the shared memory version is *much slower* than the global one!

Profiling the shared memory kernel with

```
pgprof --events l1_shared_bank_conflict
```

I get no bank conflicts! So what is slowing down the shared memory kernel?
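In case it matters, the bank-conflict event may not be the only counter worth checking; the shared memory transaction metrics would show replays directly. Something along these lines (the metric names are the nvprof spellings, which pgprof is built on, so treat the exact names and the executable name as assumptions on my part):

```
pgprof --metrics shared_load_transactions_per_request,shared_store_transactions_per_request ./a.out
pgprof --metrics shared_efficiency ./a.out
```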

Kernel times from running and profiling the cases (the (!!!) marks the anomalous double precision results):

| GPU | Tile | Precision | Global memory kernel | Shared memory kernel |
| --- | --- | --- | --- | --- |
| NVIDIA Quadro M2200 | 16x16 | Single | 6.7 ms | 3.3 ms |
| NVIDIA Quadro M2200 | 16x16 | Double | 9.76 ms | 19 ms (!!!) |
| NVIDIA Quadro M2200 | 8x8 | Single | 4.54 ms | 2.34 ms |
| NVIDIA Quadro M2200 | 8x8 | Double | 6.67 ms | 10.42 ms (!!!) |
| NVIDIA Titan Xp | 8x8 | Single | 1.28 ms | 0.413 ms |
| NVIDIA Titan Xp | 8x8 | Double | 1.23 ms | 1.335 ms (!!!) |

Below are the two programs covering all four cases.

**Code for single precision (global and shared memory versions)**

```
! start the module containing the matrix multiply kernel
module mmul_mod
  use cudafor
contains

  ! Global memory version
  attributes(global) subroutine mmul_kernelglobal( A, B, C, N, M, L )
    real, device :: A(N,M), B(M,L), C(N,L)
    integer, value :: N, M, L
    integer :: i, j, kb, k, tx, ty
    real :: Cij
    tx = threadidx%x
    ty = threadidx%y
    i = (blockidx%x-1) * blockDim%x + tx
    j = (blockidx%y-1) * blockDim%y + ty
    Cij = 0.0
    do k = 1, M
      Cij = Cij + A(i,k) * B(k,j)
    enddo
    C(i,j) = Cij
  end subroutine mmul_kernelglobal

  ! mmul_kernelshared computes A*B into C where A is NxM, B is MxL, C is then NxL
  attributes(global) subroutine mmul_kernelshared( A, B, C, N, M, L )
    real, device :: A(N,M), B(M,L), C(N,L)
    integer, value :: N, M, L
    integer :: i, j, kb, k, tx, ty
    ! submatrices are declared to be in CUDA shared memory
    real, shared :: Asub(16,16), Bsub(16,16)
    ! the value of C(i,j) being computed, a temporary scalar
    real :: Cij
    ! Start execution, first get my thread indices
    tx = threadidx%x
    ty = threadidx%y
    ! This thread computes C(i,j) = sum(A(i,:) * B(:,j))
    i = (blockidx%x-1) * 16 + tx
    j = (blockidx%y-1) * 16 + ty
    Cij = 0.0
    ! Do the k loop in chunks of 16, the block size
    do kb = 1, M, 16
      ! Fill the submatrices; each of the 16x16 threads in the thread block
      ! loads one element of Asub and Bsub
      Asub(tx,ty) = A(i,kb+ty-1)
      Bsub(tx,ty) = B(kb+tx-1,j)
      ! Wait until all elements are filled
      call syncthreads()
      ! Multiply the two submatrices; each of the 16x16 threads accumulates the
      ! dot product for its element of C(i,j)
      do k = 1, 16
        Cij = Cij + Asub(tx,k) * Bsub(k,ty)
      enddo
      ! Synchronize to make sure all threads are done reading the submatrices before
      ! overwriting them in the next iteration of the kb loop
      call syncthreads()
    enddo
    ! Each of the 16x16 threads stores its element to the global C array
    C(i,j) = Cij
  end subroutine mmul_kernelshared

  ! The host routine to drive the matrix multiplication
  subroutine mmul( A, B, C )
    ! assumed shape input arrays
    real, dimension(:,:) :: A, B, C
    ! Array dimensions
    integer :: N, M, L
    ! allocatable device arrays
    real, device, allocatable, dimension(:,:) :: Adev, Bdev, Cdev
    ! dim3 variables to define the grid and block shapes
    type(dim3) :: dimGrid, dimBlock
    integer :: r
    ! Timing and performance variables
    real ctimeall, ctimekernel, flops, mflopskernel, mflopsall
    integer c1, c2, c3, c4
    ! Begin execution, first determine the sizes of the input arrays
    N = size( A, 1 )
    M = size( A, 2 )
    L = size( B, 2 )
    ! Start data xfer-inclusive timer and allocate the device arrays using
    ! F90 ALLOCATE
    call system_clock( count=c1 )
    allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )
    ! Copy A and B to the device using F90 array assignments
    Adev = A(1:N,1:M)
    Bdev = B(1:M,1:L)
    ! Create the grid and block dimensions
    dimGrid = dim3( N/16, L/16, 1 )
    dimBlock = dim3( 16, 16, 1 )
    ! Start data xfer-exclusive timer, launch the GPU kernel, wait for completion
    call system_clock( count=c2 )
    !call mmul_kernelglobal<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
    call mmul_kernelshared<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
    r = cudathreadsynchronize()
    ! Stop data xfer-exclusive timer, copy the results back, stop data xfer-
    ! inclusive timer
    call system_clock( count=c3 )
    C(1:N,1:L) = Cdev
    call system_clock( count=c4 )
    ! Calculate inclusive/exclusive execution times, and report MFLOPS
    flops = float(N) * float(M) * float(L)
    ctimekernel = c3 - c2
    mflopskernel = flops / ctimekernel
    ctimeall = c4 - c1
    mflopsall = flops / ctimeall
    ! Print out results
    print *, 'Kernel time excluding data xfer:', ctimekernel, ' microseconds'
    print *, 'Megaflops excluding data xfer: ', mflopskernel
    ! 4 bytes per single precision element
    print *, 'Bandwidth: ', (float(N) * float(M) + float(M) * float(L) + 2 * float(N) * float(L)) * 4 / ctimekernel
    print *, 'Total time including data xfer: ', ctimeall, ' microseconds'
    print *, 'Megaflops including data xfer: ', mflopsall
    ! Deallocate device arrays and exit
    deallocate( Adev, Bdev, Cdev )
  end subroutine mmul
end module mmul_mod

! Main program to initialize arrays, invoke mmul, check results
program matmul
  use mmul_mod
  real, dimension(:,:), allocatable :: A, B, C, CC
  integer N, M, L
  integer idevice, istat
  ! Begin execution
  N = 512
  M = 1024
  L = 512
  idevice = 0
  print *, ' arrays sized ', N, ' by ', M, ' by ', L
  allocate( A(N,M), B(M,L), C(N,L), CC(N,L) )
  ! Initialize the A and B arrays; zero out the C array to be computed
  ! on the GPU, and the CC array to be computed on the host
  do j = 1, M
    do i = 1, N
      A(i,j) = i*10 + j*1000
    enddo
  enddo
  do j = 1, L
    do i = 1, M
      B(i,j) = i-j
    enddo
  enddo
  do j = 1, L
    do i = 1, N
      CC(i,j) = 0.0
      C(i,j) = 0.0
    enddo
  enddo
  ! Select the GPU device
  istat = cudaSetDevice(idevice)
  ! Call matrix multiply subroutine to execute on the GPU to compute C
  print *, 'calling mmul'
  call mmul( A, B, C )
  print *, ' C(1,1) = ', C(1,1)
  print *, ' C(2,2) = ', C(2,2)
  ! Perform matrix multiply on host to compute CC
  do i = 1, N
    do j = 1, L
      do k = 1, M
        CC(i,j) = CC(i,j) + A(i,k)*B(k,j)
      enddo
    enddo
  enddo
  ! Check for errors
  ierr = 0
  do j = 1, L
    do i = 1, N
      diff = abs(C(i,j) - CC(i,j))
      denom = CC(i,j)
      if ( denom == 0.0 ) denom = 1.0
      error = diff / denom
      if ( error > 2.0e-5 ) then
        ierr = ierr + 1
        if ( ierr <= 10 ) then
          print *, 'C(',i,',',j,') = ', C(i,j), ' should be ', CC(i,j), ' error=', error
        endif
      endif
    enddo
  enddo
  if ( ierr == 0 ) then
    print *, ' No errors found'
  else
    print *, ierr, ' ERRORS FOUND!!!'
  endif
end program
```
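As a cross-check on the kernel-only timings above, the launch inside the `mmul` host routine could also be timed with CUDA events from the `cudafor` module instead of `system_clock`. A minimal sketch, assuming the extra declarations are added to `mmul` and the rest of the routine stays as posted:

```
! Sketch: event-based timing of the kernel launch inside mmul.
! Extra declarations needed in the host routine:
type(cudaEvent) :: evStart, evStop
real :: ktime_ms
integer :: istat

istat = cudaEventCreate(evStart)
istat = cudaEventCreate(evStop)

! Record an event on each side of the launch (stream 0)
istat = cudaEventRecord(evStart, 0)
call mmul_kernelshared<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
istat = cudaEventRecord(evStop, 0)
istat = cudaEventSynchronize(evStop)

! Elapsed time between the two events, in milliseconds
istat = cudaEventElapsedTime(ktime_ms, evStart, evStop)
print *, 'Kernel time (CUDA events): ', ktime_ms, ' ms'

istat = cudaEventDestroy(evStart)
istat = cudaEventDestroy(evStop)
```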

**Code for double precision (global and shared memory versions)**

Here I used `double precision` instead of `real(8)` just to test whether it makes a difference, but of course it didn't.

```
! start the module containing the matrix multiply kernel
module mmul_mod
  use cudafor
contains

  ! Global memory version
  attributes(global) subroutine mmul_kernelglobal( A, B, C, N, M, L )
    double precision, device :: A(N,M), B(M,L), C(N,L)
    integer, value :: N, M, L
    integer :: i, j, kb, k, tx, ty
    double precision :: Cij
    tx = threadidx%x
    ty = threadidx%y
    i = (blockidx%x-1) * blockDim%x + tx
    j = (blockidx%y-1) * blockDim%y + ty
    Cij = 0.0d0
    do k = 1, M
      Cij = Cij + A(i,k) * B(k,j)
    enddo
    C(i,j) = Cij
  end subroutine mmul_kernelglobal

  ! mmul_kernelshared computes A*B into C where A is NxM, B is MxL, C is then NxL
  attributes(global) subroutine mmul_kernelshared( A, B, C, N, M, L )
    double precision, device :: A(N,M), B(M,L), C(N,L)
    integer, value :: N, M, L
    integer :: i, j, kb, k, tx, ty
    ! submatrices are declared to be in CUDA shared memory
    double precision, shared :: Asub(16,16), Bsub(16,16)
    ! the value of C(i,j) being computed, a temporary scalar
    double precision :: Cij
    ! Start execution, first get my thread indices
    tx = threadidx%x
    ty = threadidx%y
    ! This thread computes C(i,j) = sum(A(i,:) * B(:,j))
    i = (blockidx%x-1) * 16 + tx
    j = (blockidx%y-1) * 16 + ty
    Cij = 0.0d0
    ! Do the k loop in chunks of 16, the block size
    do kb = 1, M, 16
      ! Fill the submatrices; each of the 16x16 threads in the thread block
      ! loads one element of Asub and Bsub
      Asub(tx,ty) = A(i,kb+ty-1)
      Bsub(tx,ty) = B(kb+tx-1,j)
      ! Wait until all elements are filled
      call syncthreads()
      ! Multiply the two submatrices; each of the 16x16 threads accumulates the
      ! dot product for its element of C(i,j)
      do k = 1, 16
        Cij = Cij + Asub(tx,k) * Bsub(k,ty)
      enddo
      ! Synchronize to make sure all threads are done reading the submatrices before
      ! overwriting them in the next iteration of the kb loop
      call syncthreads()
    enddo
    ! Each of the 16x16 threads stores its element to the global C array
    C(i,j) = Cij
  end subroutine mmul_kernelshared

  ! The host routine to drive the matrix multiplication
  subroutine mmul( A, B, C )
    ! assumed shape input arrays
    double precision, dimension(:,:) :: A, B, C
    ! Array dimensions
    integer :: N, M, L
    ! allocatable device arrays
    double precision, device, allocatable, dimension(:,:) :: Adev, Bdev, Cdev
    ! dim3 variables to define the grid and block shapes
    type(dim3) :: dimGrid, dimBlock
    integer :: r
    ! Timing and performance variables
    double precision ctimeall, ctimekernel, flops, mflopskernel, mflopsall
    integer c1, c2, c3, c4
    ! Begin execution, first determine the sizes of the input arrays
    N = size( A, 1 )
    M = size( A, 2 )
    L = size( B, 2 )
    ! Start data xfer-inclusive timer and allocate the device arrays using
    ! F90 ALLOCATE
    call system_clock( count=c1 )
    allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )
    ! Copy A and B to the device using F90 array assignments
    Adev = A(1:N,1:M)
    Bdev = B(1:M,1:L)
    ! Create the grid and block dimensions
    dimGrid = dim3( N/16, L/16, 1 )
    dimBlock = dim3( 16, 16, 1 )
    ! Start data xfer-exclusive timer, launch the GPU kernel, wait for completion
    call system_clock( count=c2 )
    !call mmul_kernelglobal<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
    call mmul_kernelshared<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
    r = cudathreadsynchronize()
    ! Stop data xfer-exclusive timer, copy the results back, stop data xfer-
    ! inclusive timer
    call system_clock( count=c3 )
    C(1:N,1:L) = Cdev
    call system_clock( count=c4 )
    ! Calculate inclusive/exclusive execution times, and report MFLOPS
    flops = dble(N) * dble(M) * dble(L)
    ctimekernel = c3 - c2
    mflopskernel = flops / ctimekernel
    ctimeall = c4 - c1
    mflopsall = flops / ctimeall
    ! Print out results
    print *, 'Kernel time excluding data xfer:', ctimekernel, ' microseconds'
    print *, 'Megaflops excluding data xfer: ', mflopskernel
    ! 8 bytes per double precision element
    print *, 'Bandwidth: ', (dble(N) * dble(M) + dble(M) * dble(L) + 2 * dble(N) * dble(L)) * 8 / ctimekernel
    print *, 'Total time including data xfer: ', ctimeall, ' microseconds'
    print *, 'Megaflops including data xfer: ', mflopsall
    ! Deallocate device arrays and exit
    deallocate( Adev, Bdev, Cdev )
  end subroutine mmul
end module mmul_mod

! Main program to initialize arrays, invoke mmul, check results
program matmul
  use mmul_mod
  double precision, dimension(:,:), allocatable :: A, B, C, CC
  integer N, M, L
  integer idevice, istat
  ! Begin execution
  N = 512
  M = 1024
  L = 512
  idevice = 0
  print *, ' arrays sized ', N, ' by ', M, ' by ', L
  allocate( A(N,M), B(M,L), C(N,L), CC(N,L) )
  ! Initialize the A and B arrays; zero out the C array to be computed
  ! on the GPU, and the CC array to be computed on the host
  do j = 1, M
    do i = 1, N
      A(i,j) = i*10 + j*1000
    enddo
  enddo
  do j = 1, L
    do i = 1, M
      B(i,j) = i-j
    enddo
  enddo
  do j = 1, L
    do i = 1, N
      CC(i,j) = 0.0
      C(i,j) = 0.0
    enddo
  enddo
  ! Select the GPU device
  istat = cudaSetDevice(idevice)
  ! Call matrix multiply subroutine to execute on the GPU to compute C
  print *, 'calling mmul'
  call mmul( A, B, C )
  print *, ' C(1,1) = ', C(1,1)
  print *, ' C(2,2) = ', C(2,2)
  ! Perform matrix multiply on host to compute CC
  do i = 1, N
    do j = 1, L
      do k = 1, M
        CC(i,j) = CC(i,j) + A(i,k)*B(k,j)
      enddo
    enddo
  enddo
  ! Check for errors
  ierr = 0
  do j = 1, L
    do i = 1, N
      diff = abs(C(i,j) - CC(i,j))
      denom = CC(i,j)
      if ( denom == 0.0 ) denom = 1.0
      error = diff / denom
      if ( error > 2.0e-5 ) then
        ierr = ierr + 1
        if ( ierr <= 10 ) then
          print *, 'C(',i,',',j,') = ', C(i,j), ' should be ', CC(i,j), ' error=', error
        endif
      endif
    enddo
  enddo
  if ( ierr == 0 ) then
    print *, ' No errors found'
  else
    print *, ierr, ' ERRORS FOUND!!!'
  endif
end program
```
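One experiment I still want to try for the double precision case is padding the leading dimension of the shared tiles, which is the usual way to perturb a bank-conflict pattern. A minimal sketch (only the two `shared` declarations differ from `mmul_kernelshared` above; whether it changes anything here is exactly what I'd be testing):

```
! Sketch: tiled kernel with the shared tiles padded from 16x16 to 17x16
! so that consecutive columns start in different banks.
attributes(global) subroutine mmul_kernelshared_padded( A, B, C, N, M, L )
  double precision, device :: A(N,M), B(M,L), C(N,L)
  integer, value :: N, M, L
  integer :: i, j, kb, k, tx, ty
  double precision, shared :: Asub(17,16), Bsub(17,16)  ! padded leading dimension
  double precision :: Cij
  tx = threadidx%x
  ty = threadidx%y
  i = (blockidx%x-1) * 16 + tx
  j = (blockidx%y-1) * 16 + ty
  Cij = 0.0d0
  do kb = 1, M, 16
    ! Only rows 1..16 of the padded tiles are ever touched
    Asub(tx,ty) = A(i,kb+ty-1)
    Bsub(tx,ty) = B(kb+tx-1,j)
    call syncthreads()
    do k = 1, 16
      Cij = Cij + Asub(tx,k) * Bsub(k,ty)
    enddo
    call syncthreads()
  enddo
  C(i,j) = Cij
end subroutine mmul_kernelshared_padded
```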