I am conducting my thesis on CUDA Fortran, and I recently bumped into an unusual problem that I can also reproduce with the matrix multiplication example code from PGI (https://www.pgroup.com/lit/samples/matmul.CUF). I added a global memory version of the matrix multiplication kernel alongside the shared memory one. In single precision the shared memory version is, as expected, faster than the global memory version; in double precision, however, the shared memory version is much slower than the global one!
Profiling the shared memory kernel with:
pgprof --events l1_shared_bank_conflict
I get no bank conflicts. So what is slowing down the shared memory kernel?
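(A side note on the event name, in case someone wants to reproduce the measurement: if I read the profiler documentation correctly, l1_shared_bank_conflict is a Fermi-era event, and on the Maxwell and Pascal cards used here the corresponding events should be shared_ld_bank_conflict and shared_st_bank_conflict, i.e. something like:
pgprof --events shared_ld_bank_conflict,shared_st_bank_conflict ./matmul
with ./matmul standing in for the actual executable name.)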
Running and profiling the cases on an NVIDIA Quadro M2200 with a 16x16 tile:
Single precision:
Global memory version kernel time: 6.7 ms
Shared memory version kernel time: 3.3 ms
Double precision:
Global memory version kernel time: 9.76 ms
Shared memory version kernel time: 19 ms (!!!)
Running the cases on the NVIDIA Quadro M2200 with an 8x8 tile:
Single precision:
Global memory version kernel time: 4.54 ms
Shared memory version kernel time: 2.34 ms
Double precision:
Global memory version kernel time: 6.67 ms
Shared memory version kernel time: 10.42 ms (!!!)
Also running the cases on an NVIDIA Titan Xp with an 8x8 tile:
Single precision:
Global memory version kernel time: 1.28 ms
Shared memory version kernel time: 0.413 ms
Double precision:
Global memory version kernel time: 1.23 ms
Shared memory version kernel time: 1.335 ms (!!!)
Below I attach the two programs covering the four cases.
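For anyone reproducing this: CUDA Fortran code like this is built with pgfortran, and something along the lines of the following should work (the compute-capability suboptions are my guesses to match the cards, cc50 for the Maxwell Quadro M2200 and cc60 for the Pascal Titan Xp):
pgfortran -O2 -Mcuda=cc50,cc60 matmul.cuf -o matmul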
Code for single precision (global and shared memory versions):
! start the module containing the matrix multiply kernels
module mmul_mod
  use cudafor
contains

  ! Global memory version: each thread computes one element of C,
  ! reading A and B directly from global memory
  attributes(global) subroutine mmul_kernelglobal( A, B, C, N, M, L )
    real, device :: A(N,M), B(M,L), C(N,L)
    integer, value :: N, M, L
    integer :: i, j, kb, k, tx, ty
    real :: Cij
    tx = threadidx%x
    ty = threadidx%y
    i = (blockidx%x-1) * blockDim%x + tx
    j = (blockidx%y-1) * blockDim%y + ty
    Cij = 0.0
    do k = 1, M
      Cij = Cij + A(i,k) * B(k,j)
    enddo
    C(i,j) = Cij
  end subroutine mmul_kernelglobal
  ! mmul_kernelshared computes A*B into C where A is NxM, B is MxL, C is then NxL
  attributes(global) subroutine mmul_kernelshared( A, B, C, N, M, L )
    real, device :: A(N,M), B(M,L), C(N,L)
    integer, value :: N, M, L
    integer :: i, j, kb, k, tx, ty
    ! submatrices are declared to be in CUDA shared memory
    real, shared :: Asub(16,16), Bsub(16,16)
    ! the value of C(i,j) being computed, a temporary scalar
    real :: Cij
    ! Start execution, first get my thread indices
    tx = threadidx%x
    ty = threadidx%y
    ! This thread computes C(i,j) = sum(A(i,:) * B(:,j))
    i = (blockidx%x-1) * 16 + tx
    j = (blockidx%y-1) * 16 + ty
    Cij = 0.0
    ! Do the k loop in chunks of 16, the block size
    do kb = 1, M, 16
      ! Fill the submatrices; each of the 16x16 threads in the thread block
      ! loads one element of Asub and Bsub
      Asub(tx,ty) = A(i,kb+ty-1)
      Bsub(tx,ty) = B(kb+tx-1,j)
      ! Wait until all elements are filled
      call syncthreads()
      ! Multiply the two submatrices; each of the 16x16 threads accumulates the
      ! dot product for its element of C(i,j)
      do k = 1, 16
        Cij = Cij + Asub(tx,k) * Bsub(k,ty)
      enddo
      ! Synchronize to make sure all threads are done reading the submatrices before
      ! overwriting them in the next iteration of the kb loop
      call syncthreads()
    enddo
    ! Each of the 16x16 threads stores its element to the global C array
    C(i,j) = Cij
  end subroutine mmul_kernelshared
  ! The host routine to drive the matrix multiplication
  subroutine mmul( A, B, C )
    ! assumed shape input arrays
    real, dimension(:,:) :: A, B, C
    ! Array dimensions
    integer :: N, M, L
    ! allocatable device arrays
    real, device, allocatable, dimension(:,:) :: Adev, Bdev, Cdev
    ! dim3 variables to define the grid and block shapes
    type(dim3) :: dimGrid, dimBlock
    integer :: r
    ! Timing and performance variables
    real :: ctimeall, ctimekernel, flops, mflopskernel, mflopsall
    integer :: c1, c2, c3, c4

    ! Begin execution, first determine the sizes of the input arrays
    N = size( A, 1 )
    M = size( A, 2 )
    L = size( B, 2 )
    ! Start data xfer-inclusive timer and allocate the device arrays using
    ! F90 ALLOCATE
    call system_clock( count=c1 )
    allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )
    ! Copy A and B to the device using F90 array assignments
    Adev = A(1:N,1:M)
    Bdev = B(1:M,1:L)
    ! Create the grid and block dimensions
    dimGrid = dim3( N/16, L/16, 1 )
    dimBlock = dim3( 16, 16, 1 )
    ! Start data xfer-exclusive timer, launch the GPU kernel, wait for completion
    ! (uncomment the global launch and comment the shared one to time the other version)
    call system_clock( count=c2 )
    !call mmul_kernelglobal<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
    call mmul_kernelshared<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
    r = cudathreadsynchronize()
    ! Stop data xfer-exclusive timer, copy the results back, stop data xfer-
    ! inclusive timer
    call system_clock( count=c3 )
    C(1:N,1:L) = Cdev
    call system_clock( count=c4 )
    ! Calculate inclusive/exclusive execution times, and report MFLOPS
    ! (system_clock counts are assumed to be microseconds, as in the PGI sample;
    ! flops counts multiply-add pairs, as the original sample does)
    flops = float(N) * float(M) * float(L)
    ctimekernel = c3 - c2
    mflopskernel = flops / ctimekernel
    ctimeall = c4 - c1
    mflopsall = flops / ctimeall
    ! Print out results
    print *, 'Kernel time excluding data xfer:', ctimekernel, ' microseconds'
    print *, 'Megaflops excluding data xfer: ', mflopskernel
    ! bytes moved / microseconds = MB/s; 4 bytes per single precision element
    print *, 'Bandwidth (MB/s): ', (float(N) * float(M) + float(M) * float(L) + 2 * float(N) * float(L)) * 4 / ctimekernel
    print *, 'Total time including data xfer: ', ctimeall, ' microseconds'
    print *, 'Megaflops including data xfer: ', mflopsall
    ! Deallocate device arrays and exit
    deallocate( Adev, Bdev, Cdev )
  end subroutine mmul
end module mmul_mod
! Main program to initialize arrays, invoke mmul, check results
program matmul
  use mmul_mod
  real, dimension(:,:), allocatable :: A, B, C, CC
  integer :: N, M, L
  integer :: idevice, istat
  integer :: i, j, k, ierr
  real :: diff, denom, error
  ! Begin execution
  N = 512
  M = 1024
  L = 512
  idevice = 0
  print *, ' arrays sized ', N, ' by ', M, ' by ', L
  allocate( A(N,M), B(M,L), C(N,L), CC(N,L) )
  ! Initialize the A and B arrays; zero out the C array to be computed
  ! on the GPU, and the CC array to be computed on the host
  do j = 1, M
    do i = 1, N
      A(i,j) = i*10 + j*1000
    enddo
  enddo
  do j = 1, L
    do i = 1, M
      B(i,j) = i - j
    enddo
  enddo
  do j = 1, L
    do i = 1, N
      CC(i,j) = 0.0
      C(i,j) = 0.0
    enddo
  enddo
  ! Select the GPU device
  istat = cudaSetDevice(idevice)
  ! Call matrix multiply subroutine to execute on the GPU to compute C
  print *, 'calling mmul'
  call mmul( A, B, C )
  print *, ' C(1,1) = ', C(1,1)
  print *, ' C(2,2) = ', C(2,2)
  ! Perform matrix multiply on host to compute CC
  do i = 1, N
    do j = 1, L
      do k = 1, M
        CC(i,j) = CC(i,j) + A(i,k)*B(k,j)
      enddo
    enddo
  enddo
  ! Check for errors
  ierr = 0
  do j = 1, L
    do i = 1, N
      diff = abs(C(i,j) - CC(i,j))
      denom = CC(i,j)
      if ( denom == 0.0 ) denom = 1.0
      error = diff / denom
      if ( error > 2.0e-5 ) then
        ierr = ierr + 1
        if ( ierr <= 10 ) then
          print *, 'C(',i,',',j,') = ', C(i,j), ' should be ', CC(i,j), ' error=', error
        endif
      endif
    enddo
  enddo
  if ( ierr == 0 ) then
    print *, ' No errors found'
  else
    print *, ierr, ' ERRORS FOUND!!!'
  endif
end program
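One caveat on the timing methodology before the double precision code: the kernel time above comes from system_clock calls bracketing the launch (with a cudathreadsynchronize in between), which should be fine, but CUDA events are the more standard way to time a kernel. A minimal sketch of a drop-in replacement for the timing block inside mmul, assuming the cudaEvent interfaces from the cudafor module (untested; startEv/stopEv are names I made up):

  ! Timing sketch (untested): time the kernel launch with CUDA events instead
  ! of system_clock; cudaEventElapsedTime reports milliseconds
  type(cudaEvent) :: startEv, stopEv
  real :: ms
  integer :: istat
  istat = cudaEventCreate( startEv )
  istat = cudaEventCreate( stopEv )
  istat = cudaEventRecord( startEv, 0 )
  call mmul_kernelshared<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
  istat = cudaEventRecord( stopEv, 0 )
  istat = cudaEventSynchronize( stopEv )
  istat = cudaEventElapsedTime( ms, startEv, stopEv )
  print *, 'kernel time: ', ms, ' ms'
  istat = cudaEventDestroy( startEv )
  istat = cudaEventDestroy( stopEv )

Since cudaEventElapsedTime reports milliseconds rather than microseconds, the MFLOPS arithmetic would need adjusting accordingly.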
Code for double precision (global and shared memory versions). Here I used double precision instead of real(8) just to test whether there's a difference, but of course there wasn't:
! start the module containing the matrix multiply kernels
module mmul_mod
  use cudafor
contains

  ! Global memory version: each thread computes one element of C,
  ! reading A and B directly from global memory
  attributes(global) subroutine mmul_kernelglobal( A, B, C, N, M, L )
    double precision, device :: A(N,M), B(M,L), C(N,L)
    integer, value :: N, M, L
    integer :: i, j, kb, k, tx, ty
    double precision :: Cij
    tx = threadidx%x
    ty = threadidx%y
    i = (blockidx%x-1) * 16 + tx
    j = (blockidx%y-1) * 16 + ty
    Cij = 0.0
    do k = 1, M
      Cij = Cij + A(i,k) * B(k,j)
    enddo
    C(i,j) = Cij
  end subroutine mmul_kernelglobal
  ! mmul_kernelshared computes A*B into C where A is NxM, B is MxL, C is then NxL
  attributes(global) subroutine mmul_kernelshared( A, B, C, N, M, L )
    double precision, device :: A(N,M), B(M,L), C(N,L)
    integer, value :: N, M, L
    integer :: i, j, kb, k, tx, ty
    ! submatrices are declared to be in CUDA shared memory
    double precision, shared :: Asub(16,16), Bsub(16,16)
    ! the value of C(i,j) being computed, a temporary scalar
    double precision :: Cij
    ! Start execution, first get my thread indices
    tx = threadidx%x
    ty = threadidx%y
    ! This thread computes C(i,j) = sum(A(i,:) * B(:,j))
    i = (blockidx%x-1) * 16 + tx
    j = (blockidx%y-1) * 16 + ty
    Cij = 0.0
    ! Do the k loop in chunks of 16, the block size
    do kb = 1, M, 16
      ! Fill the submatrices; each of the 16x16 threads in the thread block
      ! loads one element of Asub and Bsub
      Asub(tx,ty) = A(i,kb+ty-1)
      Bsub(tx,ty) = B(kb+tx-1,j)
      ! Wait until all elements are filled
      call syncthreads()
      ! Multiply the two submatrices; each of the 16x16 threads accumulates the
      ! dot product for its element of C(i,j)
      do k = 1, 16
        Cij = Cij + Asub(tx,k) * Bsub(k,ty)
      enddo
      ! Synchronize to make sure all threads are done reading the submatrices before
      ! overwriting them in the next iteration of the kb loop
      call syncthreads()
    enddo
    ! Each of the 16x16 threads stores its element to the global C array
    C(i,j) = Cij
  end subroutine mmul_kernelshared
  ! The host routine to drive the matrix multiplication
  subroutine mmul( A, B, C )
    ! assumed shape input arrays
    double precision, dimension(:,:) :: A, B, C
    ! Array dimensions
    integer :: N, M, L
    ! allocatable device arrays
    double precision, device, allocatable, dimension(:,:) :: Adev, Bdev, Cdev
    ! dim3 variables to define the grid and block shapes
    type(dim3) :: dimGrid, dimBlock
    integer :: r
    ! Timing and performance variables
    double precision :: ctimeall, ctimekernel, flops, mflopskernel, mflopsall
    integer :: c1, c2, c3, c4

    ! Begin execution, first determine the sizes of the input arrays
    N = size( A, 1 )
    M = size( A, 2 )
    L = size( B, 2 )
    ! Start data xfer-inclusive timer and allocate the device arrays using
    ! F90 ALLOCATE
    call system_clock( count=c1 )
    allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )
    ! Copy A and B to the device using F90 array assignments
    Adev = A(1:N,1:M)
    Bdev = B(1:M,1:L)
    ! Create the grid and block dimensions
    dimGrid = dim3( N/16, L/16, 1 )
    dimBlock = dim3( 16, 16, 1 )
    ! Start data xfer-exclusive timer, launch the GPU kernel, wait for completion
    ! (uncomment the global launch and comment the shared one to time the other version)
    call system_clock( count=c2 )
    !call mmul_kernelglobal<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
    call mmul_kernelshared<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
    r = cudathreadsynchronize()
    ! Stop data xfer-exclusive timer, copy the results back, stop data xfer-
    ! inclusive timer
    call system_clock( count=c3 )
    C(1:N,1:L) = Cdev
    call system_clock( count=c4 )
    ! Calculate inclusive/exclusive execution times, and report MFLOPS
    ! (system_clock counts are assumed to be microseconds, as in the PGI sample;
    ! flops counts multiply-add pairs, as the original sample does)
    flops = dble(N) * dble(M) * dble(L)
    ctimekernel = c3 - c2
    mflopskernel = flops / ctimekernel
    ctimeall = c4 - c1
    mflopsall = flops / ctimeall
    ! Print out results
    print *, 'Kernel time excluding data xfer:', ctimekernel, ' microseconds'
    print *, 'Megaflops excluding data xfer: ', mflopskernel
    ! bytes moved / microseconds = MB/s; 8 bytes per double precision element
    ! (the factor of 4 here was a leftover from the single precision code)
    print *, 'Bandwidth (MB/s): ', (dble(N) * dble(M) + dble(M) * dble(L) + 2 * dble(N) * dble(L)) * 8 / ctimekernel
    print *, 'Total time including data xfer: ', ctimeall, ' microseconds'
    print *, 'Megaflops including data xfer: ', mflopsall
    ! Deallocate device arrays and exit
    deallocate( Adev, Bdev, Cdev )
  end subroutine mmul
end module mmul_mod
! Main program to initialize arrays, invoke mmul, check results
program matmul
  use mmul_mod
  double precision, dimension(:,:), allocatable :: A, B, C, CC
  integer :: N, M, L
  integer :: idevice, istat
  integer :: i, j, k, ierr
  double precision :: diff, denom, error
  ! Begin execution
  N = 512
  M = 1024
  L = 512
  idevice = 0
  print *, ' arrays sized ', N, ' by ', M, ' by ', L
  allocate( A(N,M), B(M,L), C(N,L), CC(N,L) )
  ! Initialize the A and B arrays; zero out the C array to be computed
  ! on the GPU, and the CC array to be computed on the host
  do j = 1, M
    do i = 1, N
      A(i,j) = i*10 + j*1000
    enddo
  enddo
  do j = 1, L
    do i = 1, M
      B(i,j) = i - j
    enddo
  enddo
  do j = 1, L
    do i = 1, N
      CC(i,j) = 0.0
      C(i,j) = 0.0
    enddo
  enddo
  ! Select the GPU device
  istat = cudaSetDevice(idevice)
  ! Call matrix multiply subroutine to execute on the GPU to compute C
  print *, 'calling mmul'
  call mmul( A, B, C )
  print *, ' C(1,1) = ', C(1,1)
  print *, ' C(2,2) = ', C(2,2)
  ! Perform matrix multiply on host to compute CC
  do i = 1, N
    do j = 1, L
      do k = 1, M
        CC(i,j) = CC(i,j) + A(i,k)*B(k,j)
      enddo
    enddo
  enddo
  ! Check for errors
  ierr = 0
  do j = 1, L
    do i = 1, N
      diff = abs(C(i,j) - CC(i,j))
      denom = CC(i,j)
      if ( denom == 0.0 ) denom = 1.0
      error = diff / denom
      if ( error > 2.0e-5 ) then
        ierr = ierr + 1
        if ( ierr <= 10 ) then
          print *, 'C(',i,',',j,') = ', C(i,j), ' should be ', CC(i,j), ' error=', error
        endif
      endif
    enddo
  enddo
  if ( ierr == 0 ) then
    print *, ' No errors found'
  else
    print *, ierr, ' ERRORS FOUND!!!'
  endif
end program
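Since the profiler shows no bank conflicts, the other thing I would like to rule out is a difference in resource usage (registers, local memory) and hence occupancy between the two double precision kernels. Below is a minimal diagnostic sketch, assuming the cudaFuncGetAttributes interface and cudaFuncAttributes derived type from the cudafor module (untested; report_kernel_resources is a name I made up):

  ! Diagnostic sketch (untested): print per-kernel resource usage so the
  ! global and shared memory kernels can be compared for occupancy limits
  subroutine report_kernel_resources()
    use cudafor
    use mmul_mod
    type(cudaFuncAttributes) :: attr
    integer :: istat
    istat = cudaFuncGetAttributes( attr, mmul_kernelshared )
    print *, 'shared kernel: registers =', attr%numRegs, &
             ', shared bytes =', attr%sharedSizeBytes, &
             ', local bytes =', attr%localSizeBytes
    istat = cudaFuncGetAttributes( attr, mmul_kernelglobal )
    print *, 'global kernel: registers =', attr%numRegs, &
             ', shared bytes =', attr%sharedSizeBytes, &
             ', local bytes =', attr%localSizeBytes
  end subroutine report_kernel_resources

Alternatively, profiling each kernel with pgprof --metrics achieved_occupancy should show the same information at run time.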