Shared memory double precision problem!

I am writing my thesis on CUDA Fortran and recently ran into an unusual problem, which I have reproduced with the matrix multiplication example code from PGI (https://www.pgroup.com/lit/samples/matmul.CUF). I added a global-memory version of the matrix multiply alongside the shared-memory one. In single precision the shared memory version is, as expected, faster than the global memory version. In double precision, however, the shared memory version is much slower than the global one!!!

Profiling the shared memory kernel with:
pgprof --events l1_shared_bank_conflict
I get no bank conflicts! So what's slowing down the shared memory kernel???
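For what it's worth, pgprof wraps nvprof, so I assume the standard nvprof shared-memory metrics are available too; bank conflicts would show up as extra shared transactions per request:

pgprof --metrics shared_efficiency,shared_load_transactions_per_request ./a.out

(./a.out here stands in for the compiled executable.)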


Running and profiling the cases on an NVIDIA Quadro M2200 with a 16x16 tile:
Single precision:
Global memory version kernel time: 6.7 ms
Shared memory version kernel time: 3.3 ms

Double precision:
Global memory version kernel time: 9.76 ms
Shared memory version kernel time: 19 ms (!!!)

Running the cases on an NVIDIA Quadro M2200 with an 8x8 tile:
Single precision:
Global memory version kernel time: 4.54 ms
Shared memory version kernel time: 2.34 ms

Double precision:
Global memory version kernel time: 6.67 ms
Shared memory version kernel time: 10.42 ms (!!!)

Also running the cases on an NVIDIA Titan Xp with an 8x8 tile:
Single precision:
Global memory version kernel time: 1.28 ms
Shared memory version kernel time: 0.413 ms

Double precision:
Global memory version kernel time: 1.23 ms
Shared memory version kernel time: 1.335 ms (!!!)
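Aside: these kernel-only times can also be taken with CUDA events instead of system_clock. A minimal sketch, not part of the posted code, reusing the launch variables from the mmul routine below:

   real :: ms
   integer :: istat
   type(cudaEvent) :: tstart, tstop
   istat = cudaEventCreate(tstart)
   istat = cudaEventCreate(tstop)
   istat = cudaEventRecord(tstart, 0)
   call mmul_kernelshared<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
   istat = cudaEventRecord(tstop, 0)
   istat = cudaEventSynchronize(tstop)
   istat = cudaEventElapsedTime(ms, tstart, tstop)   ! elapsed kernel time in milliseconds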


Below I attach the two programs covering all four cases!

Code for single precision (global and shared memory versions):

! start the module containing the matrix multiply kernel
module mmul_mod
    use cudafor
    contains


    ! Global memory version: each thread reads A and B directly from
    ! global memory while accumulating its dot product
    attributes(global) subroutine mmul_kernelglobal( A, B, C, N, M, L )
       real, device :: A(N,M), B(M,L), C(N,L)
       integer, value :: N, M, L
       integer :: i, j, k, tx, ty
       real :: Cij

! Map this thread to one element C(i,j)

       tx = threadidx%x
       ty = threadidx%y
       i = (blockidx%x-1) * blockDim%x + tx
       j = (blockidx%y-1) * blockDim%y + ty

! Accumulate the dot product straight from global memory

       Cij = 0.0
       do k = 1,M
          Cij = Cij + A(i,k) * B(k,j)
       enddo

       C(i,j) = Cij

    end subroutine mmul_kernelglobal



    ! mmul_kernel computes A*B into C where A is NxM, B is MxL, C is then NxL

    attributes(global) subroutine mmul_kernelshared( A, B, C, N, M, L )
       real,device :: A(N,M), B(M,L), C(N,L)
       integer, value :: N, M, L
       integer :: i, j, kb, k, tx, ty

! submatrices are declared to be in CUDA shared memory

       real, shared :: Asub(16,16), Bsub(16,16)

! the value of C(i,j) being computed, a temporary scalar

       real :: Cij

! Start execution, first get my thread indices

       tx = threadidx%x
       ty = threadidx%y

! This thread computes C(i,j) = sum(A(i,:) * B(:,j))

       i = (blockidx%x-1) * 16 + tx
       j = (blockidx%y-1) * 16 + ty

       Cij = 0.0

! Do the k loop in chunks of 16, the block size

       do kb = 1, M, 16

! Fill the submatrices; each of 16x16 threads in the thread block
! loads one element of Asub and Bsub

          Asub(tx,ty) = A(i,kb+ty-1)
          Bsub(tx,ty) = B(kb+tx-1,j)

! Wait until all elements are filled

          call syncthreads()

! Multiply the two submatrices; each of the 16x16 threads accumulates the
! dot product for its element of C(i,j)

          do k = 1,16
             Cij = Cij + Asub(tx,k) * Bsub(k,ty)
          enddo

! Synchronize to make sure all threads are done reading the submatrices before 
! overwriting them in the next iteration of the kb loop

          call syncthreads()

       enddo

! Each of the 16x16 threads stores its element to the global C array

       C(i,j) = Cij

    end subroutine mmul_kernelshared

! The host routine to drive the matrix multiplication

    subroutine mmul( A, B, C )

! assumed shape input arrays

       real, dimension(:,:) :: A, B, C

! Array dimensions

       integer :: N, M, L

! allocatable device arrays

       real, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev

! dim3 variables to define the grid and block shapes

       type(dim3) :: dimGrid, dimBlock
       integer :: r

! Timing and performance variables

       real ctimeall, ctimekernel, flops, mflopskernel, mflopsall
       integer c1, c2, c3, c4

! Begin execution, first determine the sizes of the input arrays

       N = size( A, 1 )
       M = size( A, 2 )
       L = size( B, 2 )

! Start data xfer-inclusive timer and allocate the device arrays using 
! F90 ALLOCATE

       call system_clock( count=c1 )
       allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )

! Copy A and B to the device using F90 array assignments

       Adev = A(1:N,1:M)
       Bdev = B(1:M,1:L)

! Create the grid and block dimensions

       dimGrid = dim3( N/16, L/16, 1 )
       dimBlock = dim3( 16, 16, 1 )

! Start data xfer-exclusive timer, launch the GPU kernel, wait for completion

       call system_clock( count=c2 )
       !call mmul_kernelglobal<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
       call mmul_kernelshared<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
       r = cudathreadsynchronize()

! Stop data xfer-exclusive timer, copy the results back, stop data xfer-
! inclusive timer

       call system_clock( count=c3 )
       C(1:N,1:L) = Cdev
       call system_clock( count=c4 )

! Calculate inclusive/exclusive execution times, and report MFLOPS
       
       flops = float(N) * float(M) * float(L)
       ctimekernel = c3 - c2
       mflopskernel = flops / ctimekernel
       ctimeall = c4 - c1
       mflopsall = flops / ctimeall

!  Print out results

       print *, 'Kernel time excluding data xfer:', ctimekernel, ' microseconds'
       print *, 'Megaflops excluding data xfer:  ', mflopskernel
       print *, 'Bandwidth:  ', (float(N) * float(M) + float(M) * float(L) + 2 * float(N) * float(L)) * 4 / ctimekernel   ! 4 bytes per element in single precision
       print *, 'Total time including data xfer: ', ctimeall, ' microseconds' 
       print *, 'Megaflops including data xfer:  ', mflopsall

! Deallocate device arrays and exit

       deallocate( Adev, Bdev, Cdev )

    end subroutine mmul
end module mmul_mod

! Main program to initialize arrays, invoke mmul, check results

program matmul
   use mmul_mod
   real,dimension(:,:),allocatable :: A,B,C,CC
   integer N, M, L
   integer idevice, istat

! Begin execution

   N = 512
   M = 1024
   L = 512
   idevice = 0
   print *,' arrays sized ', N, ' by ', M, ' by ', L
   allocate(A(N,M),B(M,L),C(N,L),CC(N,L))

! Initialize the A and B arrays;  zero out the C array to be computed
! on the GPU, and the CC array to be computed on the host

   do j = 1,M
      do i = 1,N
         A(i,j) = i*10 + j*1000
      enddo
   enddo
   do j = 1,L
      do i = 1,M
         B(i,j) = i-j
      enddo
   enddo
   do j = 1,L
      do i = 1,N
         CC(i,j) = 0.0
         C(i,j) = 0.0
      enddo
   enddo

! Initialize the GPU device

   istat = cudaSetDevice(idevice)

! Call matrix multiply subroutine to execute on the GPU to compute C

   print *,'calling mmul'
   call mmul( A, B, C )
   print *,' C(1,1) = ', C(1,1)
   print *,' C(2,2) = ', C(2,2)

! Perform matrix multiply on host to compute CC

   do i = 1,N
      do j = 1,L
         do k = 1,M
            CC(i,j) = CC(i,j) + A(i,k)*B(k,j)
         enddo
      enddo
   enddo

! Check for errors

   ierr = 0
   do j = 1,L
      do i = 1,N
         diff = abs(C(i,j) - CC(i,j))
         denom = CC(i,j)
         if ( denom == 0.0 ) denom = 1.0
         error = diff / denom
         if ( error > 2.0e-5 ) then
            ierr = ierr + 1
            if ( ierr <= 10 ) then
               print *, 'C(',i,',',j,') = ',C(i,j), ' should be ', CC(i,j), ' error=', error
            endif
         endif
      enddo
   enddo

   if( ierr == 0 )then
      print *, ' No errors found'
   else
      print *, ierr, ' ERRORS FOUND!!!'
   endif

end program
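(Aside, not part of the PGI sample: a kind parameter would let one source file build either precision instead of keeping two nearly identical copies. A minimal sketch, assuming a compiler where the kind value equals the byte size, as with PGI:

module prec_mod
    ! hypothetical module: set wp = 4 for single or wp = 8 for double and rebuild
    integer, parameter :: wp = 8
end module prec_mod

The kernels and host code would then declare real(wp), device :: A(N,M), real(wp), shared :: Asub(16,16), and so on, with literals written as 0.0_wp.)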

Code for double precision (global and shared memory versions). Here I used double precision instead of real(8) just to test whether it made a difference, but of course it didn't.

! start the module containing the matrix multiply kernel
module mmul_mod
    use cudafor
    contains


    ! Global memory version: each thread reads A and B directly from
    ! global memory while accumulating its dot product
    attributes(global) subroutine mmul_kernelglobal( A, B, C, N, M, L )
       double precision, device :: A(N,M), B(M,L), C(N,L)
       integer, value :: N, M, L
       integer :: i, j, k, tx, ty
       double precision :: Cij

! Map this thread to one element C(i,j)

       tx = threadidx%x
       ty = threadidx%y
       i = (blockidx%x-1) * blockDim%x + tx
       j = (blockidx%y-1) * blockDim%y + ty

! Accumulate the dot product straight from global memory

       Cij = 0.0d0
       do k = 1,M
          Cij = Cij + A(i,k) * B(k,j)
       enddo

       C(i,j) = Cij

    end subroutine mmul_kernelglobal



    ! mmul_kernel computes A*B into C where A is NxM, B is MxL, C is then NxL

    attributes(global) subroutine mmul_kernelshared( A, B, C, N, M, L )
       double precision,device :: A(N,M), B(M,L), C(N,L)
       integer, value :: N, M, L
       integer :: i, j, kb, k, tx, ty

! submatrices are declared to be in CUDA shared memory

       double precision, shared :: Asub(16,16), Bsub(16,16)

! the value of C(i,j) being computed, a temporary scalar

       double precision :: Cij

! Start execution, first get my thread indices

       tx = threadidx%x
       ty = threadidx%y

! This thread computes C(i,j) = sum(A(i,:) * B(:,j))

       i = (blockidx%x-1) * 16 + tx
       j = (blockidx%y-1) * 16 + ty

       Cij = 0.0d0

! Do the k loop in chunks of 16, the block size

       do kb = 1, M, 16

! Fill the submatrices; each of 16x16 threads in the thread block
! loads one element of Asub and Bsub

          Asub(tx,ty) = A(i,kb+ty-1)
          Bsub(tx,ty) = B(kb+tx-1,j)

! Wait until all elements are filled

          call syncthreads()

! Multiply the two submatrices; each of the 16x16 threads accumulates the
! dot product for its element of C(i,j)

          do k = 1,16
             Cij = Cij + Asub(tx,k) * Bsub(k,ty)
          enddo

! Synchronize to make sure all threads are done reading the submatrices before 
! overwriting them in the next iteration of the kb loop

          call syncthreads()

       enddo

! Each of the 16x16 threads stores its element to the global C array

       C(i,j) = Cij

    end subroutine mmul_kernelshared

! The host routine to drive the matrix multiplication

    subroutine mmul( A, B, C )

! assumed shape input arrays

       double precision, dimension(:,:) :: A, B, C

! Array dimensions

       integer :: N, M, L

! allocatable device arrays

       double precision, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev

! dim3 variables to define the grid and block shapes

       type(dim3) :: dimGrid, dimBlock
       integer :: r

! Timing and performance variables

       double precision ctimeall, ctimekernel, flops, mflopskernel, mflopsall
       integer c1, c2, c3, c4

! Begin execution, first determine the sizes of the input arrays

       N = size( A, 1 )
       M = size( A, 2 )
       L = size( B, 2 )

! Start data xfer-inclusive timer and allocate the device arrays using 
! F90 ALLOCATE

       call system_clock( count=c1 )
       allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )

! Copy A and B to the device using F90 array assignments

       Adev = A(1:N,1:M)
       Bdev = B(1:M,1:L)

! Create the grid and block dimensions

       dimGrid = dim3( N/16, L/16, 1 )
       dimBlock = dim3( 16, 16, 1 )

! Start data xfer-exclusive timer, launch the GPU kernel, wait for completion

       call system_clock( count=c2 )
       !call mmul_kernelglobal<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
       call mmul_kernelshared<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
       r = cudathreadsynchronize()

! Stop data xfer-exclusive timer, copy the results back, stop data xfer-
! inclusive timer

       call system_clock( count=c3 )
       C(1:N,1:L) = Cdev
       call system_clock( count=c4 )

! Calculate inclusive/exclusive execution times, and report MFLOPS
       
       flops = float(N) * float(M) * float(L)
       ctimekernel = c3 - c2
       mflopskernel = flops / ctimekernel
       ctimeall = c4 - c1
       mflopsall = flops / ctimeall

!  Print out results

       print *, 'Kernel time excluding data xfer:', ctimekernel, ' microseconds'
       print *, 'Megaflops excluding data xfer:  ', mflopskernel
       print *, 'Bandwidth:  ', (float(N) * float(M) + float(M) * float(L) + 2 * float(N) * float(L)) * 8 / ctimekernel   ! 8 bytes per element in double precision
       print *, 'Total time including data xfer: ', ctimeall, ' microseconds' 
       print *, 'Megaflops including data xfer:  ', mflopsall

! Deallocate device arrays and exit

       deallocate( Adev, Bdev, Cdev )

    end subroutine mmul
end module mmul_mod

! Main program to initialize arrays, invoke mmul, check results

program matmul
   use mmul_mod
   double precision,dimension(:,:),allocatable :: A,B,C,CC
   integer N, M, L
   integer idevice, istat

! Begin execution

   N = 512
   M = 1024
   L = 512
   idevice = 0
   print *,' arrays sized ', N, ' by ', M, ' by ', L
   allocate(A(N,M),B(M,L),C(N,L),CC(N,L))

! Initialize the A and B arrays;  zero out the C array to be computed
! on the GPU, and the CC array to be computed on the host

   do j = 1,M
      do i = 1,N
         A(i,j) = i*10 + j*1000
      enddo
   enddo
   do j = 1,L
      do i = 1,M
         B(i,j) = i-j
      enddo
   enddo
   do j = 1,L
      do i = 1,N
         CC(i,j) = 0.0
         C(i,j) = 0.0
      enddo
   enddo

! Initialize the GPU device

   istat = cudaSetDevice(idevice)

! Call matrix multiply subroutine to execute on the GPU to compute C

   print *,'calling mmul'
   call mmul( A, B, C )
   print *,' C(1,1) = ', C(1,1)
   print *,' C(2,2) = ', C(2,2)

! Perform matrix multiply on host to compute CC

   do i = 1,N
      do j = 1,L
         do k = 1,M
            CC(i,j) = CC(i,j) + A(i,k)*B(k,j)
         enddo
      enddo
   enddo

! Check for errors

   ierr = 0
   do j = 1,L
      do i = 1,N
         diff = abs(C(i,j) - CC(i,j))
         denom = CC(i,j)
         if ( denom == 0.0 ) denom = 1.0
         error = diff / denom
         if ( error > 2.0e-5 ) then
            ierr = ierr + 1
            if ( ierr <= 10 ) then
               print *, 'C(',i,',',j,') = ',C(i,j), ' should be ', CC(i,j), ' error=', error
            endif
         endif
      enddo
   enddo

   if( ierr == 0 )then
      print *, ' No errors found'
   else
      print *, ierr, ' ERRORS FOUND!!!'
   endif

end program

It’s hard to know for sure, but it may be that using twice as much shared memory is hurting your occupancy. You can use the NVIDIA occupancy calculator to determine if the amount of shared memory use per block is limiting the total number of blocks running, for your GPU.