CUDA Fortran matrix-multiply 10x slower than CUDA C version

I have two CUDA programs (one in C and one in Fortran) that perform the same computation, but for some reason the kernel call in the C version is 10 times faster.

While trying to discover why a different CUDA Fortran program was running slowly, I compared the performance of the CUDA C matrix multiply found in section 3.2.2 of the NVIDIA CUDA Programming Guide (version 3.0) against a Fortran version that I believe uses the same algorithm. I timed only the kernel calls (I'm not worried about memory-copy times right now) when multiplying two 960x960 matrices of floats: the kernel in the C version takes 0.009 seconds, while the Fortran kernel takes 0.108 seconds (10x longer). Clearly I'm missing something. Can anyone explain this?

The code is below. I'm using a 16x16 block size, and I'm assuming the matrices are square with width and height divisible by the block dimensions (16).

Thanks very much
Jim

CODE:

C version
(main function first, then the MatMul host function, with the kernel and device helpers at the end)

//matrix multiply example from NVIDIA_CUDA_Programming_Guide_3.0 page 24
//SHARED memory version

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

typedef struct {
     int width;
     int height;
     int stride;
     float* elements;
} Matrix;

#define BLOCK_SIZE 16

__global__ void MatMulKernel(Matrix, Matrix, Matrix);
void MatMul(Matrix, Matrix, Matrix);
__device__ float GetElement(const Matrix, int, int);
__device__ void SetElement(Matrix, int, int, float);
__device__ Matrix GetSubMatrix(Matrix, int, int);

int main(int argc, char *argv[])
{
     // for now, assume matrices A and B have same dimensions

     int matrix_width;
     int matrix_height;
     if (argc < 3) {
          printf("\nERROR: not enough arguments.\n");
          printf("arg1: (int)matrix_width, arg2: (int)matrix_height\n");
          printf("\nQuitting program.\n\n");
          return 1;
     } else {
          // assume argv[1] and argv[2] are ints
          matrix_width = atoi(argv[1]);
          matrix_height = atoi(argv[2]);
          printf("\nmatrix_width: %i\nmatrix_height: %i\n\n", matrix_width, matrix_height);
          if (matrix_width != matrix_height || matrix_width%16 != 0 || matrix_height%16 != 0) {
               printf("\nERROR: cannot multiply matrices\n");
               printf("For simplicity, matrix must be square,\n");
               printf("and matrix height and width must be multiples of 16 (BLOCKSIZE)\n\n");
               return 1;
          }

     }

     Matrix A,B,C;
     A.width = B.width = C.width = matrix_width;
     A.height = B.height = C.height = matrix_height;
     size_t size = matrix_width*matrix_height*sizeof(float);
     A.elements = (float *)malloc(size);
     B.elements = (float *)malloc(size);
     C.elements = (float *)malloc(size);

     // initialize
     for (int i = 0; i < A.height; ++i) {
          for (int j = 0; j < A.width; ++j) {
               A.elements[i*A.width + j] = (float)i-j;
               B.elements[i*B.width + j] = (float)i+j;
               C.elements[i*C.width + j] = (float)0;
          }
     }

     MatMul(A,B,C);

     return 0;
}

//host code, assume Matrix dimensions are multiples of BLOCK_SIZE
void MatMul(Matrix A, Matrix B, Matrix C)
{

     Matrix d_A;  // d means device
     d_A.width = d_A.stride = A.width;
     d_A.height = A.height;
     size_t size = A.width*A.height*sizeof(float);
     cudaMalloc((void**)&d_A.elements, size);
     cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);

     Matrix d_B; 
     d_B.width = d_B.stride = B.width;
     d_B.height = B.height;
     size = B.width*B.height*sizeof(float);
     cudaMalloc((void**)&d_B.elements, size);
     cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);

     Matrix d_C; 
     d_C.width = d_C.stride = C.width;
     d_C.height = C.height;
     size = C.width*C.height*sizeof(float);
     cudaMalloc((void**)&d_C.elements, size);
     
     // kernel invocation
     dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
     dim3 dimGrid(B.width/dimBlock.x, A.height/dimBlock.y);

     double time1 = ((double)clock())/CLOCKS_PER_SEC;
     MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
     cudaThreadSynchronize();
     double time2 = ((double)clock())/CLOCKS_PER_SEC;
     printf("Time (seconds): %lf\n", time2-time1);

     // Get result
     cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);

     printf("\n Testing result... (this may take a few seconds) \n\n");

     // Test Matrix
     Matrix CC;
     CC.width = C.width;
     CC.height = C.height;
     CC.stride = C.stride;
     CC.elements = (float *)malloc(C.width*C.height*sizeof(float));

     // Test Multiply
     float temp = 0;
     for(int j = 0; j < B.width; ++j) {
          for (int i = 0; i < A.height; ++i) {
               temp = 0;
               for (int k = 0; k < A.width; ++k) {
                    temp+= A.elements[i*A.width + k] * B.elements[k*B.width + j];
               }
               CC.elements[i*CC.width + j] = temp;
          }
     }


     for (int i = 0; i < C.height; ++i) {
          for (int j = 0; j < C.width; ++j) {
//               printf("(%i,%i): %f ?= %f\n", i, j, CC.elements[i*CC.width + j], C.elements[i*C.width + j]);
               if (CC.elements[i*CC.width + j] != C.elements[i*C.width + j]) {
                    printf("FAILURE\n");
                    printf("(%i,%i): %f ?= %f\n", i, j, CC.elements[i*CC.width + j], C.elements[i*C.width + j]);
               }
          }
     }

     cudaFree(d_A.elements);
     cudaFree(d_B.elements);
     cudaFree(d_C.elements);
}

//kernel
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
     int blockRow = blockIdx.y;
     int blockCol = blockIdx.x;

     // Each thread block computes one sub-matrix Csub of C
     // This is in GLOBAL memory?
     Matrix Csub = GetSubMatrix(C, blockRow, blockCol);

     // Each thread computes one element of Csub
     // by accumulating results into Cvalue
     float Cvalue = 0;

     // Thread row and column within Csub
     int threadRow = threadIdx.y;
     int threadCol = threadIdx.x;

     // loop over all the sub-matrices of A and B that are
     // required to compute Csub
     // Multiply each pair of sub-matrices together
     // and accumulate the results
     for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {
     
          // Get sub-matrices Asub of A and Bsub of B
          Matrix Asub = GetSubMatrix(A, blockRow, m);
          Matrix Bsub = GetSubMatrix(B, m, blockCol);

          // Shared memory used to store Asub and Bsub
          __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
          __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

          // Load Asub and Bsub from device memory to shared memory
          // Each thread loads one element of each sub-matrix
          As[threadRow][threadCol] = GetElement(Asub, threadRow, threadCol);
          Bs[threadRow][threadCol] = GetElement(Bsub, threadRow, threadCol);

          __syncthreads();

          // Multiply Asub and Bsub
          for (int e = 0; e < BLOCK_SIZE; ++e) {
               Cvalue += As[threadRow][e] * Bs[e][threadCol];
          }

          // Synchronize to make sure that the preceding 
          // computation is done before loading two new 
          // sub-matrices of A and B in the next iteration
          __syncthreads();
     }

     // Write Csub to device memory
     // Each thread writes one element
     SetElement(Csub, threadRow, threadCol, Cvalue);
}


__device__ float GetElement(const Matrix A, int row, int col)
{
     return A.elements[row * A.stride + col];
}

__device__ void SetElement(Matrix A, int row, int col, float value)
{
     A.elements[row * A.stride + col] = value;
}

// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
     Matrix Asub;
     Asub.width = BLOCK_SIZE;
     Asub.height = BLOCK_SIZE;
     Asub.stride = A.stride;
     Asub.elements = &A.elements[A.stride*BLOCK_SIZE * row + BLOCK_SIZE * col];

     return Asub;
}

Fortran version
(routines are in the opposite order, sorry: kernel first, then the mmul host routine, then the main program)

       ! start the module containing the matrix multiply kernel

       module mmul_mod
           use cudafor
           contains
       
           ! mmul_kernel computes A*B into C (A is NxM, B is MxL, C is NxL)
       
           attributes(global) subroutine mmul_kernel(A,B,C,N,M,L)
               real,device :: A(N,M), B(M,L), C(N,L) 
               integer, value :: N, M, L
               integer :: i, j, kb, k, threadcol, threadrow

               real, shared :: Asub(16,16), Bsub(16,16)

               ! the value of C(i,j) being computed, a temporary scalar

               real :: Cij

               ! Start execution, first get my thread indices

               threadcol = threadidx%x
               threadrow = threadidx%y

               ! This thread computes C(i,j) = sum(A(i,:) * B(:,j))

               j = (blockidx%x-1) * 16 + threadcol
               i = (blockidx%y-1) * 16 + threadrow

               Cij = 0.0

               ! Do the k loop in chunks of 16, the block size

               do kb = 1, M, 16

                   ! submatrices are declared to be in CUDA shared memory

!                   real, shared :: Asub(16,16), Bsub(16,16)

                   ! Fill the submatrices; each of 16x16 threads in the thread
                   ! block loads one element of Asub and Bsub

                   Asub(threadrow,threadcol) = A(i,kb+threadcol-1)   
                   Bsub(threadrow,threadcol) = B(kb+threadrow-1,j)   

                   ! Wait until all elements are filled

                   call syncthreads()

                    ! Multiply the two submatrices; each of the 16x16 threads
                   ! accumulates the dot product for its element of C(i,j)

                   do k = 1,16
                       Cij = Cij + Asub(threadrow,k) * Bsub(k,threadcol)
                   enddo

                   ! Synchronize to make sure all threads are done reading the
                   ! submatrices before overwriting them in the next iteration
                    ! of the kb loop

                   call syncthreads()

               enddo

               ! Each of the 16x16 threads stores its element to the global C array

               C(i,j) = Cij

           end subroutine mmul_kernel

           ! The host routine to drive the matrix multiplication

           subroutine mmul( A, B, C )

            ! assumed-shape input arrays

               real, dimension (:,:) :: A, B, C

               ! Array dimensions

               integer :: N, M, L

               ! allocatable device arrays

               real,device,allocatable,dimension(:,:) :: Adev,Bdev,Cdev

                ! dim3 variables to define the grid and block shapes

               type(dim3) :: dimGrid, dimBlock
               integer :: r              

               ! Get the array sizes

               real ctimeall,ctimekernel,flops,mflopskernel,mflopsall
               integer c1, c2, c3, c4

               ! Begin execution, first determine the sizes of the input arrays

               N = size( A, 1 )
               M = size( A, 2 )
               L = size( B, 2 )

               ! start data xfer-inclusive timer and allocate the
               ! device arrays using F90 ALLOCATE

               call system_clock( count=c1 )
               allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )

               ! Copy A and B to the device using F90 array assignments

               Adev = A(1:N,1:M)
               Bdev = B(1:M,1:L)   

               ! Create the grid and block dimensions

               dimGrid = dim3( L/16, N/16, 1 ) 
               dimBlock = dim3( 16, 16, 1)

               ! Start the data xfer-exclusive timer, launch the GPU kernel,
               ! wait for completion

               call system_clock( count=c2 )
               call mmul_kernel<<<dimGrid,dimBlock>>>(Adev,Bdev,Cdev,
     : N,M,L)
               r = cudathreadsynchronize()

               ! Stop data xfer-exclusive timer, copy the results back,
               ! stop data xfer-inclusive timer

               call system_clock( count=c3 )
               C(1:N,1:L) = Cdev
               call system_clock( count=c4 )

               ! Calculate inclusive/exclusive execution times, and report MFLOPS

               flops = float(N) * float(M) * float(L) * 2
                                                   ! QUESTION: shouldn't this be *2?
               ctimekernel = c3 - c2
               mflopskernel = flops / ctimekernel
               ctimeall = c4 - c1
               mflopsall = flops / ctimeall

               ! Print out results

               print *, 'Kernel time excluding data xfer:', ctimekernel,
     :                  ' microseconds'
               print *, 'Megaflops excluding data xfer: ', mflopskernel
               print *, 'Total time including data xfer: ', ctimeall, 
     :                  ' microseconds'
               print *, 'Megaflops including data xfer: ', mflopsall

               ! Deallocate device arrays and exit

               deallocate( Adev, Bdev, Cdev )

           end subroutine mmul
       end module mmul_mod

       ! Main program to initialize arrays, invoke mmul, check results

       program matmul
           use mmul_mod
           real,dimension(:,:),allocatable :: A,B,C,CC
           integer N, M, L
           integer idevice, istat

           ! Begin execution

           N = 960 !512
           M = 960 !1024
           L = 960 !512
           idevice = 0
           print *,' arrays sized ', N, ' by ', M, ' by ', L
           allocate(A(N,M),B(M,L),C(N,L),CC(N,L))

           ! Initialize the A and B arrays;
           ! zero out the C array to be computed on the GPU,
           ! and the CC array to be computed on the host

           do j = 1,M
               do i = 1,N
                   A(i,j) = i+j ! i*10 + j+1000
              !     print *,'A(',i,',',j,')=',A(i,j)
               enddo
           enddo
           do j = 1,L
               do i = 1,M
!                   B(i,j) = -1*(i == j)
                   B(i,j) = 10.0*i+3.0*j
               !    print *,'B(',i,',',j,')=',B(i,j)
               enddo
           enddo
           do j = 1,L
               do i = 1,N
                   CC(i,j) = 0.0
                   C(i,j) = 0.0
               enddo
           enddo

            ! Select the GPU device

           istat = cudaSetDevice(idevice)      !df

           ! Call matrix multiply subroutine to execute
           ! on the GPU to compute C

           print *,'calling mmul'
           call mmul (A, B, C )
           print *,' C(1,1) = ', C(1,1)
           print *,' C(2,2) = ', C(2,2)

           ! Perform matrix multiply on host to compute CC

           do i = 1,N
               do j = 1,L
                   do k = 1,M
                       CC(i,j) = CC(i,j) + A(i,k)*B(k,j)
                   enddo
               enddo
           enddo

           ! Check for errors

           ierr = 0
           do j = 1,L
               do i = 1,N
                   diff = abs(C(i,j) - CC(i,j))
                   denom = CC(i,j)
                   if ( denom == 0.0 ) denom = 1.0
                   error = diff / denom
                   if ( error > 2.0e-5 ) then
                       ierr = ierr + 1
                       if ( ierr <= 10 )  then
                           print *, 'C(',i,',',j,') = ',C(i,j),
     :                       ' should be ', CC(i,j), ' error=', error
                       endif
                   endif
               enddo
           enddo

           if( ierr == 0 ) then
               print *, ' No errors found'
           else
               print *, ierr, ' ERRORS FOUND!!'
           endif

       end program

I don’t know if this is an answer, but when I time CUDA Fortran, I tend to use the cudaEvent calls (echoing a CUDA C program I once saw). That is:

type(cudaEvent) :: kernel_start, kernel_end
real :: kernel_time
...
istat = cudaEventCreate(kernel_start)
istat = cudaEventCreate(kernel_end)
...
istat = cudaEventRecord(kernel_start,0)
call myKernel<<<...>>>
istat = cudaEventRecord(kernel_end,0)
istat = cudaThreadSynchronize()
istat = cudaEventElapsedTime(kernel_time,kernel_start,kernel_end)

I guess I do this in case the language timers have an issue with the asynchronous kernel call (and thus return way too fast timings). But you do include a thread synch so…hmm.

I guess you could try the cudaEvent method and see if it makes a difference in CUDA Fortran or C?
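
In your mmul routine, that would look something like this; just a sketch, with the declarations going up with the others and the event calls replacing the system_clock pair around the launch. Note that cudaEventElapsedTime returns the time in milliseconds:

       type(cudaEvent) :: kernel_start, kernel_end
       real :: kernel_time       ! elapsed kernel time, in milliseconds
       integer :: istat
       ...
       istat = cudaEventCreate(kernel_start)
       istat = cudaEventCreate(kernel_end)

       istat = cudaEventRecord(kernel_start,0)
       call mmul_kernel<<<dimGrid,dimBlock>>>(Adev,Bdev,Cdev,
     : N,M,L)
       istat = cudaEventRecord(kernel_end,0)
       istat = cudaThreadSynchronize()
       istat = cudaEventElapsedTime(kernel_time,kernel_start,
     :                              kernel_end)

       print *, 'Kernel time: ', kernel_time, ' milliseconds'

       istat = cudaEventDestroy(kernel_start)
       istat = cudaEventDestroy(kernel_end)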

Thanks, I just tried the cudaEvent method in both programs and I am still getting the same timing results. I had already checked my results with the CUDA profiler, and now I have even more evidence that my timing code isn't the problem.

Any other ideas?

I've made a little bit of progress. I switched the indexing inside the Fortran kernel (since C matrices are stored in row-major order and Fortran matrices are stored in column-major order). Now it is basically identical to the example from the Introduction to PGI CUDA Fortran, and it multiplies two 960x960 matrices in 0.035 seconds, which is only about 4x slower than the C version (0.009 seconds). (The block size is still 16x16.)
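
For reference, here is the change in isolation, using the same variable names as the two kernels (original above, edited below). I think the reason it helps is that in the original kernel, threads with consecutive threadidx%x values read elements that are a whole column apart (stride N for A, stride M for B), while after the switch they read consecutive elements along the first, column-major dimension:

       ! original kernel: threadidx%x picks the column (threadcol),
       ! so neighboring threads load with a stride of N (or M)
       Asub(threadrow,threadcol) = A(i,kb+threadcol-1)
       Bsub(threadrow,threadcol) = B(kb+threadrow-1,j)

       ! edited kernel: threadidx%x picks the row (i and kb+tx-1),
       ! so neighboring threads load contiguous elements
       Asub(tx,ty) = A(i,kb+ty-1)
       Bsub(tx,ty) = B(kb+tx-1,j)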

Does anyone have any idea why the Fortran version is still significantly slower than the C version?

Thanks

The edited Fortran code is below:

! start the module containing the matrix multiply kernel
       module mmul_mod
       use cudafor
       contains

! mmul_kernel computes A*B into C where A is NxM, B is MxL, C is then NxL

       attributes(global) subroutine mmul_kernel( A, B, C, N, M, L )
              real,device :: A(N,M), B(M,L), C(N,L)
              integer, value :: N, M, L
              integer :: i, j, kb, k, tx, ty

! submatrices are declared to be in CUDA shared memory

       real, shared :: Asub(16,16), Bsub(16,16)

! the value of C(i,j) being computed, a temporary scalar

       real :: Cij

! Start execution, first get my thread indices

       tx = threadidx%x
       ty = threadidx%y

! This thread computes C(i,j) = sum(A(i,:) * B(:,j))

       i = (blockidx%x-1) * 16 + tx
       j = (blockidx%y-1) * 16 + ty

       Cij = 0.0

! Do the k loop in chunks of 16, the block size

       do kb = 1, M, 16

! Fill the submatrices; each of 16x16 threads in the thread block
! loads one element of Asub and Bsub

              Asub(tx,ty) = A(i,kb+ty-1)
              Bsub(tx,ty) = B(kb+tx-1,j)

! Wait until all elements are filled

              call syncthreads()

! Multiply the two submatrices; each of the 16x16 threads accumulates the
! dot product for its element of C(i,j)

              do k = 1,16
                     Cij = Cij + Asub(tx,k) * Bsub(k,ty)
              enddo

! Synchronize to make sure all threads are done reading the submatrices before
! overwriting them in the next iteration of the kb loop

              call syncthreads()

       enddo

! Each of the 16x16 threads stores its element to the global C array

       C(i,j) = Cij

       end subroutine mmul_kernel

! The host routine to drive the matrix multiplication

       subroutine mmul( A, B, C )

! assumed-shape input arrays

              real, dimension (:,:) :: A, B, C

! Array dimensions

              integer :: N, M, L

! allocatable device arrays

              real,device,allocatable,dimension(:,:) :: Adev,Bdev,Cdev

! dim3 variables to define the grid and block shapes

              type(dim3) :: dimGrid, dimBlock
              integer :: r

! Get the array sizes

              real ctimeall, ctimekernel, flops, mflopskernel, mflopsall
              integer c1, c2, c3, c4

! Begin execution, first determine the sizes of the input arrays

              N = size( A, 1 )
              M = size( A, 2 )
              L = size( B, 2 )

!start data xfer-inclusive timer and allocate the device arrays using
! F90 ALLOCATE

              call system_clock( count=c1 )
              allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )

! Copy A and B to the device using F90 array assignments

              Adev = A(1:N,1:M)
              Bdev = B(1:M,1:L)

! Create the grid and block dimensions

              dimGrid = dim3( N/16, L/16, 1 )
              dimBlock = dim3( 16, 16, 1)

! Start the data xfer-exclusive timer, launch the GPU kernel, wait for completion

              call system_clock( count=c2 )
              call mmul_kernel<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev,
     :         N, M, L )
              r = cudathreadsynchronize()

! Stop data xfer-exclusive timer, copy the results back, stop data xfer-
! inclusive timer

              call system_clock( count=c3 )
              C(1:N,1:L) = Cdev
              call system_clock( count=c4 )

! Calculate inclusive/exclusive execution times, and report MFLOPS

              flops = float(N) * float(M) * float(L) * 2   
              ctimekernel = c3 - c2
              mflopskernel = flops / ctimekernel
              ctimeall = c4 - c1
              mflopsall = flops / ctimeall

! Print out results

              print *, 'Kernel time excluding data xfer:', ctimekernel,
     :                 ' microseconds'
              print *, 'Megaflops excluding data xfer: ', mflopskernel
              print *, 'Total time including data xfer: ', ctimeall, 
     :                 ' microseconds'
              print *, 'Megaflops including data xfer: ', mflopsall

! Deallocate device arrays and exit

              deallocate( Adev, Bdev, Cdev )

       end subroutine mmul
       end module mmul_mod

! Main program to initialize arrays, invoke mmul, check results

       program matmul
       use mmul_mod
       real,dimension(:,:),allocatable :: A,B,C,CC
       integer N, M, L
       integer idevice, istat

! Begin execution

       N = 960 ! 512
       M = 960 ! 1024
       L = 960 ! 512
       idevice = 0
       print *,' arrays sized ', N, ' by ', M, ' by ', L
       allocate(A(N,M),B(M,L),C(N,L),CC(N,L))

! Initialize the A and B arrays; zero out the C array to be computed
! on the GPU, and the CC array to be computed on the host

       do j = 1,M
              do i = 1,N
                     A(i,j) = i*10 + j+1000
              enddo
       enddo
       do j = 1,L
              do i = 1,M
                     B(i,j) = i-j
              enddo
       enddo
       do j = 1,L
              do i = 1,N
                     CC(i,j) = 0.0
                     C(i,j) = 0.0
              enddo
       enddo

! Select the GPU device

       istat = cudaSetDevice(idevice)

! Call matrix multiply subroutine to execute on the GPU to compute C

       print *,'calling mmul'
       call mmul (A, B, C )
       print *,' C(1,1) = ', C(1,1)
       print *,' C(2,2) = ', C(2,2)

! Perform matrix multiply on host to compute CC

       do i = 1,N
              do j = 1,L
                     do k = 1,M
                            CC(i,j) = CC(i,j) + A(i,k)*B(k,j)
                     enddo
              enddo
       enddo

! Check for errors

       ierr = 0
       do j = 1,L
              do i = 1,N
                     diff = abs(C(i,j) - CC(i,j))
                     denom = CC(i,j)
                     if ( denom == 0.0 ) denom = 1.0
                     error = diff / denom
                     if ( error > 2.0e-5 ) then
                            ierr = ierr + 1
                            if ( ierr <= 10 ) then
                                   print *, 'C(',i,',',j,') = ',C(i,j),
     :                        ' should be ', CC(i,j), ' error=', error
                            endif
                     endif
              enddo
       enddo

       if( ierr == 0 ) then
              print *, ' No errors found'
       else
              print *, ierr, ' ERRORS FOUND!!!'
       endif

       end program

Hi Jim,

I think you just need to compile with optimization enabled:

Without optimization, I get the same “.035” second result:

% pgf90 mm1.cuf -Mfixed -o mm1.out -V10.6 ; mm1.out
  arrays sized           960  by           960  by           960
 calling mmul
 Kernel time excluding data xfer:    35166.00      microseconds
 Megaflops excluding data xfer:     50317.69
 Total time including data xfer:     132872.0      microseconds
 Megaflops including data xfer:     13317.12
  C(1,1) =    7.5983488E+08
  C(2,2) =    7.6299712E+08
  No errors found

With optimization, the time is reduced to “.009” seconds:

% pgf90 -O2 mm1.cuf -Mfixed -o mm1.out -V10.6 ; mm1.out
  arrays sized           960  by           960  by           960
 calling mmul
 Kernel time excluding data xfer:    9273.000      microseconds
 Megaflops excluding data xfer:     190819.8
 Total time including data xfer:     100289.0      microseconds
 Megaflops including data xfer:     17643.73
  C(1,1) =    7.5983488E+08
  C(2,2) =    7.6299712E+08
  No errors found

Hope this helps,
Mat

That was the first thing I tried; something else must have been wrong at that point. Anyway, it works now! Thanks very much!

Jim