I have two CUDA programs (one in C and one in Fortran) that perform the same computation, however the kernel call in the C version is 10 times faster for some reason.
In my attempt to discover why a different CUDA (fortran) program was running slowly, I compared the performance of the CUDA C Matrix-Multiply found in section 3.2.2 of the NVIDIA CUDA Programming Guide (version 3.0) to the performance of a fortran version that I believe uses the same algorithm. I timed the kernel calls (I’m not worried about memory copying times right now) when multiplying two 960x960 matrices of floats, and the kernel in the C version takes 0.009 seconds while the Fortran kernel takes 0.108 seconds (10 x longer). Clearly, I’m missing something. Can anyone explain this?
The code is below. I’m using a block size of 16x16, I’m assuming the matrices are square, and I’m assuming their width and height are divisible by the block width (16) and height (16).
Thanks very much
Jim
CODE:
C version
(main method first, then matmul function, kernel at the end)
//matrix multiply example from NVICIA_CUDA_Programming_Guide_3.0 page 24
//SHARED memory version
#include <stdio.h>
#include <cuda.h>
#include <ctime>
typedef struct {
int width;
int height;
int stride;
float* elements;
} Matrix;
#define BLOCK_SIZE 16
__global__ void MatMulKernel(Matrix, Matrix, Matrix);
void MatMul(Matrix, Matrix, Matrix);
__device__ float GetElement(const Matrix, int, int);
__device__ void SetElement(Matrix, int, int, float);
int main(int argc, char *argv[])
{
// for now, assume matrices A and B have same dimensions
int matrix_width;
int matrix_height;
if (argc < 3) {
printf("\nERROR: not enough arguments.\n");
printf("arg1: (int)matrix_width, arg2: (int)matrix_height\n");
printf("\nQuitting program.\n\n");
return 1;
} else {
// assume argv[0] and argv[1] are ints
matrix_width = atoi(argv[1]);
matrix_height = atoi(argv[2]);
printf("\nmatrix_width: %i\nmatrix_height: %i\n\n", matrix_width, matrix_height);
if (matrix_width != matrix_height || matrix_width%16 != 0 || matrix_height%16 != 0) {
printf("\nERROR: cannot multiply matrices\n");
printf("For simplicity, matrix must be square,\n");
printf("and matrix height and width must be multiples of 16 (BLOCKSIZE)\n\n");
return 1;
}
}
Matrix A,B,C;
A.width = B.width = C.width = matrix_width;
A.height = B.height = C.height = matrix_height;
size_t size = matrix_width*matrix_height*sizeof(float);
A.elements = (float *)malloc(size);
B.elements = (float *)malloc(size);
C.elements = (float *)malloc(size);
// initialize
for (int i = 0; i < A.height; ++i) {
for (int j = 0; j < A.width; ++j) {
A.elements[i*A.width + j] = (float)i-j;
B.elements[i*B.width + j] = (float)i+j;
C.elements[i*C.width + j] = (float)0;
}
}
MatMul(A,B,C);
return 0;
}
//host code, assume Matrix dimensions are multiples of BLOCK_SIZE
void MatMul(Matrix A, Matrix B, Matrix C)
{
Matrix d_A; // d means device
d_A.width = d_A.stride = A.width;
d_A.height = A.height;
size_t size = A.width*A.height*sizeof(float);
cudaMalloc((void**)&d_A.elements, size);
cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
Matrix d_B;
d_B.width = d_B.stride = B.width;
d_B.height = B.height;
size = B.width*B.height*sizeof(float);
cudaMalloc((void**)&d_B.elements, size);
cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
Matrix d_C;
d_C.width = d_C.stride = C.width;
d_C.height = C.height;
size = C.width*C.height*sizeof(float);
cudaMalloc((void**)&d_C.elements, size);
// kernel invocation
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(B.width/dimBlock.x, A.height/dimBlock.y);
long float time1 = ((long float)clock())/CLOCKS_PER_SEC;
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
cudaThreadSynchronize();
long float time2 = ((long float)clock())/CLOCKS_PER_SEC;
printf("Time (seconds): %lf\n", (long float)(time2-time1));
// Get result
cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
printf("\n Testing result... (this may take a few seconds) \n\n");
// Test Matrix
Matrix CC;
CC.width = C.width;
CC.height = C.height;
CC.stride = C.stride;
CC.elements = (float *)malloc(C.width*C.height*sizeof(float));
// Test Multiply
float temp = 0;
for(int j = 0; j < B.width; ++j) {
for (int i = 0; i < A.height; ++i) {
temp = 0;
for (int k = 0; k < A.width; ++k) {
temp+= A.elements[i*A.width + k] * B.elements[k*B.width + j];
}
CC.elements[i*CC.width + j] = temp;
}
}
for (int i = 0; i < C.height; ++i) {
for (int j = 0; j < C.width; ++j) {
// printf("(%i,%i): %f ?= %f\n", i, j, CC.elements[i*CC.width + j], C.elements[i*C.width + j]);
if (CC.elements[i*CC.width + j] != C.elements[i*C.width + j]) {
printf("FAILURE\n");
printf("(%i,%i): %f ?= %f\n", i, j, CC.elements[i*CC.width + j], C.elements[i*C.width + j]);
}
}
}
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
}
//kernel
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
int blockRow = blockIdx.y;
int blockCol = blockIdx.x;
// Each thread block computes one sub-matrix Csub of C
// This is in GLOBAL memory?
Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
// Each thread computes on element of Csub
// by accumulating results into Cvalue
float Cvalue = 0;
// Thread row and column within Csub
int threadRow = threadIdx.y;
int threadCol = threadIdx.x;
// loop over all the sub-matrices of A and B that are
// required to compute Csub
// Multiply each pair of sub-matrices together
// and accumulate the results
for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {
// Get sub-matrices Asub of A and Bsub of B
Matrix Asub = GetSubMatrix(A, blockRow, m);
Matrix Bsub = GetSubMatrix(B, m, blockCol);
// Shared memory used to store Asub and Bsub
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load Asub and Bsub from device memory to shared memory
// Each thread loads one element of each sub-matrix
As[threadRow][threadCol] = GetElement(Asub, threadRow, threadCol);
Bs[threadRow][threadCol] = GetElement(Bsub, threadRow, threadCol);
__syncthreads();
// Multiply Asub and Bsub
for (int e = 0; e < BLOCK_SIZE; ++e) {
Cvalue += As[threadRow][e] * Bs[e][threadCol];
}
// Syhcnronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write Csub to device memory
// Each thread writes one element
SetElement(Csub, threadRow, threadCol, Cvalue);
}
__device__ float GetElement(const Matrix A, int row, int col)
{
return A.elements[row * A.stride + col];
}
__device__ void SetElement(Matrix A, int row, int col, float value)
{
A.elements[row * A.stride + col] = value;
}
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
Matrix Asub;
Asub.width = BLOCK_SIZE;
Asub.height = BLOCK_SIZE;
Asub.stride = A.stride;
Asub.elements = &A.elements[A.stride*BLOCK_SIZE * row + BLOCK_SIZE * col];
return Asub;
}
Fortran version
(functions are in opposite order, sorry – kernel first, then mat-mul function, then main method)
! start the module containing the matrix multiply kernel
module mmul_mod
use cudafor
contains
! mmul_kernel computes A*B into C (A is NxM, B is MxL, C is NxL)
attributes(global) subroutine mmul_kernel(A,B,C,N,M,L)
real,device :: A(N,M), B(M,L), C(N,L)
integer, value :: N, M, L
integer :: i, j, kb, k, threadcol, threadrow
real, shared :: Asub(16,16), Bsub(16,16)
! the value of C(i,j) being computed, a temporary scalar
real :: Cij
! Start execution, first get my thread indices
threadcol = threadidx%x
threadrow = threadidx%y
! This thread computes C(i,j) = sum(A(i,:) * B(:,j))
j = (blockidx%x-1) * 16 + threadcol
i = (blockidx%y-1) * 16 + threadrow
Cij = 0.0
! Do the k loop in chunks of 16, the block size
do kb = 1, M, 16
! submatrices are declared to be in CUDA shared memory
! real, shared :: Asub(16,16), Bsub(16,16)
! Fill the submatrices; each of 16x16 threads in the thread
! block loads one element of Asub and Bsub
Asub(threadrow,threadcol) = A(i,kb+threadcol-1)
Bsub(threadrow,threadcol) = B(kb+threadrow-1,j)
! Wait until all elements are filled
call syncthreads()
! Multiply the two submatrices; ! Each of the 16x16 threads
! accumulates the dot product for its element of C(i,j)
do k = 1,16
Cij = Cij + Asub(threadrow,k) * Bsub(k,threadcol)
enddo
! Synchronize to make sure all threads are done reading the
! submatrices before overwriting them in the next iteration
! of hte kb loop
call syncthreads()
enddo
! Each of the 16x16 threads stores its element to the global C array
C(i,j) = Cij
end subroutine mmul_kernel
! The host routine to drive the matrix multiplication
subroutine mmul( A, B, C )
! assume shape input arrays
real, dimension (:,:) :: A, B, C
! Array dimensions
integer :: N, M, L
! allocatable device arrays
real,device,allocatable,dimension(:,:) :: Adev,Bdev,Cdev
! dim2 variables to define the grid and block shapes
type(dim3) :: dimGrid, dimBlock
integer :: r
! Get the array sizes
real ctimeall,ctimekernel,flops,mflopskernel,mflopsall
integer c1, c2, c3, c4
! Begin execution, first determine the sizes of the input arrays
N = size( A, 1 )
M = size( A, 2 )
L = size( B, 2 )
! start data xfer-inclusive timer and allocate the
! device arrays using F90 ALLOCATE
call system_clock( count=c1 )
allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )
! Copy A and B to the device using F90 array assignments
Adev = A(1:N,1:M)
Bdev = B(1:M,1:L)
! Create the grid and block dimensions
dimGrid = dim3( L/16, N/16, 1 )
dimBlock = dim3( 16, 16, 1)
! Start the data xfer-exclusive timer, launch the GPU kernel,
! wait for completion
call system_clock( count=c2 )
call mmul_kernel<<<dimGrid,dimBlock>>>(Adev,Bdev,Cdev,
: N,M,L)
r = cudathreadsynchronize()
! Stop data xfer-exclusive timer, copy the results back,
! stop data xfer-inclusive timer
call system_clock( count=c3 )
C(1:N,1:L) = Cdev
call system_clock( count=c4 )
! Calculate inclusive/exclusive execution times, and report MFLOPS
flops = float(N) * float(M) * float(L) * 2
! QUESTION: souldn't this be *2
ctimekernel = c3 - c2
mflopskernel = flops / ctimekernel
ctimeall = c4 - c1
mflopsall = flops / ctimeall
! Print out results
print *, 'Kernel time excluding data xfer:', ctimekernel,
: ' microseconds'
print *, 'Megaflops excluding data xfer: ', mflopskernel
print *, 'Total time including data xfer: ', ctimeall,
: ' microseconds'
print *, 'Megaflops including data xfer: ', mflopsall
! Deallocate device arrays and exit
deallocate( Adev, Bdev, Cdev )
end subroutine mmul
end module mmul_mod
! Main program to initialize arrays, invoke mmul, check results
program matmul
use mmul_mod
real,dimension(:,:),allocatable :: A,B,C,CC
integer N, M, L
integer idevice, istat
! Begin execution
N = 960 !512
M = 960 !1024
L = 960 !512
idevice = 0
print *,' arrays sized ', N, ' by ', M, ' by ', L
allocate(A(N,M),B(M,L),C(N,L),CC(N,L))
! Initialize the A and B arrays;
! zero out the C array to be computed on the GPU,
! and the CC array to be computed on the host
do j = 1,M
do i = 1,N
A(i,j) = i+j ! i*10 + j+1000
! print *,'A(',i,',',j,')=',A(i,j)
enddo
enddo
do j = 1,L
do i = 1,M
! B(i,j) = -1*(i == j)
B(i,j) = 10.0*i+3.0*j
! print *,'B(',i,',',j,')=',B(i,j)
enddo
enddo
do j = 1,L
do i = 1,N
CC(i,j) = 0.0
C(i,j) = 0.0
enddo
enddo
! Initialize CPU device
istat = cudaSetDevice(idevice) !df
! Call matrix multiply subroutine to execute
! on the GPU to compute C
print *,'calling mmul'
call mmul (A, B, C )
print *,' C(1,1) = ', C(1,1)
print *,' C(2,2) = ', C(2,2)
! Perform matrix multiply on host to compute CC
do i = 1,N
do j = 1,L
do k = 1,M
CC(i,j) = CC(i,j) + A(i,k)*B(k,j)
enddo
enddo
enddo
! Check for errors
ierr = 0
do j = 1,L
do i = 1,N
diff = abs(C(i,j) - CC(i,j))
denom = CC(i,j)
if ( denom == 0.0 ) denom = 1.0
error = diff / denom
if ( error > 2.0e-5 ) then
ierr = ierr + 1
if ( ierr <= 10 ) then
print *, 'C(',i,',',j,') = ',C(i,j),
: ' should be ', CC(i,j), ' error=', error
endif
endif
enddo
enddo
if( ierr == 0 ) then
print *, ' No errors found'
else
print *, ierr, ' ERRORS FOUND!!'
endif
end program