I’m measuring two approaches to matrix multiplication performance: a naive CUDA kernel and SGEMM from cuBLAS. For simplicity all matrices are square, of type float, and of size n x n. The compiler is nvcc V11.7.64 with GCC 8.3.1 as the host compiler, using -O3 and targeting architectures 70 and 80. The OS is CentOS 7.
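For reference, the build line is roughly the following (the file name is illustrative):

nvcc -O3 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 matmul.cu -lcublas -o matmul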
I don’t understand why cuBLAS SGEMM is the slower one. Why is a naive GPU implementation faster than a heavily optimized library? What am I missing?
Naive CUDA kernel
#include <thrust/copy.h>
#include <thrust/device_vector.h>

// One thread per output element: thread (row, col) computes C[row][col].
__global__ void matrixMultiplicationKernel(int N, float* A, float* B, float* C) {
    float alpha = 1.f, beta = 0.f;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < N && col < N) {
        float tmpSum = 0;
        // Dot product of row `row` of A with column `col` of B (row-major).
        for (int i = 0; i < N; i++)
            tmpSum += A[row * N + i] * B[i * N + col];
        C[row * N + col] = beta * C[row * N + col] + alpha * tmpSum;
    }
}
void matmat_mul_cuda_kernel(int n, int bs, float *A, float *B, float *C) {
    // Copy inputs host -> device; dvC is zero-initialized on the device.
    thrust::device_vector<float> dvA(A, A + n*n);
    thrust::device_vector<float> dvB(B, B + n*n);
    thrust::device_vector<float> dvC(n*n);
    // 2D grid of bs x bs thread blocks covering the whole n x n output.
    int nthreads = bs;
    int nblocks = (n + bs - 1) / bs;  // integer ceil(n / bs)
    dim3 blocksPerGrid(nblocks, nblocks);
    dim3 threadsPerBlock(nthreads, nthreads);
    matrixMultiplicationKernel<<<blocksPerGrid, threadsPerBlock>>>(n,
        thrust::raw_pointer_cast(&dvA[0]),
        thrust::raw_pointer_cast(&dvB[0]),
        thrust::raw_pointer_cast(&dvC[0]));
    // Copy the result back; this synchronizes with the kernel.
    thrust::copy(dvC.begin(), dvC.end(), C);
}
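With n = 4096 and bs = 16, for example, this launches a 256 x 256 grid of 16 x 16 thread blocks, i.e. one thread per element of C.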
cuBLAS SGEMM
#include <cublas_v2.h>

void matmat_mul_cublas(int n, float *A, float *B, float *C) {
    thrust::device_vector<float> dvA(A, A + n*n);
    thrust::device_vector<float> dvB(B, B + n*n);
    thrust::device_vector<float> dvC(C, C + n*n);
    int lda = n, ldb = n, ldc = n;
    const float alpha = 1.0f, beta = 0.0f;
    cublasHandle_t handle;
    cublasCreate(&handle);  // handle is created (and destroyed) on every call
    // cuBLAS is column-major, so B is passed before A: SGEMM computes the
    // column-major product (A*B)^T, which read back row-major is C = A*B.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha,
                thrust::raw_pointer_cast(&dvB[0]), lda,
                thrust::raw_pointer_cast(&dvA[0]), ldb, &beta,
                thrust::raw_pointer_cast(&dvC[0]), ldc);
    thrust::copy(dvC.begin(), dvC.end(), C);
    cublasDestroy(handle);
}
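The two paths should produce the same C up to float rounding; a minimal driver along these lines checks that (the sizes and RNG here are illustrative, not part of the benchmark):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <vector>

int main() {
    const int n = 512, bs = 16;  // illustrative problem size
    std::vector<float> A(n*n), B(n*n), C1(n*n), C2(n*n);
    for (int i = 0; i < n*n; i++) {
        A[i] = rand() / float(RAND_MAX);
        B[i] = rand() / float(RAND_MAX);
    }
    matmat_mul_cuda_kernel(n, bs, A.data(), B.data(), C1.data());
    matmat_mul_cublas(n, A.data(), B.data(), C2.data());
    float maxDiff = 0.f;
    for (int i = 0; i < n*n; i++)
        maxDiff = std::max(maxDiff, std::fabs(C1[i] - C2[i]));
    std::printf("max |diff| = %g\n", maxDiff);  // expect a small value from float rounding
    return 0;
}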
The google benchmark results on a Tesla V100-PCIE (n = 4096 and 32768, block size bs = 16, i.e. 16 x 16 threads per block):
Benchmark                                  Time       CPU   Iterations
----------------------------------------------------------------------
MatMul/CudaKernel/4096/16/real_time     40.8 ms   40.8 ms           17
MatMul/CudaKernel/32768/16/real_time    4170 ms   4169 ms            1
MatMul/CuBlas/4096/real_time            61.0 ms   61.0 ms           11
MatMul/CuBlas/32768/real_time           9908 ms   9906 ms            1
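For context, the benchmark registration looks roughly like this (a reconstruction from the output names above; the actual harness may differ). Each timed iteration covers the entire wrapper call, including the host/device copies and, for cuBLAS, handle creation and destruction:

#include <benchmark/benchmark.h>
#include <vector>

class MatMul : public benchmark::Fixture {};

BENCHMARK_DEFINE_F(MatMul, CudaKernel)(benchmark::State& state) {
    const int n = state.range(0), bs = state.range(1);
    std::vector<float> A(n*n, 1.f), B(n*n, 1.f), C(n*n);
    for (auto _ : state)  // times the whole wrapper: copies + kernel
        matmat_mul_cuda_kernel(n, bs, A.data(), B.data(), C.data());
}
BENCHMARK_REGISTER_F(MatMul, CudaKernel)->Args({4096, 16})->Args({32768, 16})->UseRealTime();

BENCHMARK_DEFINE_F(MatMul, CuBlas)(benchmark::State& state) {
    const int n = state.range(0);
    std::vector<float> A(n*n, 1.f), B(n*n, 1.f), C(n*n);
    for (auto _ : state)  // times copies + cublasCreate/Sgemm/Destroy
        matmat_mul_cublas(n, A.data(), B.data(), C.data());
}
BENCHMARK_REGISTER_F(MatMul, CuBlas)->Arg(4096)->Arg(32768)->UseRealTime();

BENCHMARK_MAIN();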
ldd info
libcublas.so.11 => /mnt/software/c/cuda/11.7.0_515.43.04/lib64/libcublas.so.11
libcudart.so.11.0 => /mnt/software/c/cuda/11.7.0_515.43.04/lib64/libcudart.so.11.0
libcublasLt.so.11 => /mnt/software/c/cuda/11.7.0_515.43.04/lib64/libcublasLt.so.11