I have had problems with poor cuBLAS GEMM performance for a long time. I was hoping that a newer CUDA toolkit would solve my issues, but now that I have upgraded to v9.2 I can see that the performance problems are still very much there, so I am consulting this forum again to see if anyone has a solution.
My problem is that cuBLAS functions such as dgemm and sgemm are so slow on my system that they are practically unusable: poorly written custom kernels often outperform them. Here is an example in which I compute a large batch of small matrix-matrix products, once with cublasSgemmStridedBatched and once with a naïve, unoptimized, hand-written kernel:
#include "mex.h"
#include "cublas_v2.h"
#include <time.h>
// Kernel code for naive matrix multiplication
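// Each thread handles one (row, column, page) output element; the grid-stride loops
// let a fixed launch configuration cover problem sizes larger than the grid.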
__global__ void MatrixMultiplication(const float* A, const float* B, float* C, const int m, const int n, const int q, const int k) {
    float e;
    for (int page = blockIdx.z * blockDim.z + threadIdx.z; page < k; page += blockDim.z * gridDim.z) {
        // Loop rows of A
        for (int rows = blockIdx.x * blockDim.x + threadIdx.x; rows < m; rows += blockDim.x * gridDim.x) {
            // Loop columns of B
            for (int cols = blockIdx.y * blockDim.y + threadIdx.y; cols < q; cols += blockDim.y * gridDim.y) {
                // Initialize temporary variable e to hold values of current row/col run
                e = 0;
                // Loop columns of A/rows of B
                for (int steps = 0; steps < n; steps++) {
                    e = e + A[rows + steps*m + m*n*page] * B[steps + cols*n + n*q*page];
                }
                C[rows + cols*m + m*q*page] = e;
            }
        }
    }
}
/* The MEX gateway function */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
    // Get host input matrices A and B (single precision, so use mxGetData rather than mxGetPr).
    const float *A, *B;
    A = (const float*)mxGetData(prhs[0]);
    B = (const float*)mxGetData(prhs[1]);
    // Get size of inputs.
    size_t m, n, q, k;
    clock_t t1, t2;
    const mwSize *Adims, *Bdims;
    Adims = mxGetDimensions(prhs[0]);
    Bdims = mxGetDimensions(prhs[1]);
    m = Adims[0];
    n = Adims[1];
    k = Adims[2];
    q = Bdims[1];
    // Allocate device memory for A, B and C.
    float *dA, *dB, *dCcublas, *dCmanual;
    cudaMalloc(&dA, sizeof(float) * m * n * k);
    cudaMalloc(&dB, sizeof(float) * n * q * k);
    cudaMalloc(&dCcublas, sizeof(float) * m * q * k);
    cudaMalloc(&dCmanual, sizeof(float) * m * q * k);
    // Copy A & B to device.
    cudaMemcpy(dA, A, m * n * k * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, B, n * q * k * sizeof(float), cudaMemcpyHostToDevice);
    // Create the cublas handle.
    cublasHandle_t h;
    cublasCreate(&h);
    // Calculate C = A*B using cublas.
    const float One = 1.0;
    const float Zero = 0.0;
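    // For each of the k batch entries this computes C_i = 1.0*A_i*B_i + 0.0*C_i, where A_i is
    // m x n (leading dimension m, stride m*n), B_i is n x q (ld n, stride n*q) and
    // C_i is m x q (ld m, stride m*q).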
    t1 = clock();
    cublasSgemmStridedBatched(h, CUBLAS_OP_N, CUBLAS_OP_N, m, q, n, &One, dA, m, m * n, dB, n, n * q, &Zero, dCcublas, m, m * q, k);
    cudaDeviceSynchronize();
    t2 = clock();
    mexPrintf("cublasSgemmStridedBatched took: %.3f ms\n", (double)(t2 - t1) / CLOCKS_PER_SEC * 1000);
    // Calculate C = A*B using manual kernel.
    int xnTPB = 32; int ynTPB = 2; int znTPB = 16;
    int xnumBlocks = (m + xnTPB - 1) / xnTPB;
    int ynumBlocks = (q + ynTPB - 1) / ynTPB;
    int znumBlocks = (k + znTPB - 1) / znTPB;
    dim3 Nblocks(xnumBlocks, ynumBlocks, znumBlocks);
    dim3 nTPB(xnTPB, ynTPB, znTPB);
    t1 = clock();
    MatrixMultiplication<<<Nblocks, nTPB>>>(dA, dB, dCmanual, m, n, q, k);
    cudaDeviceSynchronize();
    t2 = clock();
    mexPrintf("Manual matmult kernel took: %.3f ms\n", (double)(t2 - t1) / CLOCKS_PER_SEC * 1000);
    // Create the MATLAB output arrays and copy the device results back to the host.
    float *HostOutputCcublas, *HostOutputCmanual;
    mwSize dims[3] = { m, q, k };
    plhs[0] = mxCreateNumericArray(3, dims, mxSINGLE_CLASS, mxREAL);
    plhs[1] = mxCreateNumericArray(3, dims, mxSINGLE_CLASS, mxREAL);
    HostOutputCcublas = (float*)mxGetData(plhs[0]);
    HostOutputCmanual = (float*)mxGetData(plhs[1]);
    cudaDeviceSynchronize();
    cudaMemcpy(HostOutputCcublas, dCcublas, m * q * k * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(HostOutputCmanual, dCmanual, m * q * k * sizeof(float), cudaMemcpyDeviceToHost);
    // Free memory & destroy cublas handle.
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dCcublas);
    cudaFree(dCmanual);
    cublasDestroy(h);
}
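One caveat about the listing: for brevity none of the CUDA or cuBLAS calls check their return status. If anyone wants to reproduce this, a minimal check along these lines could be wrapped around each call (just a sketch; the macro names are mine):
#include "mex.h"
#include "cublas_v2.h"
// Minimal status checks for CUDA runtime and cuBLAS calls, for use inside the MEX file above.
#define CUDA_CHECK(call) do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) mexPrintf("CUDA error %s at line %d\n", cudaGetErrorString(err), __LINE__); \
} while (0)
#define CUBLAS_CHECK(call) do { \
    cublasStatus_t st = (call); \
    if (st != CUBLAS_STATUS_SUCCESS) mexPrintf("cuBLAS error %d at line %d\n", (int)st, __LINE__); \
} while (0)
// Usage: CUDA_CHECK(cudaMalloc(&dA, sizeof(float) * m * n * k));
//        CUBLAS_CHECK(cublasSgemmStridedBatched(...));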
I'm compiling and running the function from Matlab with the following code:
mexcuda WhatsWrongWithMyCublas.cu -lcublas
m = 150;
n = 256;
q = 2;
k = 10000;
A = rand(m,n,k,'single');
B = rand(n,q,k,'single');
[Cblas,Cmanual] = WhatsWrongWithMyCublas(A,B);
The output matrices Cblas and Cmanual are identical, and the printed output is:
cublasSgemmStridedBatched took: 7.000 ms
Manual matmult kernel took: 5.000 ms
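The timings above come from clock() on the host. In case that timer is suspected, this is the kind of CUDA-event timing I can wrap around the same call instead (a sketch that reuses the handle, scalars and device buffers from the listing above):
// Time the batched GEMM with CUDA events instead of the host clock.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
cublasSgemmStridedBatched(h, CUBLAS_OP_N, CUBLAS_OP_N, m, q, n, &One, dA, m, m * n, dB, n, n * q, &Zero, dCcublas, m, m * q, k);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float elapsedMs = 0.0f;
cudaEventElapsedTime(&elapsedMs, start, stop);
mexPrintf("cublasSgemmStridedBatched (events): %.3f ms\n", elapsedMs);
cudaEventDestroy(start);
cudaEventDestroy(stop);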
Profiling the kernel launches with Nsight confirms what clock() reports:
Function Name: maxwell_sgemm_128_64_nn
  Grid Dimensions: {2,1,10000}
  Block Dimensions: {128,1,1}
  Duration: 7200.987 µs
  Occupancy: 25 %
Function Name: MatrixMultiplication
  Grid Dimensions: {5,1,625}
  Block Dimensions: {32,2,16}
  Duration: 5200.831 µs
  Occupancy: 100 %
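As a back-of-the-envelope check, the whole batch amounts to about 2*m*n*q*k = 2*150*256*2*10000 ≈ 1.5 GFLOP, so 5-7 ms corresponds to only a few hundred GFLOP/s for either implementation, far below what a GTX 1080 Ti can do in single precision.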
So how can this possibly be? How can a highly optimized batched GEMM library be slower than a naïve looped matrix multiplication at exactly the task it is designed for, namely thousands of equally sized small matrix multiplications?
One thing I noticed is that cuBLAS dispatches the “maxwell_sgemm_128_64_nn” kernel, even though I'm using a Pascal-architecture card (GTX 1080 Ti). Is it expected to call a kernel named “maxwell” on this hardware, or is there a corresponding “pascal_sgemm_128_64_nn” kernel that my system should be using instead?
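In case a toolkit or driver mismatch is to blame, this is a small diagnostic I could add to the MEX file to print what cuBLAS actually sees (a sketch using the standard version-query calls; the function name is mine):
#include "mex.h"
#include "cublas_v2.h"
// Print the CUDA runtime/driver versions, the cuBLAS version and the compute
// capability of the active device.
void PrintCudaEnvironment(cublasHandle_t h) {
    int runtimeVer = 0, driverVer = 0, cublasVer = 0, dev = 0;
    cudaRuntimeGetVersion(&runtimeVer);
    cudaDriverGetVersion(&driverVer);
    cublasGetVersion(h, &cublasVer);
    cudaGetDevice(&dev);
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, dev);
    mexPrintf("CUDA runtime %d, driver %d, cuBLAS %d\n", runtimeVer, driverVer, cublasVer);
    mexPrintf("Device: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
}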
This exact problem persists across multiple CUDA toolkit versions, multiple GPU driver versions and multiple different GPUs I have tested (all Pascal cards).
Is there anyone else who has this problem or knows what the reason for it could be? Obviously there is something wrong here…