Hi there!
I was checking on some performance numbers again and recompiled and rerun my programs for that purpose. After wondering why I got such bad results compared to the ones I had before I was able to isolate the problem to the cuSPARSE spMM routine and a change from CUDA version 10.1 to 10.2. Depending on the exact layout of the CSR matrix my spMM-runtime could go up by a factor of five.
Is that a (known) bug or is it more likely I am doing something wrong?
Here is a complete example for which I witness a roughly doubled runtime when using CUDA version 10.2.89 instead of 10.1.243 on a V100 and on a P100: (I got a worse increase of runtime for a more complex CSR matrix but this program is already long enough for a showcase.)
#include <cuda_runtime.h>
#include <cusparse.h>
#include <iostream>
#include <iomanip>
#include <cassert>
#include <limits>
int main()
{
std::cout << "CUDART_VERSION = " << CUDART_VERSION << std::endl;
cusparseStatus_t status;
cudaError_t error;
constexpr int n = 1024000; // size of csr matrix
constexpr int k = 512; // number of columns of dense matrices
constexpr int nnz = 3*n-2*1000;
// * * * * * raw data * * * * * //
float alpha = 3.1415f;
float beta = 2.1718f;
int* csr_rowptr = nullptr;
int* csr_colidx = nullptr;
float* csr_values = nullptr;
float* dense_x = nullptr;
float* dense_y = nullptr;
error = cudaMallocManaged(&csr_rowptr, (n+1) * sizeof(int )); assert(error == cudaSuccess);
error = cudaMallocManaged(&csr_colidx, nnz * sizeof(int )); assert(error == cudaSuccess);
error = cudaMallocManaged(&csr_values, nnz * sizeof(float)); assert(error == cudaSuccess);
error = cudaMallocManaged(&dense_x, n*k * sizeof(float)); assert(error == cudaSuccess);
error = cudaMallocManaged(&dense_y, n*k * sizeof(float)); assert(error == cudaSuccess);
error = cudaDeviceSynchronize(); assert(error == cudaSuccess); // not needed
// fill arrays with values
// three diagonals:
// main diagonal, +1000 offset and -1000 offset
// the two or three entries in row i are i
int off = 0;
for (int i(0); i < 1000; ++i)
{
csr_rowptr[i] = 2*i;
csr_colidx[off ] = i;
csr_values[off++] = float(i);
csr_colidx[off ] = i+1000;
csr_values[off++] = float(i);
}
assert(off == 2000);
for (int i(1000); i < n-1000; ++i)
{
csr_rowptr[i] = 2*i;
csr_colidx[off ] = i-1000;
csr_values[off++] = float(i);
csr_colidx[off ] = i;
csr_values[off++] = float(i);
csr_colidx[off ] = i+1000;
csr_values[off++] = float(i);
}
assert(off == nnz - 2000);
for (int i(n-1000); i < n; ++i)
{
csr_rowptr[i] = 2*i;
csr_colidx[off ] = i-1000;
csr_values[off++] = float(i);
csr_colidx[off ] = i;
csr_values[off++] = float(i);
}
assert(off == nnz);
csr_rowptr[n] = n;
for (int i(0); i < n*k; ++i)
{
dense_x[i] = 123.45f;
dense_y[i] = 54.321f;
}
error = cudaDeviceSynchronize(); assert(error == cudaSuccess); // not needed
// * * * * * create cuSPARSE matrices * * * * * //
cusparseSpMatDescr_t descr_csr;
cusparseDnMatDescr_t descr_dense_x, descr_dense_y;
status = cusparseCreateCsr(&descr_csr, n, n, n,
csr_rowptr, csr_colidx, csr_values,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
assert(status == CUSPARSE_STATUS_SUCCESS);
status = cusparseCreateDnMat(&descr_dense_x, n, k, n,
dense_x, CUDA_R_32F, CUSPARSE_ORDER_COL);
assert(status == CUSPARSE_STATUS_SUCCESS);
status = cusparseCreateDnMat(&descr_dense_y, n, k, n,
dense_y, CUDA_R_32F, CUSPARSE_ORDER_COL);
assert(status == CUSPARSE_STATUS_SUCCESS);
error = cudaDeviceSynchronize(); assert(error == cudaSuccess); // not needed
// * * * * * last preparations * * * * * //
cusparseHandle_t handle;
status = cusparseCreate(&handle); assert(status == CUSPARSE_STATUS_SUCCESS);
// initialize cusparse matmat routine
size_t buffer_size = 0;
void* buffer = nullptr;
status = cusparseSpMM_bufferSize(handle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_NON_TRANSPOSE,
(const void*)&alpha,
descr_csr,
descr_dense_x,
(const void*)&beta,
descr_dense_y,
CUDA_R_32F,
CUSPARSE_CSRMM_ALG1,
&buffer_size);
assert(status == CUSPARSE_STATUS_SUCCESS);
if (buffer_size > 0)
{ error = cudaMallocManaged(&buffer, buffer_size); assert(error == cudaSuccess); }
// wrap matmat routine in a lambda
auto spmm = [&]() {
return cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, descr_csr, descr_dense_x, &beta, descr_dense_y,
CUDA_R_32F, CUSPARSE_CSRMM_ALG1, buffer);
};
cudaEvent_t start, stop;
error = cudaEventCreate(&start); assert(error == cudaSuccess);
error = cudaEventCreate(&stop); assert(error == cudaSuccess);
float t_min = std::numeric_limits<float>::max();
float t_max = 0.f;
float t_total = 0.f;
// * * * * * timing * * * * * //
// warmup
status = spmm(); assert(status == CUSPARSE_STATUS_SUCCESS);
status = spmm(); assert(status == CUSPARSE_STATUS_SUCCESS);
error = cudaDeviceSynchronize(); assert(error == cudaSuccess); // not needed
for (int i(0); i < 50; ++i)
{
error = cudaEventRecord(start, NULL); assert(error == cudaSuccess);
status = spmm(); assert(status == CUSPARSE_STATUS_SUCCESS);
error = cudaEventRecord(stop, NULL); assert(error == cudaSuccess);
error = cudaEventSynchronize(stop); assert(error == cudaSuccess);
float msec;
error = cudaEventElapsedTime(&msec, start, stop); assert(error == cudaSuccess);
t_min = std::min(t_min, msec);
t_max = std::max(t_max, msec);
t_total += msec;
}
// * * * * * output * * * * * //
// informative output of setup
std::cout << "operation: csrmm: Y = a*A*X + b*Y" << std::endl
<< " A: csr matrix in R^NxN" << std::endl
<< " X,Y: dense matrices in R^NxK" << std::endl
<< " a,b: real values" << std::endl
<< "n : " << n << std::endl
<< "k : " << k << std::endl
<< "nnz: " << nnz << std::endl;
float flop = ((long)(2)*nnz*k // alpha * A*X [=: Z]
+ 2*n*k // Y += beta * Z
);
float byte = ( sizeof(float) * 3*n*k // 2 dense loads, 1 dense store
+ sizeof(int) * n // csr_colidx
+ sizeof(int) * (nnz+1) // csr_rowptr
+ sizeof(float) * n // csr_values
);
std::cout << "flop: " << flop << std::endl;
std::cout << "byte: " << byte << std::endl;
// informative output of measurement
std::cout << "msec min/avg/max : " << t_min << " / " << t_total/50.f << " / " << t_max << std::endl;
std::cout << std::fixed << std::setprecision(0);
std::cout << "GLOP/s min/avg/max : " << flop/t_min*1e-6f << " / " << flop/(t_total/50.f)*1e-6f << " / " << flop/t_max*1e-6f << std::endl;
std::cout << "GByte/s min/avg/max: " << byte/t_min*1e-6f << " / " << byte/(t_total/50.f)*1e-6f << " / " << byte/t_max*1e-6f << std::endl;
// * * * * * cleanup * * * * * //
error = cudaFree(csr_rowptr); assert(error == cudaSuccess);
error = cudaFree(csr_colidx); assert(error == cudaSuccess);
error = cudaFree(csr_values); assert(error == cudaSuccess);
error = cudaFree(dense_x ); assert(error == cudaSuccess);
error = cudaFree(dense_y ); assert(error == cudaSuccess);
return 0;
}
I cannot say anything about version 11 yet. I hope I’m doing something wrong here or it is fixed there.