Device version of cusolverSpScsrlsvqr is much slower than host version

I have a sparse tridiagonal NxN matrix A built by some rule and want to solve the system Ax=b. For this I’m using cusolverSpScsrlsvqr() from the cuSolverSp module. Is it normal for the device version to be many times slower than cusolverSpScsrlsvqrHost() for large N? E.g. for N=2^14 the times are 174.1 ms for the device version and 3.5 ms for the host version. I’m on an RTX 2060.

Code:

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cusolverSp.h>
#include <cusparse_v2.h>

#include <stdio.h>
#include <cmath>    // pow, ceil
#include <iostream>
#include <iomanip>
#include <chrono>


using namespace std;

void checkCudaCusolverStatus(cusolverStatus_t status, char const* operation) {
    char const *str = "UNKNOWN STATUS";
    switch (status) {
    case CUSOLVER_STATUS_SUCCESS:
        str = "CUSOLVER_STATUS_SUCCESS";
        break;
    case CUSOLVER_STATUS_NOT_INITIALIZED:
        str = "CUSOLVER_STATUS_NOT_INITIALIZED";
        break;
    case CUSOLVER_STATUS_ALLOC_FAILED:
        str = "CUSOLVER_STATUS_ALLOC_FAILED";
        break;
    case CUSOLVER_STATUS_INVALID_VALUE:
        str = "CUSOLVER_STATUS_INVALID_VALUE";
        break;
    case CUSOLVER_STATUS_ARCH_MISMATCH:
        str = "CUSOLVER_STATUS_ARCH_MISMATCH";
        break;
    case CUSOLVER_STATUS_MAPPING_ERROR:
        str = "CUSOLVER_STATUS_MAPPING_ERROR";
        break;
    case CUSOLVER_STATUS_EXECUTION_FAILED:
        str = "CUSOLVER_STATUS_EXECUTION_FAILED";
        break;
    case CUSOLVER_STATUS_INTERNAL_ERROR:
        str = "CUSOLVER_STATUS_INTERNAL_ERROR";
        break;
    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
        str = "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
        break;
    case CUSOLVER_STATUS_ZERO_PIVOT:
        str = "CUSOLVER_STATUS_ZERO_PIVOT";
        break;
    }
    cout << left << setw(30) << operation << " " << str << endl;
}

// Builds the tridiagonal matrix A in CSR form and the right-hand side b.
// Each thread i writes the diagonal and superdiagonal of row i plus the
// subdiagonal entry of row i + 1. Thread 0 additionally writes the boundary
// row pointers and the last row's diagonal entry and RHS value.
// Note: the kernel is launched with n = N - 1, where N is the matrix dimension.
__global__ void fillAB(float *aValues, int *aRowPtrs, int *aColIdxs, float *b, int const n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= n) return;
    if (i == 0) {
        // Last row's diagonal entry, first and last row pointers, last RHS entry.
        float xn = 10 * (n + 1);
        aValues[n * 3] = xn;
        aRowPtrs[0] = 0;
        aRowPtrs[n + 1] = n * 3 + 1;
        aColIdxs[n * 3] = n;
        b[n] = xn * 2;
    }
    float xi = 10 * (i + 1);
    aValues[i * 3 + 0] = xi;        // diagonal of row i
    aValues[i * 3 + 1] = xi + 5;    // superdiagonal of row i
    aValues[i * 3 + 2] = xi - 5;    // subdiagonal of row i + 1
    aColIdxs[i * 3 + 0] = i;
    aColIdxs[i * 3 + 1] = i + 1;
    aColIdxs[i * 3 + 2] = i;
    aRowPtrs[i + 1] = 2 + (i * 3);
    b[i] = xi * 2;
}

int main() {
    int const n = (int)pow(2, 14);  // <<<<<<<<<<<<<<<<<<<<<<<<<<<<< N HERE
    int const valCount = n * 3 - 2;
    float *const aValues = new float[valCount];
    int *const aRowPtrs = new int[n + 1];
    int *const aColIdxs = new int[valCount];
    float *const b = new float[n];
    float *const x = new float[n];

    float *dev_aValues;
    int *dev_aRowPtrs;
    int *dev_aColIdxs;
    float *dev_b;
    float *dev_x;
    int aValuesSize = sizeof(float) * valCount;
    int aRowPtrsSize = sizeof(int) * (n + 1);
    int aColIdxsSize = sizeof(int) * valCount;
    int bSize = sizeof(float) * n;
    int xSize = sizeof(float) * n;
    cudaMalloc((void**)&dev_aValues, aValuesSize);
    cudaMalloc((void**)&dev_aRowPtrs, aRowPtrsSize);
    cudaMalloc((void**)&dev_aColIdxs, aColIdxsSize);
    cudaMalloc((void**)&dev_b, bSize);
    cudaMalloc((void**)&dev_x, xSize);
    // Launch configuration: ceil(n / 1024) blocks of 1024 threads (grid size first, block size second).
    fillAB<<<(int)ceil(n / 1024.f), 1024>>>(dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, n - 1);
    cudaMemcpy(aValues, dev_aValues, aValuesSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(aRowPtrs, dev_aRowPtrs, aRowPtrsSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(aColIdxs, dev_aColIdxs, aColIdxsSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(b, dev_b, bSize, cudaMemcpyDeviceToHost);

    cusolverSpHandle_t handle;
    checkCudaCusolverStatus(cusolverSpCreate(&handle), "cusolverSpCreate");
    cusparseMatDescr_t aDescr;
    cusparseCreateMatDescr(&aDescr);
    cusparseSetMatIndexBase(aDescr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatType(aDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
    int singularity;
    cusolverStatus_t status;
    // Warm-up call (not timed), then synchronize before starting the clock.
    cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
    cudaDeviceSynchronize();  // <-- edit by njuffa
    auto t0 = chrono::high_resolution_clock::now();
    for (int i = 0; i < 10; ++i) {
        ////////////////////// CUSOLVER HERE //////////////////////
        status = cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
        //status = cusolverSpScsrlsvqrHost(handle, n, valCount, aDescr, aValues, aRowPtrs, aColIdxs, b, 0.1f, 0, x, &singularity);
        ///////////////////////////////////////////////////////////
    }
    cudaDeviceSynchronize();
    auto t1 = chrono::high_resolution_clock::now();
    checkCudaCusolverStatus(status, "cusolverSpScsrlsvqr");
    checkCudaCusolverStatus(cusolverSpDestroy(handle), "cusolverSpDestroy");
    cout << "System solved: " << setw(20) << fixed << right << setprecision(3) << (t1 - t0).count() / 10.0 / 1000000 << " ms" << endl;

    cudaMemcpy(x, dev_x, xSize, cudaMemcpyDeviceToHost);
    /*for (int i = 0; i < n; ++i) {
        cout << " " << x[i];
    }*/
    cudaFree(dev_aValues);
    cudaFree(dev_aRowPtrs);
    cudaFree(dev_aColIdxs);
    cudaFree(dev_b);
    cudaFree(dev_x);
    delete[] aValues;
    delete[] aRowPtrs;
    delete[] aColIdxs;
    delete[] b;
    delete[] x;
    cudaDeviceReset();
    return 0;
}

The timing methodology used is flawed. Because of the asynchronous operation of kernels, you want:

cudaDeviceSynchronize();  // wait for all previously issued work to finish
auto t0 = chrono::high_resolution_clock::now(); // start of timed portion

I have never used cuSparse so cannot comment on performance expectations.

So I have to call cudaDeviceSynchronize() every time before chrono::high_resolution_clock::now()? Thanks, I’ll keep that in mind, but in this case it didn’t affect the measurements.
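For what it’s worth, a common alternative to host-side chrono timing is to bracket the GPU work with CUDA events; this is not something discussed in the thread, just a sketch of the idea, reusing the same handle, device arrays, and solver call as the code above:

// Sketch: timing the solver calls with CUDA events instead of chrono.
// cudaEventElapsedTime reports the milliseconds between the two recorded events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);                    // mark the start on the default stream
for (int i = 0; i < 10; ++i)
    cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs,
                        dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
cudaEventRecord(stop);                     // mark the end on the default stream
cudaEventSynchronize(stop);                // wait until the stop event has completed

float elapsedMs = 0.0f;
cudaEventElapsedTime(&elapsedMs, start, stop);
cout << "Average: " << elapsedMs / 10.0f << " ms" << endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);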

The question was answered by @Robert_Crovella here:

My guess is that the issue here is that it is a tridiagonal matrix. I suspect that may eliminate certain parallelism aspects that would benefit the GPU cuSolver routine. You’ll likely see more satisfactory performance by using a routine that knows your matrix is tridiagonal, and that would be cusparseSgtsv2. According to my testing, it’s about 40x faster than your host-code case.
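For reference, here is a rough sketch of what the cusparseSgtsv2 path could look like for this system. It is not from the thread: dev_dl, dev_d, dev_du, and dev_rhs are assumed device arrays of length n holding the sub-, main, and super-diagonals of A (with dev_dl[0] = dev_du[n-1] = 0) and the right-hand side b, which cusparseSgtsv2 overwrites with the solution; filling those arrays (e.g. from a kernel like the one above) is left out.

// Hypothetical sketch: solving the same tridiagonal system with cusparseSgtsv2.
cusparseHandle_t spHandle;
cusparseCreate(&spHandle);

// Query the size of the temporary workspace gtsv2 needs, then allocate it.
size_t bufferSize = 0;
cusparseSgtsv2_bufferSizeExt(spHandle, n, 1, dev_dl, dev_d, dev_du, dev_rhs, n, &bufferSize);
void *dev_buffer = nullptr;
cudaMalloc(&dev_buffer, bufferSize);

// m = n equations, 1 right-hand side, leading dimension n.
// On return, dev_rhs holds the solution x.
cusparseSgtsv2(spHandle, n, 1, dev_dl, dev_d, dev_du, dev_rhs, n, dev_buffer);
cudaDeviceSynchronize();

cudaFree(dev_buffer);
cusparseDestroy(spHandle);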