I have sparse 3-diagonal NxN matrix A
built by some rule and want to solve the system Ax=b
. For this I’m using cusolverSpScsrlsvqr()
from cuSolverSp module. Is it ok to have device version many times slower than cusolverSpScsrlsvqrHost()
for large N? E.g. for N=2^14 the times are 174.1 ms for device and 3.5 ms for host. I’m on RTX 2060.
Code:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cusolverSp.h>
#include <cusparse_v2.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <chrono>
using namespace std;
void checkCudaCusolverStatus(cusolverStatus_t status, char const* operation) {
char const *str = "UNKNOWN STATUS";
switch (status) {
case CUSOLVER_STATUS_SUCCESS:
str = "CUSOLVER_STATUS_SUCCESS";
break;
case CUSOLVER_STATUS_NOT_INITIALIZED:
str = "CUSOLVER_STATUS_NOT_INITIALIZED";
break;
case CUSOLVER_STATUS_ALLOC_FAILED:
str = "CUSOLVER_STATUS_ALLOC_FAILED";
break;
case CUSOLVER_STATUS_INVALID_VALUE:
str = "CUSOLVER_STATUS_INVALID_VALUE";
break;
case CUSOLVER_STATUS_ARCH_MISMATCH:
str = "CUSOLVER_STATUS_ARCH_MISMATCH";
break;
case CUSOLVER_STATUS_MAPPING_ERROR:
str = "CUSOLVER_STATUS_MAPPING_ERROR";
break;
case CUSOLVER_STATUS_EXECUTION_FAILED:
str = "CUSOLVER_STATUS_EXECUTION_FAILED";
break;
case CUSOLVER_STATUS_INTERNAL_ERROR:
str = "CUSOLVER_STATUS_INTERNAL_ERROR";
break;
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
str = "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
break;
case CUSOLVER_STATUS_ZERO_PIVOT:
str = "CUSOLVER_STATUS_ZERO_PIVOT";
break;
}
cout << left << setw(30) << operation << " " << str << endl;
}
__global__ void fillAB(float *aValues, int *aRowPtrs, int *aColIdxs, float *b, int const n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= n) return;
if (i == 0) {
float xn = 10 * (n + 1);
aValues[n * 3] = xn;
aRowPtrs[0] = 0;
aRowPtrs[n + 1] = n * 3 + 1;
aColIdxs[n * 3] = n;
b[n] = xn * 2;
}
float xi = 10 * (i + 1);
aValues[i * 3 + 0] = xi;
aValues[i * 3 + 1] = xi + 5;
aValues[i * 3 + 2] = xi - 5;
aColIdxs[i * 3 + 0] = i;
aColIdxs[i * 3 + 1] = i + 1;
aColIdxs[i * 3 + 2] = i;
aRowPtrs[i + 1] = 2 + (i * 3);
b[i] = xi * 2;
}
int main() {
int const n = (int)pow(2, 14); // <<<<<<<<<<<<<<<<<<<<<<<<<<<<< N HERE
int const valCount = n * 3 - 2;
float *const aValues = new float[valCount];
int *const aRowPtrs = new int[n + 1];
int *const aColIdxs = new int[valCount];
float *const b = new float[n];
float *const x = new float[n];
float *dev_aValues;
int *dev_aRowPtrs;
int *dev_aColIdxs;
float *dev_b;
float *dev_x;
int aValuesSize = sizeof(float) * valCount;
int aRowPtrsSize = sizeof(int) * (n + 1);
int aColIdxsSize = sizeof(int) * valCount;
int bSize = sizeof(float) * n;
int xSize = sizeof(float) * n;
cudaMalloc((void**)&dev_aValues, aValuesSize);
cudaMalloc((void**)&dev_aRowPtrs, aRowPtrsSize);
cudaMalloc((void**)&dev_aColIdxs, aColIdxsSize);
cudaMalloc((void**)&dev_b, bSize);
cudaMalloc((void**)&dev_x, xSize);
fillAB<<<1024, (int)ceil(n / 1024.f)>>>(dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, n - 1);
cudaMemcpy(aValues, dev_aValues, aValuesSize, cudaMemcpyDeviceToHost);
cudaMemcpy(aRowPtrs, dev_aRowPtrs, aRowPtrsSize, cudaMemcpyDeviceToHost);
cudaMemcpy(aColIdxs, dev_aColIdxs, aColIdxsSize, cudaMemcpyDeviceToHost);
cudaMemcpy(b, dev_b, bSize, cudaMemcpyDeviceToHost);
cusolverSpHandle_t handle;
checkCudaCusolverStatus(cusolverSpCreate(&handle), "cusolverSpCreate");
cusparseMatDescr_t aDescr;
cusparseCreateMatDescr(&aDescr);
cusparseSetMatIndexBase(aDescr, CUSPARSE_INDEX_BASE_ZERO);
cusparseSetMatType(aDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
int singularity;
cusolverStatus_t status;
cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
cudaDeviceSynchronize(); // <-- edit by njuffa
auto t0 = chrono::high_resolution_clock::now();
for (int i = 0; i < 10; ++i)
////////////////////// CUSOLVER HERE //////////////////////
status = cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
//status = cusolverSpScsrlsvqrHost(handle, n, valCount, aDescr, aValues, aRowPtrs, aColIdxs, b, 0.1f, 0, x, &singularity);
///////////////////////////////////////////////////////////
cudaDeviceSynchronize();
auto t1 = chrono::high_resolution_clock::now();
checkCudaCusolverStatus(status, "cusolverSpScsrlsvqr");
checkCudaCusolverStatus(cusolverSpDestroy(handle), "cusolverSpDestroy");
cout << "System solved: " << setw(20) << fixed << right << setprecision(3) << (t1 - t0).count() / 10.0 / 1000000 << " ms" << endl;
cudaMemcpy(x, dev_x, xSize, cudaMemcpyDeviceToHost);
/*for (int i = 0; i < n; ++i) {
cout << " " << x[i];
}*/
cudaFree(dev_aValues);
cudaFree(dev_aRowPtrs);
cudaFree(dev_aColIdxs);
cudaFree(dev_b);
cudaFree(dev_x);
delete[] aValues;
delete[] aRowPtrs;
delete[] aColIdxs;
delete[] b;
delete[] x;
cudaDeviceReset();
return 0;
}