I used the cusparseDgtsv_nopivot(…) call from the cuSPARSE library with different row sizes, and it took (0.268, 0.347, 0.433) ms for (36, 4194, 41994) rows respectively on a K40 GPU. In 0.268 ms, a CPU can solve a tridiagonal system of size ~10000! The matrices I am dealing with are small, so I would like to know whether there is some fixed overhead (offset cost) in the call. The following is the code I used to collect the timing measurements.
#include <assert.h>
#include <sys/time.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>
#include <cusparse_v2.h>
using namespace std;
// Terminates the program with a diagnostic if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

// Terminates the program with a diagnostic if a cuSPARSE call failed.
static void checkCusparse(cusparseStatus_t status, const char* what) {
    if (status != CUSPARSE_STATUS_SUCCESS) {
        std::fprintf(stderr, "cuSPARSE error in %s: status %d\n", what, (int)status);
        std::exit(EXIT_FAILURE);
    }
}

// Runs one tridiagonal solve and returns its elapsed time in milliseconds.
//
// The solver overwrites its right-hand-side buffer with the solution, so d_x is
// first reloaded from the pristine copy in d_rhs. That reload is issued before
// the start event and is therefore not part of the measured interval.
// d_tridiag holds the lower, main and upper diagonals back to back, each of
// length `rows`.
static float timeSolve(cusparseHandle_t handle, int rows,
                       const double* d_tridiag, const double* d_rhs, double* d_x,
                       cudaEvent_t start, cudaEvent_t stop) {
    const size_t n = (size_t)rows;
    // d_x is a managed allocation, so let the runtime infer the copy direction.
    checkCuda(cudaMemcpy(d_x, d_rhs, n * sizeof(double), cudaMemcpyDefault),
              "cudaMemcpy (reload rhs)");

    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    checkCusparse(cusparseDgtsv_nopivot(handle, rows, 1,
                                        d_tridiag, d_tridiag + n, d_tridiag + 2 * n,
                                        d_x, rows),
                  "cusparseDgtsv_nopivot");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    // The elapsed time may only be queried once the stop event has completed.
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float ms = 0.0f;
    checkCuda(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime");
    return ms;
}

// Benchmark driver: builds a random tridiagonal system of the size given in
// argv[1], solves it on the GPU with cusparseDgtsv_nopivot and prints
// "<rows> <milliseconds>" on stdout.
//
// The solve is timed twice. The first call may include one-time setup cost
// inside the library/driver, so it is reported separately on stderr; the value
// on stdout is the second (warm) call. Comparing the two shows how much of the
// measured time is a fixed first-call offset rather than solver work.
//
// Returns EXIT_FAILURE on a missing/invalid argument; any CUDA or cuSPARSE
// failure terminates the process with a diagnostic.
int main(int argc, char* argv[]){
    if (argc < 2) {
        std::fprintf(stderr, "Usage: %s <rows>\n", argc > 0 ? argv[0] : "gtsv_bench");
        return EXIT_FAILURE;
    }
    const int lin_rows = std::atoi(argv[1]);
    // The cuSPARSE gtsv routines document m >= 3 for the system size.
    if (lin_rows < 3) {
        std::fprintf(stderr, "rows must be an integer >= 3 (got \"%s\")\n", argv[1]);
        return EXIT_FAILURE;
    }

    // Do all size arithmetic in size_t so 3*rows*sizeof(double) cannot overflow int.
    const size_t n = (size_t)lin_rows;
    const size_t vecBytes = n * sizeof(double);
    const size_t triBytes = 3 * vecBytes;

    double* h_linear_tridiag = new double[3 * n]();
    double* h_linear_rhs = new double[n]();
    double* h_linear_x = new double[n]();
    for (size_t i = 0; i < 3 * n; i++)
        h_linear_tridiag[i] = std::rand() % 1000 + 2;
    // Layout is [lower | main | upper]; cuSPARSE requires the first element of
    // the lower diagonal and the last element of the upper diagonal to be zero.
    h_linear_tridiag[0] = 0.0;
    h_linear_tridiag[3 * n - 1] = 0.0;
    for (size_t i = 0; i < n; i++)
        h_linear_rhs[i] = std::rand() % 1000 + 2;
    // NOTE(review): the random matrix is not made diagonally dominant, so the
    // no-pivot solve may be numerically inaccurate. That looks acceptable for a
    // timing run; confirm before relying on the values in h_linear_x.

    double* d_lin_tridiag = NULL;
    double* d_lin_rhs = NULL;
    double* d_lin_x = NULL;
    checkCuda(cudaMalloc((void**)&d_lin_tridiag, triBytes), "cudaMalloc(d_lin_tridiag)");
    checkCuda(cudaMalloc((void**)&d_lin_rhs, vecBytes), "cudaMalloc(d_lin_rhs)");
    checkCuda(cudaMallocManaged((void**)&d_lin_x, vecBytes), "cudaMallocManaged(d_lin_x)");

    // Transferring linear solver data
    checkCuda(cudaMemcpy(d_lin_tridiag, h_linear_tridiag, triBytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(d_lin_tridiag)");
    checkCuda(cudaMemcpy(d_lin_rhs, h_linear_rhs, vecBytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(d_lin_rhs)");

    // Initializing variables. The gtsv call takes no matrix descriptor, so none
    // is created here.
    cusparseHandle_t cusparseH;
    checkCusparse(cusparseCreate(&cusparseH), "cusparseCreate");
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // First call: may carry one-time setup cost. Second call: warm measurement.
    const float firstCallMs = timeSolve(cusparseH, lin_rows, d_lin_tridiag, d_lin_rhs,
                                        d_lin_x, start, stop);
    const float milliSec = timeSolve(cusparseH, lin_rows, d_lin_tridiag, d_lin_rhs,
                                     d_lin_x, start, stop);

    checkCuda(cudaMemcpy(h_linear_x, d_lin_x, vecBytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(h_linear_x)");

    std::fprintf(stderr, "first call: %f ms\n", firstCallMs);
    std::cout << lin_rows << " " << milliSec << std::endl;

    // Release everything that was acquired above.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    checkCusparse(cusparseDestroy(cusparseH), "cusparseDestroy");
    checkCuda(cudaFree(d_lin_tridiag), "cudaFree(d_lin_tridiag)");
    checkCuda(cudaFree(d_lin_rhs), "cudaFree(d_lin_rhs)");
    checkCuda(cudaFree(d_lin_x), "cudaFree(d_lin_x)");
    delete[] h_linear_tridiag;
    delete[] h_linear_rhs;
    delete[] h_linear_x;
    return 0;
}