I have implemented a PCG method with CUDA. After timing it, I have found that 99% of my time is spent in a function that uses the cusparseSpSV function set and was written to perform a forward–backward (triangular) solve. This results in my preconditioned method taking 16 times longer than my non-preconditioned method, regardless of which preconditioner I use. The issue is not with the correctness of the results, but with the time taken to perform a single forward–backward solve, so it is not a data-formatting problem.
I am using matrices with sparsities of 99.7% or greater. The matrices are converted into CSR format before being passed into the function. Can someone tell me why this particular CUDA function is so slow? Have I implemented it poorly? Is there an alternative function set?
// Applies the preconditioner z = (L * L^T)^{-1} r via two triangular solves
// with the cuSPARSE generic SpSV API:
//   forward  solve: L   * a = r   (result stored in vecTemp_a)
//   backward solve: L^T * z = a   (result stored in vecZ)
//
// PERFORMANCE NOTE (the likely cause of the 99% hotspot): this function
// re-runs cusparseSpSV_bufferSize, cudaMalloc and — above all —
// cusparseSpSV_analysis on EVERY call.  The analysis phase builds the
// dependency/level information for the triangular factor and is far more
// expensive than the solve itself; cudaMalloc/cudaFree also synchronize the
// device.  In an iterative method (PCG) the descriptors, buffers and analysis
// results depend only on matL and should be created ONCE before the solver
// loop, with only the two cusparseSpSV_solve calls executed per iteration.
// The original signature is preserved here, so hoisting that setup out is
// left to the caller — but it is the fix for the reported slowdown.
//
// Parameters:
//   cusparseHandle - initialized cuSPARSE library handle
//   matL           - sparse lower-triangular factor L in CSR form (caller is
//                    assumed to have set fill mode / diag type on the
//                    descriptor — TODO confirm at the call site)
//   vecZ           - output dense vector z (backward-solve result)
//   vecRes         - input dense vector r (residual)
//   vecTemp_a      - scratch dense vector a (forward-solve result)
void triangular_solve(cusparseHandle_t cusparseHandle, cusparseConstSpMatDescr_t matL, cusparseDnVecDescr_t vecZ, cusparseDnVecDescr_t vecRes, cusparseDnVecDescr_t vecTemp_a) {
// Local status-checking macros: the original code discarded every
// cuSPARSE/CUDA return code, so failures (e.g. OOM in cudaMalloc, a matrix
// descriptor missing its fill mode) went unnoticed.
#define SPSV_CHECK_CUSPARSE(call)                                             \
    do {                                                                      \
        cusparseStatus_t st_ = (call);                                        \
        if (st_ != CUSPARSE_STATUS_SUCCESS) {                                 \
            fprintf(stderr, "cuSPARSE error %s:%d: status %d\n", __FILE__,    \
                    __LINE__, (int)st_);                                      \
            abort();                                                          \
        }                                                                     \
    } while (0)
#define SPSV_CHECK_CUDA(call)                                                 \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            abort();                                                          \
        }                                                                     \
    } while (0)

    const double alpha = 1.0;  // no scaling of the right-hand side

    // One descriptor per solve direction: cuSPARSE ties the analysis data to
    // the descriptor, so the forward and backward solves cannot share one.
    cusparseSpSVDescr_t descrFwd = NULL, descrBwd = NULL;
    SPSV_CHECK_CUSPARSE(cusparseSpSV_createDescr(&descrFwd));
    SPSV_CHECK_CUSPARSE(cusparseSpSV_createDescr(&descrBwd));

    // ---- Forward solve: L * a = r ----
    size_t fwdBytes = 0;
    void* fwdBuffer = nullptr;
    SPSV_CHECK_CUSPARSE(cusparseSpSV_bufferSize(
        cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matL,
        vecRes, vecTemp_a, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, descrFwd,
        &fwdBytes));
    SPSV_CHECK_CUDA(cudaMalloc(&fwdBuffer, fwdBytes));
    // Analysis builds the triangular-solve metadata — the dominant cost of
    // this function; ideally executed once per matrix, not once per call.
    SPSV_CHECK_CUSPARSE(cusparseSpSV_analysis(
        cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matL,
        vecRes, vecTemp_a, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, descrFwd,
        fwdBuffer));
    SPSV_CHECK_CUSPARSE(cusparseSpSV_solve(
        cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matL,
        vecRes, vecTemp_a, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, descrFwd));

    // ---- Backward solve: L^T * z = a ----
    // A second, separate buffer is used instead of the original
    // cudaFree + cudaMalloc of the forward buffer: the analysis buffer must
    // stay valid while its descriptor is in use, and the free/re-malloc pair
    // forced an implicit device synchronization between the two solves.
    size_t bwdBytes = 0;
    void* bwdBuffer = nullptr;
    SPSV_CHECK_CUSPARSE(cusparseSpSV_bufferSize(
        cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, &alpha, matL,
        vecTemp_a, vecZ, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, descrBwd,
        &bwdBytes));
    SPSV_CHECK_CUDA(cudaMalloc(&bwdBuffer, bwdBytes));
    SPSV_CHECK_CUSPARSE(cusparseSpSV_analysis(
        cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, &alpha, matL,
        vecTemp_a, vecZ, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, descrBwd,
        bwdBuffer));
    SPSV_CHECK_CUSPARSE(cusparseSpSV_solve(
        cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, &alpha, matL,
        vecTemp_a, vecZ, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, descrBwd));

    // Cleanup.  cudaFree implicitly synchronizes, so both solves have
    // completed before the buffers and descriptors are released.
    SPSV_CHECK_CUDA(cudaFree(fwdBuffer));
    SPSV_CHECK_CUDA(cudaFree(bwdBuffer));
    SPSV_CHECK_CUSPARSE(cusparseSpSV_destroyDescr(descrFwd));
    SPSV_CHECK_CUSPARSE(cusparseSpSV_destroyDescr(descrBwd));

#undef SPSV_CHECK_CUSPARSE
#undef SPSV_CHECK_CUDA
}