I am interested in leveraging BSR-formatted matrices with the cuSPARSE API in CUDA. Currently, I have built a simple example that takes a 2x2 BSR-formatted matrix and a 2x1 right-hand side vector and solves for x in Ax = b.
I understand that the bsrsv2 functionality is deprecated, but I was informed that although it will be removed in a future release, the current version will work as expected.
The issue with my code is that it appears to solve only the lower triangular part of the matrix. In other words, if I input a fully dense matrix and call bsrsv2_solve(), it only "sees" the lower triangular part of the matrix and solves the system while ignoring the upper triangular part.
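To make the symptom concrete, here is my hand arithmetic for the test system used in the code below:

$$
A = \begin{pmatrix} 1 & 2 \\ 5 & 9 \end{pmatrix}, \quad
b = \begin{pmatrix} 1 \\ 2 \end{pmatrix}, \quad
A^{-1}b = \begin{pmatrix} -5 \\ 3 \end{pmatrix}
$$

If the solver really uses only the lower triangle (with its non-unit diagonal), the returned result would be $x = (1, -1/3)^T$, i.e. the solution of $\begin{pmatrix} 1 & 0 \\ 5 & 9 \end{pmatrix} x = b$, rather than the full dense solution.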
My approach to implementing this code is to:
- Initialize the matrix on the CPU
- Send the matrix to GPU
- Run bsrsv2_bufferSize()
- Run bsrsv2_analysis(). My understanding is that this call decomposes the matrix into LU form for the solver.
- Run bsrsv2_solve()
- Send the solution back to the CPU and print the results
Here is the minimal code I have cooked up to test the BSR solver functionality, compiled and run with:
nvcc exampleBSR.cu -lcublas -lcusparse -lcudart -lcusolver && ./a.out
#include <cstdio>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include <iostream>
/*
Solve A x = b, where A is a single 2x2 BSR block:
A = [1 2
     5 9]
b = [1, 2]
Expected full dense solution: x = [-5, 3]
*/
#define CHECK_CUSPARSE(call) \
{ \
cusparseStatus_t err; \
if ((err = (call)) != CUSPARSE_STATUS_SUCCESS) { \
fprintf(stderr, "Got error %d at %s:%d\n", err, __FILE__, __LINE__); \
fprintf(stderr, " cuSPARSE error = %s string = \n", \
cusparseGetErrorString(err)); \
cudaError_t cuda_err = cudaGetLastError(); \
if (cuda_err != cudaSuccess) { \
fprintf(stderr, " CUDA error \"%s\" also detected\n", \
cudaGetErrorString(cuda_err)); \
} \
exit(1); \
} \
}
int main() {
  // Initialize cuSPARSE
  cusparseHandle_t handle;
  CHECK_CUSPARSE(cusparseCreate(&handle));
  // Create solve analysis info
  bsrsv2Info_t info;
  CHECK_CUSPARSE(cusparseCreateBsrsv2Info(&info));
  // Define matrix transpose, direction, and policy
const cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;
const cusparseDirection_t dir = CUSPARSE_DIRECTION_ROW;
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_NO_LEVEL;
// Create matrix descriptor
cusparseMatDescr_t descr;
  CHECK_CUSPARSE(cusparseCreateMatDescr(&descr));
  CHECK_CUSPARSE(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  CHECK_CUSPARSE(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
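  // Note: fill mode and diagonal type are not set explicitly here, so the
  // descriptor keeps cuSPARSE's defaults for both.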
// BSR matrix
int mb = 1; // Number of block rows (and columns)
int nnzb = 1; // Number of non-zero blocks
int blockDim = 2;
const double alpha = 1.0;
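  // With dir = CUSPARSE_DIRECTION_ROW, each 2x2 block is stored row-major,
  // so {1.0, 2.0, 5.0, 9.0} represents the block [1 2; 5 9].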
double hBsrVal[] = {1.0, 2.0, 5.0, 9.0};
int hBsrRowPtr[] = {0, 1};
int hBsrColInd[] = {0};
// Right-hand side vector (b)
double h_b[2] = {1, 2};
// Solution vector (x)
double h_x[2] = {0, 0};
// Create device arrays and copy data to the device
double *dBsrVal, *d_b = nullptr, *d_x = nullptr;
int *dBsrRowPtr, *dBsrColInd;
cudaMalloc((void **)&dBsrVal, sizeof(hBsrVal));
cudaMalloc((void **)&dBsrRowPtr, sizeof(hBsrRowPtr));
cudaMalloc((void **)&dBsrColInd, sizeof(hBsrColInd));
cudaMalloc((void **)&d_b, sizeof(double) * 2);
cudaMalloc((void **)&d_x, sizeof(double) * 2);
cudaMemcpy(dBsrVal, hBsrVal, sizeof(hBsrVal), cudaMemcpyHostToDevice);
cudaMemcpy(dBsrRowPtr, hBsrRowPtr, sizeof(hBsrRowPtr),
cudaMemcpyHostToDevice);
cudaMemcpy(dBsrColInd, hBsrColInd, sizeof(hBsrColInd),
cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, 2 * sizeof(double), cudaMemcpyHostToDevice);
  // Step 1: Query the buffer size the solver needs, then analyze
  int pBufferSize;
  void *pBuffer = nullptr;
CHECK_CUSPARSE(cusparseDbsrsv2_bufferSize(handle, dir, trans, mb, nnzb, descr,
dBsrVal, dBsrRowPtr, dBsrColInd,
blockDim, info, &pBufferSize));
cudaMalloc(&pBuffer, pBufferSize);
// Perform analysis
CHECK_CUSPARSE(cusparseDbsrsv2_analysis(handle, dir, trans, mb, nnzb, descr,
dBsrVal, dBsrRowPtr, dBsrColInd,
blockDim, info, policy, pBuffer));
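  // Optional diagnostic: ask cuSPARSE whether the analysis found a structural
  // zero pivot (a missing diagonal entry). This is a standard follow-up to
  // bsrsv2_analysis and complements the CHECK_CUSPARSE macro above.
  int structuralZero;
  if (cusparseXbsrsv2_zeroPivot(handle, info, &structuralZero) ==
      CUSPARSE_STATUS_ZERO_PIVOT) {
    printf("A(%d,%d) is missing\n", structuralZero, structuralZero);
  }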
// Step 2: Solve A * x = b
CHECK_CUSPARSE(cusparseDbsrsv2_solve(
handle, dir, trans, mb, nnzb, &alpha, descr, dBsrVal, dBsrRowPtr,
dBsrColInd, blockDim, info, d_b, d_x, policy, pBuffer));
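  // Optional diagnostic: check whether the solve hit a numerical zero pivot
  // (an exactly-zero diagonal value), again via the standard bsrsv2 query.
  int numericalZero;
  if (cusparseXbsrsv2_zeroPivot(handle, info, &numericalZero) ==
      CUSPARSE_STATUS_ZERO_PIVOT) {
    printf("A(%d,%d) is zero\n", numericalZero, numericalZero);
  }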
// Copy the result back to the host
cudaMemcpy(h_x, d_x, 2 * sizeof(double), cudaMemcpyDeviceToHost);
// Print the solution
std::cout << "Solution x = [";
for (int i = 0; i < 2; i++) {
std::cout << h_x[i] << " ";
}
std::cout << "]" << std::endl;
// Cleanup
cudaFree(dBsrVal);
cudaFree(dBsrRowPtr);
cudaFree(dBsrColInd);
cudaFree(d_b);
cudaFree(d_x);
cudaFree(pBuffer);
cusparseDestroyBsrsv2Info(info);
cusparseDestroyMatDescr(descr);
cusparseDestroy(handle);
return 0;
}
I added the CHECK_CUSPARSE macro to capture any errors from the cuSPARSE calls, but it has not flagged any failure points in my implementation.
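For reference, here is a tiny standalone CPU check (my own throwaway code, not part of cuSPARSE) that computes the full dense solution of the same 2x2 system via Cramer's rule, so the GPU output can be compared against it:

#include <cstdio>

int main() {
  // Same system as the BSR example: A = [1 2; 5 9], b = [1, 2]
  const double a11 = 1.0, a12 = 2.0, a21 = 5.0, a22 = 9.0;
  const double b1 = 1.0, b2 = 2.0;
  const double det = a11 * a22 - a12 * a21;       // det(A) = -1
  const double x1 = (b1 * a22 - a12 * b2) / det;  // = -5
  const double x2 = (a11 * b2 - b1 * a21) / det;  // =  3
  printf("reference x = [%g, %g]\n", x1, x2);
  return 0;
}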
Does anyone know whether the BSR solver in cuSPARSE is capable of solving a general (fully dense) system, or why it only appears to use the lower triangular part of the matrix?