Packed matrix format for cuSOLVER Cholesky (potrf)

If I want to do a Cholesky factorization of a symmetric positive definite matrix with cuSOLVER, is it enough to allocate and pass only the lower triangular entries in packed format? That is, for an n×n SPD matrix A, can I store the entries [a_11, a_21, a_31, …, a_n1, a_22, a_32, …, a_n2, …, a_nn] in a single double array and pass that to cusolverDnDpotrf?
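(For reference, the ordering above stores the columns of the lower triangle back to back, so element (i, j) with i >= j, using 0-based indices, would land at a linear index like the sketch below. packed_index is just a hypothetical helper to illustrate the layout I mean, not something my code calls.)

// Sketch of the packed layout described above: column j starts after the
// n + (n-1) + ... + (n-j+1) entries of columns 0..j-1.
size_t packed_index(int i, int j, int n) {   // requires i >= j
    return (size_t)j * n - (size_t)j * (j + 1) / 2 + i;
}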

I tried this out, but I get a non-positive-definite matrix error when I use the packed format. The test matrix has n on the diagonal and 1's everywhere else. The code below, which ChatGPT generated for this example, builds the matrix both ways: the full-matrix version works, but the packed one fails, and I tried a few different orderings of the packed entries without getting anything to work.
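For concreteness, with n = 3 the test matrix is

3 1 1
1 3 1
1 1 3

so the packed version passes the six entries [3, 1, 1, 3, 1, 3], while the full version passes all nine entries in column-major order (with zeros in the upper triangle, which potrf never reads in CUBLAS_FILL_MODE_LOWER).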

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <cublas_v2.h>
#include <time.h>
#include <math.h>   // fabs() used in check_solution

#define CUDA_CHECK(call) do { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error: %s (%s:%d)\n", cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

#define CUSOLVER_CHECK(call) do { \
    cusolverStatus_t err = call; \
    if (err != CUSOLVER_STATUS_SUCCESS) { \
        fprintf(stderr, "cuSolver error: %d (%s:%d)\n", err, __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

void generate_lower_triangular_packed(double *A, int n) {
    int index = 0;
    for (int j = 0; j < n; ++j) {
        for (int i = j; i < n; ++i) {
            if (i == j) {
                A[index] = n;  // Diagonal element
            } else {
                A[index] = 1.0;  // Off-diagonal element
            }
            index++;
        }
    }
}

// Full n x n matrix in column-major storage (what cuSOLVER expects):
// n on the diagonal, 1 in the lower triangle, 0 in the upper triangle
// (the upper triangle is never referenced with CUBLAS_FILL_MODE_LOWER).
void generate_lower_triangular_full(double *A, int n) {
    for (int j = 0; j < n; ++j) {          // column
        for (int i = 0; i < n; ++i) {      // row
            if (i == j) {
                A[j * n + i] = n;          // Diagonal element
            } else if (i > j) {
                A[j * n + i] = 1.0;        // Lower triangle is 1
            } else {
                A[j * n + i] = 0.0;        // Zero upper triangle
            }
        }
    }
}

void check_solution(double *x, int n, double expected_value) {
    for (int i = 0; i < n; ++i) {
        if (fabs(x[i] - expected_value) > 1e-6) {
            fprintf(stderr, "Solution verification failed at index %d: expected %f, got %f\n", i, expected_value, x[i]);
            exit(EXIT_FAILURE);
        }
    }
    printf("Solution verification passed.\n");
}

int main(int argc, char **argv) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <matrix_size> <storage_type (0: full, 1: packed)>\n", argv[0]);
        return EXIT_FAILURE;
    }

    int n = atoi(argv[1]);
    int storage_type = atoi(argv[2]);

    if (n <= 0) {
        fprintf(stderr, "Matrix size must be a positive integer.\n");
        return EXIT_FAILURE;
    }

    if (storage_type != 0 && storage_type != 1) {
        fprintf(stderr, "Storage type must be 0 (full) or 1 (packed).\n");
        return EXIT_FAILURE;
    }

    printf("Matrix size: %d\n", n);
    printf("Storage type: %s\n", storage_type == 0 ? "Full" : "Packed");

    // Host memory
    size_t matrix_size = (storage_type == 1) ? n * (n + 1) / 2 : n * n;
    double *h_A = (double *)malloc(matrix_size * sizeof(double));
    double *h_b = (double *)malloc(n * sizeof(double));
    double *h_x = (double *)malloc(n * sizeof(double));
    if (!h_A || !h_b || !h_x) {
        fprintf(stderr, "Host memory allocation failed.\n");
        return EXIT_FAILURE;
    }

    // Initialize matrix and RHS vector
    if (storage_type == 1) {
        generate_lower_triangular_packed(h_A, n);
    } else {
        generate_lower_triangular_full(h_A, n);
    }

    for (int i = 0; i < n; ++i) h_b[i] = 1.0;

    // Device memory
    double *d_A, *d_b, *d_workspace;
    int *d_info;
    int workspace_size;

    CUDA_CHECK(cudaMalloc((void **)&d_A, matrix_size * sizeof(double)));
    CUDA_CHECK(cudaMalloc((void **)&d_b, n * sizeof(double)));
    CUDA_CHECK(cudaMalloc((void **)&d_info, sizeof(int)));

    // Copy data to device
    CUDA_CHECK(cudaMemcpy(d_A, h_A, matrix_size * sizeof(double), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, n * sizeof(double), cudaMemcpyHostToDevice));

    // cuSOLVER handle
    cusolverDnHandle_t handle;
    CUSOLVER_CHECK(cusolverDnCreate(&handle));

    // Workspace query
    CUSOLVER_CHECK(cusolverDnDpotrf_bufferSize(handle, CUBLAS_FILL_MODE_LOWER, n, d_A, n, &workspace_size));
    CUDA_CHECK(cudaMalloc((void **)&d_workspace, workspace_size * sizeof(double)));

    // Timing
    clock_t start, end;

    // Cholesky factorization
    start = clock();
    CUSOLVER_CHECK(cusolverDnDpotrf(handle, CUBLAS_FILL_MODE_LOWER, n, d_A, n, d_workspace, workspace_size, d_info));
    CUDA_CHECK(cudaDeviceSynchronize());  // potrf runs asynchronously; synchronize so clock() times the factorization
    end = clock();
    printf("Factorization time: %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC);

    // Check factorization success
    int info;
    CUDA_CHECK(cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
    if (info != 0) {
        fprintf(stderr, "Factorization failed: info = %d\n", info);
        return EXIT_FAILURE;
    }

    // Solve system
    start = clock();
    CUSOLVER_CHECK(cusolverDnDpotrs(handle, CUBLAS_FILL_MODE_LOWER, n, 1, d_A, n, d_b, n, d_info));
    CUDA_CHECK(cudaDeviceSynchronize());  // synchronize so the solve itself is timed, not just the launch
    end = clock();
    printf("Solve time: %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC);

    // Copy solution back to host
    CUDA_CHECK(cudaMemcpy(h_x, d_b, n * sizeof(double), cudaMemcpyDeviceToHost));

    // Verify solution
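    // Each row of A sums to n + (n - 1)*1 = 2n - 1, so A*x = 1 gives x_i = 1/(2n - 1)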
    double expected_value = 1.0 / (2 * n - 1);
    check_solution(h_x, n, expected_value);

    // Memory usage
    printf("Memory usage (bytes):\n");
    printf("  Matrix A: %zu\n", matrix_size * sizeof(double));
    printf("  RHS vector b: %zu\n", n * sizeof(double));
    printf("  Workspace: %d\n", workspace_size * (int)sizeof(double));

    // Cleanup
    free(h_A);
    free(h_b);
    free(h_x);
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_workspace));
    CUDA_CHECK(cudaFree(d_info));
    CUSOLVER_CHECK(cusolverDnDestroy(handle));

    return EXIT_SUCCESS;
}

Compile with: nvcc -o cholesky_solver cholesky_solver.cu -lcusolver -lcublas
Run with: ./cholesky_solver <matrix_size> <storage_type (0: full, 1: packed)>
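For example, ./cholesky_solver 1000 0 factors and solves the n = 1000 system with full storage; ./cholesky_solver 1000 1 uses the packed layout, which is where I see the error.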