If I want to do a cholesky factorization of a symmetric matrix with cuSOLVER, is it enough for me to just allocate and pass the lower triangular entries in a packed format? That is, for the nxn SPD matrix A, can I store the entries [a_11,a_21,a_31,…,a_n1,a_22,_a32,…,a_n2,…,a_nn] in a double pointer and pass that to DnDpotrf?
I tried this out, but I’m getting a non-positive definite matrix error when I use the packed format. The test matrix has n on the diagonal and 1’s elsewhere. I have the code below that chatGPT generated for this example. The full matrix version seems to work, but the packed one gives an error. I tried a few orderings of the packed format but didn’t get anything that worked.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <cublas_v2.h>
#include <time.h>
#define CUDA_CHECK(call) do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error: %s (%s:%d)\n", cudaGetErrorString(err), __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while(0)
#define CUSOLVER_CHECK(call) do { \
cusolverStatus_t err = call; \
if (err != CUSOLVER_STATUS_SUCCESS) { \
fprintf(stderr, "cuSolver error: %d (%s:%d)\n", err, __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while(0)
void generate_lower_triangular_packed(double *A, int n) {
int index = 0;
for (int j = 0; j < n; ++j) {
for (int i = j; i < n; ++i) {
if (i == j) {
A[index] = n; // Diagonal element
} else {
A[index] = 1.0; // Off-diagonal element
}
index++;
}
}
}
void generate_lower_triangular_full(double *A, int n) {
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
if (i == j) {
A[i * n + j] = n; // Diagonal element
} else if (i > j) {
A[i * n + j] = 0.0; // Zero upper triangle
} else {
A[i * n + j] = 1.0; // Lower triangle is 1
}
}
}
}
void check_solution(double *x, int n, double expected_value) {
for (int i = 0; i < n; ++i) {
if (fabs(x[i] - expected_value) > 1e-6) {
fprintf(stderr, "Solution verification failed at index %d: expected %f, got %f\n", i, expected_value, x[i]);
exit(EXIT_FAILURE);
}
}
printf("Solution verification passed.\n");
}
int main(int argc, char **argv) {
if (argc != 3) {
fprintf(stderr, "Usage: %s <matrix_size> <storage_type (0: full, 1: packed)>\n", argv[0]);
return EXIT_FAILURE;
}
int n = atoi(argv[1]);
int storage_type = atoi(argv[2]);
if (n <= 0) {
fprintf(stderr, "Matrix size must be a positive integer.\n");
return EXIT_FAILURE;
}
if (storage_type != 0 && storage_type != 1) {
fprintf(stderr, "Storage type must be 0 (full) or 1 (packed).\n");
return EXIT_FAILURE;
}
printf("Matrix size: %d\n", n);
printf("Storage type: %s\n", storage_type == 0 ? "Full" : "Packed");
// Host memory
size_t matrix_size = (storage_type == 1) ? n * (n + 1) / 2 : n * n;
double *h_A = (double *)malloc(matrix_size * sizeof(double));
double *h_b = (double *)malloc(n * sizeof(double));
double *h_x = (double *)malloc(n * sizeof(double));
if (!h_A || !h_b || !h_x) {
fprintf(stderr, "Host memory allocation failed.\n");
return EXIT_FAILURE;
}
// Initialize matrix and RHS vector
if (storage_type == 1) {
generate_lower_triangular_packed(h_A, n);
} else {
generate_lower_triangular_full(h_A, n);
}
for (int i = 0; i < n; ++i) h_b[i] = 1.0;
// Device memory
double *d_A, *d_b, *d_workspace;
int *d_info;
int workspace_size;
CUDA_CHECK(cudaMalloc((void **)&d_A, matrix_size * sizeof(double)));
CUDA_CHECK(cudaMalloc((void **)&d_b, n * sizeof(double)));
CUDA_CHECK(cudaMalloc((void **)&d_info, sizeof(int)));
// Copy data to device
CUDA_CHECK(cudaMemcpy(d_A, h_A, matrix_size * sizeof(double), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_b, h_b, n * sizeof(double), cudaMemcpyHostToDevice));
// cuSOLVER handle
cusolverDnHandle_t handle;
CUSOLVER_CHECK(cusolverDnCreate(&handle));
// Workspace query
CUSOLVER_CHECK(cusolverDnDpotrf_bufferSize(handle, CUBLAS_FILL_MODE_LOWER, n, d_A, n, &workspace_size));
CUDA_CHECK(cudaMalloc((void **)&d_workspace, workspace_size * sizeof(double)));
// Timing
clock_t start, end;
// Cholesky factorization
start = clock();
CUSOLVER_CHECK(cusolverDnDpotrf(handle, CUBLAS_FILL_MODE_LOWER, n, d_A, n, d_workspace, workspace_size, d_info));
end = clock();
printf("Factorization time: %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC);
// Check factorization success
int info;
CUDA_CHECK(cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
if (info != 0) {
fprintf(stderr, "Factorization failed: info = %d\n", info);
return EXIT_FAILURE;
}
// Solve system
start = clock();
CUSOLVER_CHECK(cusolverDnDpotrs(handle, CUBLAS_FILL_MODE_LOWER, n, 1, d_A, n, d_b, n, d_info));
end = clock();
printf("Solve time: %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC);
// Copy solution back to host
CUDA_CHECK(cudaMemcpy(h_x, d_b, n * sizeof(double), cudaMemcpyDeviceToHost));
// Verify solution
double expected_value = 1.0 / (2 * n - 1);
check_solution(h_x, n, expected_value);
// Memory usage
printf("Memory usage (bytes):\n");
printf(" Matrix A: %zu\n", matrix_size * sizeof(double));
printf(" RHS vector b: %zu\n", n * sizeof(double));
printf(" Workspace: %d\n", workspace_size * (int)sizeof(double));
// Cleanup
free(h_A);
free(h_b);
free(h_x);
CUDA_CHECK(cudaFree(d_A));
CUDA_CHECK(cudaFree(d_b));
CUDA_CHECK(cudaFree(d_workspace));
CUDA_CHECK(cudaFree(d_info));
CUSOLVER_CHECK(cusolverDnDestroy(handle));
return EXIT_SUCCESS;
}
Compile with nvcc -o cholesky_solver cholesky_solver.cu -lcusolver -lcublas
run with ./cholesky_solver <n> <full/packed=0/1>