Hi,
I am new to CUDA programming. I was testing a cuda code for matrix inversion using LU in cuSOLVER (cusolverDnDgetrf
and then cusolverDnDgetrs
). I used eigen3 (libeigen3-dev) library to compute the inverse of the same matrix. I also verified the output inverted matrix using the eigen library (AA^-1 = I).
The difficulty I found was that I was obtaining correct results for matrix sizes up to 25 X 25. However, for size bigger than 25, the cusolver result differs from the eigen output and fails to validate AA^-1 = I.
Can somebody tell me whether I’m doing something incorrectly or what the likely cause of the issue is?
Thanks in advance !
System Information:
- CUDA Version: 12.5
- cuSOLVER Version: 11602
- GPU: NVIDIA T1000 (8 GB)
- Compute Capability: 7.5
- Operating System: Ubuntu 22.04
- Compiler: nvcc V12.5.40
Here is my complete code:
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <Eigen/Dense>
#define N 20
// Size of the matrix
static void HandleError( cudaError_t err,
const char *file,
int line ) {
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
const char* cusolverGetErrorString(cusolverStatus_t status) {
switch(status) {
case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED: return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE: return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH: return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_MAPPING_ERROR: return "CUSOLVER_STATUS_MAPPING_ERROR";
case CUSOLVER_STATUS_EXECUTION_FAILED: return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR: return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
default: return "Unknown cuSOLVER status";
}
}
#define CHECK_CUSOLVER(call) \
do { \
cusolverStatus_t status = (call); \
if (status != CUSOLVER_STATUS_SUCCESS) { \
std::cerr << "CUSOLVER error at " << __FILE__ << ":" << __LINE__ << std::endl; \
std::cerr << cusolverGetErrorString(status) << std::endl; \
return EXIT_FAILURE; \
} \
} while(0)
void printMatrix(double *matrix, int n) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
std::cout << matrix[i * n + j] << " ";
}
std::cout << std::endl;
}
}
void verifyInverse(double *A, double *invA, int n) {
Eigen::Map<Eigen::MatrixXd> matA(A, n, n);
Eigen::Map<Eigen::MatrixXd> matInvA(invA, n, n);
Eigen::MatrixXd I = matA * matInvA;
// Verify if A * A^-1 = I (Identity Matrix)
if (I.isIdentity(1e-6)) {
std::cout << "Verification successful: The computed inverse is correct." << std::endl;
} else {
std::cout << "Verification failed: The computed inverse is incorrect." << std::endl;
}
// Print the resulting identity matrix for detailed comparison
// std::cout << "Resulting identity matrix from A * A^-1:\n" << I << std::endl;
}
void verifyCPUInverse(Eigen::MatrixXd& matA, Eigen::MatrixXd& matInvA) {
Eigen::MatrixXd I = matA * matInvA;
// Verify if A * A^-1 = I (Identity Matrix)
if (I.isIdentity(1e-6)) {
std::cout << "Verification successful: The CPU-computed inverse is correct." << std::endl;
} else {
std::cout << "Verification failed: The CPU-computed inverse is incorrect." << std::endl;
}
// Print the resulting identity matrix for detailed comparison
// std::cout << "Resulting identity matrix from A * A^-1 (CPU):\n" << I << std::endl;
}
void printArray(int *array, int n) {
for (int i = 0; i < n; i++) {
std::cout << array[i] << " ";
}
std::cout << std::endl;
}
int main() {
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
double *h_A = (double*)malloc(sizeof(double) * N * N);
double *h_invA = (double*)malloc(sizeof(double) * N * N);
double *d_A = nullptr;
double *d_invA = nullptr;
double *d_work = nullptr;
int *d_Ipiv = nullptr;
int *d_info = nullptr;
int h_Ipiv[N];
int h_info;
int lwork;
// Initialize cuSOLVER
CHECK_CUSOLVER(cusolverDnCreate(&cusolverH));
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
CHECK_CUSOLVER(cusolverDnSetStream(cusolverH, stream));
// Generate a random matrix A
srand(time(0));
for (int i = 0; i < N * N; i++) {
h_A[i] = rand() % 100 + 1; // Random numbers between 1 and 100
}
// Print the original matrix
// std::cout << "Original matrix A:\n";
// printMatrix(h_A, N);
auto start_gpu = std::chrono::high_resolution_clock::now();
// Allocate device memory
HANDLE_ERROR(cudaMalloc((void**)&d_A, sizeof(double) * N * N));
HANDLE_ERROR(cudaMalloc((void**)&d_invA, sizeof(double) * N * N));
HANDLE_ERROR(cudaMalloc((void**)&d_Ipiv, sizeof(int) * N));
HANDLE_ERROR(cudaMalloc((void**)&d_info, sizeof(int)));
// Copy data from host to device
HANDLE_ERROR(cudaMemcpy(d_A, h_A, sizeof(double) * N * N, cudaMemcpyHostToDevice));
// Query working space of getrf
CHECK_CUSOLVER(cusolverDnDgetrf_bufferSize(cusolverH, N, N, d_A, N, &lwork));
HANDLE_ERROR(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
// Measure GPU execution time
// Perform LU decomposition
CHECK_CUSOLVER(cusolverDnDgetrf(cusolverH, N, N, d_A, N, d_work, d_Ipiv, d_info));
// Check if LU decomposition was successful
HANDLE_ERROR(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
if (h_info != 0) {
std::cerr << "LU decomposition failed" << std::endl;
return -1;
}
// Copy LU decomposition result and pivot indices to host
double *h_LU = (double*)malloc(sizeof(double) * N * N);
HANDLE_ERROR(cudaMemcpy(h_LU, d_A, sizeof(double) * N * N, cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(h_Ipiv, d_Ipiv, sizeof(int) * N, cudaMemcpyDeviceToHost));
// Print LU decomposition result
// std::cout << "LU decomposition result:\n";
// printMatrix(h_LU, N);
// Print pivot indices
std::cout << "Pivot indices:\n";
printArray(h_Ipiv, N);
// Create an identity matrix as the right-hand side
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
if (i == j)
h_invA[i * N + j] = 1.0;
else
h_invA[i * N + j] = 0.0;
}
}
// Copy the identity matrix to the device
HANDLE_ERROR(cudaMemcpy(d_invA, h_invA, sizeof(double) * N * N, cudaMemcpyHostToDevice));
// Solve the system A * X = I to get the inverse
CHECK_CUSOLVER(cusolverDnDgetrs(cusolverH, CUBLAS_OP_N, N, N, d_A, N, d_Ipiv, d_invA, N, d_info));
// Check if the solve was successful
HANDLE_ERROR(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
if (h_info != 0) {
std::cerr << "Matrix inversion failed" << std::endl;
return -1;
}
// Measure GPU execution time
auto end_gpu = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> gpu_time = end_gpu - start_gpu;
std::cout << "GPU execution time: " << gpu_time.count() << " seconds" << std::endl;
// Copy the inverse matrix back to the host
HANDLE_ERROR(cudaMemcpy(h_invA, d_invA, sizeof(double) * N * N, cudaMemcpyDeviceToHost));
// Print the inverse matrix
// std::cout << "Inverse matrix A^-1 (GPU):\n";
// printMatrix(h_invA, N);
// Verify the inverse matrix
verifyInverse(h_A, h_invA, N);
// Clean up resources
HANDLE_ERROR(cudaFree(d_A));
HANDLE_ERROR(cudaFree(d_invA));
HANDLE_ERROR(cudaFree(d_Ipiv));
HANDLE_ERROR(cudaFree(d_info));
HANDLE_ERROR(cudaFree(d_work));
CHECK_CUSOLVER(cusolverDnDestroy(cusolverH));
cudaStreamDestroy(stream);
// Measure CPU execution time
Eigen::MatrixXd matA = Eigen::Map<Eigen::MatrixXd>(h_A, N, N);
auto start_cpu = std::chrono::high_resolution_clock::now();
Eigen::MatrixXd matInvA = matA.inverse();
auto end_cpu = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> cpu_time = end_cpu - start_cpu;
std::cout << "CPU execution time: " << cpu_time.count() << " seconds" << std::endl;
// Print the CPU inverse matrix
// std::cout << "Inverse matrix A^-1 (CPU):\n" << matInvA << std::endl;
// Verify the CPU-computed inverse matrix
verifyCPUInverse(matA, matInvA);
// Free host memory
free(h_A);
free(h_invA);
free(h_LU);
return 0;
}