cuBLAS EVD function not satisfy AV = VD

tnwilly · August 6, 2024, 9:20am

Hi,

I’m using cuBLAS EVD to obtain the eigen values and eigen vectors of a covariance matrix.
But the results didn’t satisfy AV = VD (covariance mat * eigen vector = eigen vector * eigen values)

I also do the same operation with Eigen lib, and made it.

Can someone explain the mismatch?

Eigen Lib Version (satisfied)

#include <iostream>
#include <Eigen/Dense>

// Helper function to print a complex matrix
void printMatrix(const Eigen::MatrixXcd& mat) {
    for (int i = 0; i < mat.rows(); ++i) {
        for (int j = 0; j < mat.cols(); ++j) {
            std::cout << "(" << mat(i, j).real() << ", " << mat(i, j).imag() << ") ";
        }
        std::cout << std::endl;
    }
}

int main() {
    const int N = 2;
    // Define a 2x2 complex covariance matrix
    Eigen::MatrixXcd A(N, N);
    A(0, 0) = std::complex<double>(4.0, 0.0);
    A(0, 1) = std::complex<double>(1.0, 1.0);
    A(1, 0) = std::complex<double>(1.0, -1.0);
    A(1, 1) = std::complex<double>(3.0, 0.0);

    std::cout << "Original matrix A:" << std::endl;
    printMatrix(A);

    // Compute EVD using Eigen
    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXcd> solver(A);
    Eigen::VectorXcd eigenvalues = solver.eigenvalues();
    Eigen::MatrixXcd eigenvectors = solver.eigenvectors();

    std::cout << "Eigenvalues (diagonal of D):" << std::endl;
    for (int i = 0; i < eigenvalues.size(); ++i) {
        std::cout << eigenvalues[i] << " ";
    }
    std::cout << std::endl;

    std::cout << "Eigenvectors (columns of V):" << std::endl;
    printMatrix(eigenvectors);

    // Verify A * V = V * D
    Eigen::MatrixXcd D = eigenvalues.asDiagonal();
    Eigen::MatrixXcd AV = A * eigenvectors;
    Eigen::MatrixXcd VD = eigenvectors * D;

    std::cout << "A * V:" << std::endl;
    printMatrix(AV);

    std::cout << "V * D:" << std::endl;
    printMatrix(VD);

    // Check if A * V is approximately equal to V * D
    bool equal = AV.isApprox(VD, 1e-6);

    if (equal) {
        std::cout << "Verification successful: A * V = V * D" << std::endl;
    } else {
        std::cout << "Verification failed: A * V != V * D" << std::endl;
    }

    return 0;
}

Original matrix A:
(4, 0) (1, 1)
(1, -1) (3, 0)
Eigenvalues (diagonal of D):
(2,0) (5,0)
Eigenvectors (columns of V):
(0.57735, 0) (0.816497, 0)
(-0.57735, 0.57735) (0.408248, -0.408248)
A * V:
(1.1547, 1.11022e-16) (4.08248, -1.66533e-16)
(-1.1547, 1.1547) (2.04124, -2.04124)
V * D:
(1.1547, 0) (4.08248, 0)
(-1.1547, 1.1547) (2.04124, -2.04124)
Verification successful: A * V = V * D

cuBLAS Version (Failed)

#include <iostream>
#include <vector>
#include <cuComplex.h>
#include <cusolverDn.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

// Helper function to print a complex matrix
void printMatrix(const cuDoubleComplex* A, int rows, int cols) {
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            std::cout << "(" << cuCreal(A[i * cols + j]) << ", " << cuCimag(A[i * cols + j]) << ") ";
        }
        std::cout << std::endl;
    }
}

// Helper function to check if two complex numbers are approximately equal
bool isApproxEqual(cuDoubleComplex a, cuDoubleComplex b, double tol = 1e-6) {
    return (fabs(cuCreal(a) - cuCreal(b)) < tol) && (fabs(cuCimag(a) - cuCimag(b)) < tol);
}

int main() {
    const int N = 2;
    // Define a 2x2 complex covariance matrix
    cuDoubleComplex A[N * N] = {make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(1.0, 1.0),
                                make_cuDoubleComplex(1.0, -1.0), make_cuDoubleComplex(3.0, 0.0)};

    std::cout << "Original matrix A:" << std::endl;
    printMatrix(A, N, N);

    // cuBLAS and cuSOLVER setup
    cusolverDnHandle_t cusolverH = nullptr;
    cublasHandle_t cublasH = nullptr;
    cudaStream_t stream = nullptr;
    cusolverDnCreate(&cusolverH);
    cublasCreate(&cublasH);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cusolverDnSetStream(cusolverH, stream);
    cublasSetStream(cublasH, stream);

    // Device memory for matrix A, eigenvalues, and workspace
    cuDoubleComplex* d_A = nullptr;
    double* d_W = nullptr;
    int* devInfo = nullptr;
    int lwork = 0;
    cuDoubleComplex* d_work = nullptr;

    cudaMalloc((void**)&d_A, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_W, sizeof(double) * N);
    cudaMalloc((void**)&devInfo, sizeof(int));

    cudaMemcpy(d_A, A, sizeof(cuDoubleComplex) * N * N, cudaMemcpyHostToDevice);

    // Query working space
    cusolverDnZheevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, d_A, N, d_W, &lwork);
    cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork);

    // Compute EVD
    cusolverDnZheevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, d_A, N, d_W, d_work, lwork, devInfo);

    // Copy results back to host
    cuDoubleComplex V[N * N];
    double W[N];
    cudaMemcpy(V, d_A, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);
    cudaMemcpy(W, d_W, sizeof(double) * N, cudaMemcpyDeviceToHost);

    std::cout << "Eigenvalues (diagonal of D):" << std::endl;
    for (int i = 0; i < N; ++i) {
        std::cout << W[i] << " ";
    }
    std::cout << std::endl;

    std::cout << "Eigenvectors (columns of V):" << std::endl;
    printMatrix(V, N, N);

    // Verify A * V = V * D using cuBLAS
    cuDoubleComplex* d_V = nullptr;
    cuDoubleComplex* d_DV = nullptr;
    cuDoubleComplex* d_AV = nullptr;

    cudaMalloc((void**)&d_V, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_DV, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_AV, sizeof(cuDoubleComplex) * N * N);

    cudaMemcpy(d_V, V, sizeof(cuDoubleComplex) * N * N, cudaMemcpyHostToDevice);

    cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
    cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0);

    // Calculate V * D
    for (int i = 0; i < N; ++i) {
        cublasZscal(cublasH, N, &alpha, &d_V[i * N], 1);
        cublasZscal(cublasH, N, (cuDoubleComplex*)&W[i], &d_V[i * N], 1);
    }

    cudaMemcpy(d_DV, d_V, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToDevice);

    // Calculate A * V
    cublasZgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_V, N, &beta, d_AV, N);

    // Copy results back to host
    cuDoubleComplex AV[N * N];
    cuDoubleComplex DV[N * N];
    cudaMemcpy(AV, d_AV, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);
    cudaMemcpy(DV, d_DV, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);

    std::cout << "A * V:" << std::endl;
    printMatrix(AV, N, N);

    std::cout << "V * D:" << std::endl;
    printMatrix(DV, N, N);

    // Check if A * V is approximately equal to V * D
    bool equal = true;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            if (!isApproxEqual(AV[i * N + j], DV[i * N + j])) {
                equal = false;
                break;
            }
        }
    }

    if (equal) {
        std::cout << "Verification successful: A * V = V * D" << std::endl;
    } else {
        std::cout << "Verification failed: A * V != V * D" << std::endl;
    }

    // Clean up
    cudaFree(d_A);
    cudaFree(d_W);
    cudaFree(devInfo);
    cudaFree(d_work);
    cudaFree(d_V);
    cudaFree(d_DV);
    cudaFree(d_AV);
    cusolverDnDestroy(cusolverH);
    cublasDestroy(cublasH);
    cudaStreamDestroy(stream);

    return 0;
}

Original matrix A:
(4, 0) (1, 1)
(1, -1) (3, 0)
Eigenvalues (diagonal of D):
2 5
Eigenvectors (columns of V):
(-0.408248, 0.408248) (0.816497, 0)
(-0.57735, 0.57735) (-0.57735, 0)
A * V:
(-1.63316, -2.08088) (-3.27614, -3.35702)
(4.88562, -2.69036) (-2.57597, 1.80474)
V * D:
(-2.85774, -1.22474) (1.63299, 4.08248)
(-5.19615, 0.57735) (-2.88675, -2.3094)
Verification failed: A * V != V * D

Thanks!

Robert_Crovella · August 6, 2024, 4:18pm

cublas and cusolver both expect column-major storage.
we can observe that the produced eigenvalues between the two cases are the same
I don’t recommend this construct the way you are using it:
```
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
```
see here for discussion.
I recommend rigorous error checking any time you are having trouble with a CUDA code
I’m not really sure what you are trying to do here:
```
(cuDoubleComplex*)&W[i],
```
that is not in any way proper for the computation of VD. You’re taking the two eigenvalues and treating them as a single complex number.
When you specified CUSOLVER_EIG_MODE_VECTOR, you declared to cusolver that it should overwrite d_A with the eigenvectors. Therefore your computation of AV is not correct.

Here is an example that has those/various issues addressed, and seems to produce a valid comparison:

# cat t246.cu
#include <iostream>
#include <vector>
#include <cuComplex.h>
#include <cusolverDn.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

// Helper function to print a complex matrix
void printMatrix(const cuDoubleComplex* A, int rows, int cols) {
    for (int i = 0; i < cols; ++i) {
        for (int j = 0; j < rows; ++j) {
            std::cout << "(" << cuCreal(A[j * cols + i]) << ", " << cuCimag(A[j * cols + i]) << ") ";
        }
        std::cout << std::endl;
    }
}

// Helper function to check if two complex numbers are approximately equal
bool isApproxEqual(cuDoubleComplex a, cuDoubleComplex b, double tol = 1e-6) {
    return (fabs(cuCreal(a) - cuCreal(b)) < tol) && (fabs(cuCimag(a) - cuCimag(b)) < tol);
}

int main() {
    const int N = 2;
    // Define a 2x2 complex covariance matrix
    cuDoubleComplex A[N * N] = {make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(1.0, 1.0),
                                make_cuDoubleComplex(1.0, -1.0), make_cuDoubleComplex(3.0, 0.0)};

    std::cout << "Original matrix A:" << std::endl;
    printMatrix(A, N, N);

    // cuBLAS and cuSOLVER setup
    cusolverDnHandle_t cusolverH = nullptr;
    cublasHandle_t cublasH = nullptr;
    cudaStream_t stream = nullptr;
    cusolverStatus_t csstat = cusolverDnCreate(&cusolverH);
    if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;
    cublasStatus_t cbstat = cublasCreate(&cublasH);
    if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;
  //  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cudaStreamCreate(&stream);
    csstat = cusolverDnSetStream(cusolverH, stream);
    if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;
    cbstat = cublasSetStream(cublasH, stream);
    if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;

    // Device memory for matrix A, eigenvalues, and workspace
    cuDoubleComplex* d_A = nullptr;
    double* d_W = nullptr;
    int* devInfo = nullptr;
    int lwork = 0;
    cuDoubleComplex* d_work = nullptr;

    cudaMalloc((void**)&d_A, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_W, sizeof(double) * N);
    cudaMalloc((void**)&devInfo, sizeof(int));

    cudaMemcpy(d_A, A, sizeof(cuDoubleComplex) * N * N, cudaMemcpyHostToDevice);

    // Query working space
    csstat = cusolverDnZheevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, d_A, N, d_W, &lwork);
    if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;
    cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork);
    // Compute EVD
    csstat = cusolverDnZheevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, d_A, N, d_W, d_work, lwork, devInfo);
    if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;

    // Copy results back to host
    cuDoubleComplex V[N * N];
    double W[N];
    cudaMemcpy(V, d_A, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);
    cudaMemcpy(W, d_W, sizeof(double) * N, cudaMemcpyDeviceToHost);
    int hinfo;
    cudaMemcpy(&hinfo, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
    if (hinfo) std::cout << "devInfo: " << hinfo << std::endl;
    std::cout << "Eigenvalues (diagonal of D):" << std::endl;
    for (int i = 0; i < N; ++i) {
        std::cout << W[i] << " ";
    }
    std::cout << std::endl;

    std::cout << "Eigenvectors (columns of V):" << std::endl;
    printMatrix(V, N, N);

    // Verify A * V = V * D using cuBLAS
    cuDoubleComplex* d_V = nullptr;
    cuDoubleComplex* d_D = nullptr;
    cuDoubleComplex* d_DV = nullptr;
    cuDoubleComplex* d_AV = nullptr;

    cudaMalloc((void**)&d_V, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_D, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_DV, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_AV, sizeof(cuDoubleComplex) * N * N);

    cudaMemset(d_D, 0, N*N*sizeof(cuDoubleComplex));
    cuDoubleComplex ev1 = make_cuDoubleComplex(W[0], 0.0);
    cuDoubleComplex ev2 = make_cuDoubleComplex(W[1], 0.0);
    cudaMemcpy(d_D, &ev1, sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
    cudaMemcpy(d_D+3, &ev2, sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);

    cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
    cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0);

    // Calculate V * D  d_A now contains the eigenvectors, it is effectively V
    cbstat = cublasZgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_D, N, &beta, d_DV, N);
    if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;

    cuDoubleComplex* d_nA = nullptr;
    cudaMalloc(&d_nA, N*N*sizeof(cuDoubleComplex));
    cudaMemcpy(d_nA, A, N*N*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
    // Calculate A * V
    cbstat = cublasZgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_nA, N, d_A, N, &beta, d_AV, N);
    if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;

    // Copy results back to host
    cuDoubleComplex AV[N * N];
    cuDoubleComplex DV[N * N];
    cudaMemcpy(AV, d_AV, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);
    cudaMemcpy(DV, d_DV, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);

    std::cout << "A * V:" << std::endl;
    printMatrix(AV, N, N);

    std::cout << "V * D:" << std::endl;
    printMatrix(DV, N, N);

    // Check if A * V is approximately equal to V * D
    bool equal = true;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            if (!isApproxEqual(AV[i * N + j], DV[i * N + j])) {
                equal = false;
                break;
            }
        }
    }

    if (equal) {
        std::cout << "Verification successful: A * V = V * D" << std::endl;
    } else {
        std::cout << "Verification failed: A * V != V * D" << std::endl;
    }

    // Clean up
    cudaFree(d_A);
    cudaFree(d_W);
    cudaFree(devInfo);
    cudaFree(d_work);
    cudaFree(d_V);
    cudaFree(d_DV);
    cudaFree(d_AV);
    cusolverDnDestroy(cusolverH);
    cublasDestroy(cublasH);
    cudaStreamDestroy(stream);

    return 0;
}
# nvcc -o t246 t246.cu -lcublas -lcusolver
# compute-sanitizer ./t246
========= COMPUTE-SANITIZER
Original matrix A:
(4, 0) (1, -1)
(1, 1) (3, 0)
Eigenvalues (diagonal of D):
2 5
Eigenvectors (columns of V):
(-0.408248, 0.408248) (-0.57735, 0.57735)
(0.816497, 0) (-0.57735, 0)
A * V:
(-0.816497, 0.816497) (-2.88675, 2.88675)
(1.63299, 0) (-2.88675, 0)
V * D:
(-0.816497, 0.816497) (-2.88675, 2.88675)
(1.63299, 0) (-2.88675, 0)
Verification successful: A * V = V * D
========= ERROR SUMMARY: 0 errors
#

tnwilly · August 7, 2024, 3:20am

@Robert_Crovella,

I got it, even the input of cublas expect col-majored storage.

Another question, do you recommend to convert all of my matrixes into row-majored?

Also noted other comments.

Thanks a lot!

tnwilly · August 7, 2024, 6:00am

@Robert_Crovella,

I tried with another matrix A.
Can I ask if this is a numerical error?

Original matrix A:
(10, 0) (1, 1) 
(1, 1) (3, 0) 

Eigenvalues (diagonal of D):
2.72508 10.2749 

Eigenvectors (columns of V):
(-0.134933, -0.134933) (-0.694113, -0.694113) 
(0.981624, 0) (-0.190824, 0) 

A * V:
(-0.367703, -0.367703) (-7.13196, -7.13196) 
(2.94487, -0.269865) (-0.572471, -1.38823) 

V * D:
(-0.367703, -0.367703) (-7.13196, -7.13196) 
(2.67501, 0) (-1.9607, 0) 
Verification failed: A * V != V * D

Robert_Crovella · August 7, 2024, 2:34pm

The Zheevd/Cheevd (cusolver) functions expect and require an A matrix that is hermitian, from here:

This function computes eigenvalues and eigenvectors of a symmetric (Hermitian) n×n matrix A .

The requirement is symmetric for real matrices, and hermitian for complex matrices.

Your original matrix in this thread is hermitian:

This one is not:

You should not attempt to use Zheevd/Cheevd on non-hermitian matrices, and you should expect undefined behavior if you do it anyway.

system · August 21, 2024, 2:35pm

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
Cusolver SVD for general complex matrix GPU-Accelerated Libraries cusolver	4	2124	October 12, 2021
Stream capture of cublas gemm CUDA Programming and Performance	8	695	March 31, 2025
cuBLAS call from kernel in CUDA 10.0 GPU-Accelerated Libraries	9	4849	April 7, 2021
Cuda-memcheck error in cusolverDnCgesvdjBatched function CUDA-MEMCHECK	1	924	September 27, 2021
How to run eigen decomposition of multiple Hermitian matrices in parallel? GPU-Accelerated Libraries cusolver	0	1138	March 4, 2022
cublasZgemm() gives false result for large data and potential bug GPU-Accelerated Libraries	6	1150	October 12, 2021
CUBLAS problem CUDA Programming and Performance	32	19292	March 28, 2012
cusparse Incomplete Cholesky CG - incorrect results GPU-Accelerated Libraries	9	3780	June 16, 2013
Strange behaviour setMatrix 3.2 v 3.1 CUDA Programming and Performance	12	12456	October 22, 2010
Eigen decomposition of Hermitian Matrix using CuSolver does not match the result with matlab GPU-Accelerated Libraries cusolver	8	1728	March 1, 2022

cuBLAS EVD function not satisfy AV = VD

Related topics