cuBLAS EVD function not satisfy AV = VD

Robert_Crovella · August 6, 2024, 4:18pm

cublas and cusolver both expect column-major storage.
we can observe that the produced eigenvalues between the two cases are the same
I don’t recommend this construct the way you are using it:
```
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
```
see here for discussion.
I recommend rigorous error checking any time you are having trouble with a CUDA code
I’m not really sure what you are trying to do here:
```
(cuDoubleComplex*)&W[i],
```
that is not in any way proper for the computation of VD. You’re taking the two eigenvalues and treating them as a single complex number.
When you specified CUSOLVER_EIG_MODE_VECTOR, you declared to cusolver that it should overwrite d_A with the eigenvectors. Therefore your computation of AV is not correct.

Here is an example that has those/various issues addressed, and seems to produce a valid comparison:

# cat t246.cu
#include <iostream>
#include <vector>
#include <cuComplex.h>
#include <cusolverDn.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

// Helper function to print a complex matrix
void printMatrix(const cuDoubleComplex* A, int rows, int cols) {
    for (int i = 0; i < cols; ++i) {
        for (int j = 0; j < rows; ++j) {
            std::cout << "(" << cuCreal(A[j * cols + i]) << ", " << cuCimag(A[j * cols + i]) << ") ";
        }
        std::cout << std::endl;
    }
}

// Helper function to check if two complex numbers are approximately equal
bool isApproxEqual(cuDoubleComplex a, cuDoubleComplex b, double tol = 1e-6) {
    return (fabs(cuCreal(a) - cuCreal(b)) < tol) && (fabs(cuCimag(a) - cuCimag(b)) < tol);
}

int main() {
    const int N = 2;
    // Define a 2x2 complex covariance matrix
    cuDoubleComplex A[N * N] = {make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(1.0, 1.0),
                                make_cuDoubleComplex(1.0, -1.0), make_cuDoubleComplex(3.0, 0.0)};

    std::cout << "Original matrix A:" << std::endl;
    printMatrix(A, N, N);

    // cuBLAS and cuSOLVER setup
    cusolverDnHandle_t cusolverH = nullptr;
    cublasHandle_t cublasH = nullptr;
    cudaStream_t stream = nullptr;
    cusolverStatus_t csstat = cusolverDnCreate(&cusolverH);
    if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;
    cublasStatus_t cbstat = cublasCreate(&cublasH);
    if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;
  //  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cudaStreamCreate(&stream);
    csstat = cusolverDnSetStream(cusolverH, stream);
    if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;
    cbstat = cublasSetStream(cublasH, stream);
    if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;

    // Device memory for matrix A, eigenvalues, and workspace
    cuDoubleComplex* d_A = nullptr;
    double* d_W = nullptr;
    int* devInfo = nullptr;
    int lwork = 0;
    cuDoubleComplex* d_work = nullptr;

    cudaMalloc((void**)&d_A, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_W, sizeof(double) * N);
    cudaMalloc((void**)&devInfo, sizeof(int));

    cudaMemcpy(d_A, A, sizeof(cuDoubleComplex) * N * N, cudaMemcpyHostToDevice);

    // Query working space
    csstat = cusolverDnZheevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, d_A, N, d_W, &lwork);
    if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;
    cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork);
    // Compute EVD
    csstat = cusolverDnZheevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, d_A, N, d_W, d_work, lwork, devInfo);
    if (csstat != CUSOLVER_STATUS_SUCCESS) std::cout << __LINE__ << " cusolver error: " << (int)csstat << std::endl;

    // Copy results back to host
    cuDoubleComplex V[N * N];
    double W[N];
    cudaMemcpy(V, d_A, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);
    cudaMemcpy(W, d_W, sizeof(double) * N, cudaMemcpyDeviceToHost);
    int hinfo;
    cudaMemcpy(&hinfo, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
    if (hinfo) std::cout << "devInfo: " << hinfo << std::endl;
    std::cout << "Eigenvalues (diagonal of D):" << std::endl;
    for (int i = 0; i < N; ++i) {
        std::cout << W[i] << " ";
    }
    std::cout << std::endl;

    std::cout << "Eigenvectors (columns of V):" << std::endl;
    printMatrix(V, N, N);

    // Verify A * V = V * D using cuBLAS
    cuDoubleComplex* d_V = nullptr;
    cuDoubleComplex* d_D = nullptr;
    cuDoubleComplex* d_DV = nullptr;
    cuDoubleComplex* d_AV = nullptr;

    cudaMalloc((void**)&d_V, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_D, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_DV, sizeof(cuDoubleComplex) * N * N);
    cudaMalloc((void**)&d_AV, sizeof(cuDoubleComplex) * N * N);

    cudaMemset(d_D, 0, N*N*sizeof(cuDoubleComplex));
    cuDoubleComplex ev1 = make_cuDoubleComplex(W[0], 0.0);
    cuDoubleComplex ev2 = make_cuDoubleComplex(W[1], 0.0);
    cudaMemcpy(d_D, &ev1, sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
    cudaMemcpy(d_D+3, &ev2, sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);

    cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
    cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0);

    // Calculate V * D  d_A now contains the eigenvectors, it is effectively V
    cbstat = cublasZgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_D, N, &beta, d_DV, N);
    if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;

    cuDoubleComplex* d_nA = nullptr;
    cudaMalloc(&d_nA, N*N*sizeof(cuDoubleComplex));
    cudaMemcpy(d_nA, A, N*N*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
    // Calculate A * V
    cbstat = cublasZgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_nA, N, d_A, N, &beta, d_AV, N);
    if (cbstat != CUBLAS_STATUS_SUCCESS) std::cout << __LINE__ << " cublas error: " << (int)cbstat << std::endl;

    // Copy results back to host
    cuDoubleComplex AV[N * N];
    cuDoubleComplex DV[N * N];
    cudaMemcpy(AV, d_AV, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);
    cudaMemcpy(DV, d_DV, sizeof(cuDoubleComplex) * N * N, cudaMemcpyDeviceToHost);

    std::cout << "A * V:" << std::endl;
    printMatrix(AV, N, N);

    std::cout << "V * D:" << std::endl;
    printMatrix(DV, N, N);

    // Check if A * V is approximately equal to V * D
    bool equal = true;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            if (!isApproxEqual(AV[i * N + j], DV[i * N + j])) {
                equal = false;
                break;
            }
        }
    }

    if (equal) {
        std::cout << "Verification successful: A * V = V * D" << std::endl;
    } else {
        std::cout << "Verification failed: A * V != V * D" << std::endl;
    }

    // Clean up
    cudaFree(d_A);
    cudaFree(d_W);
    cudaFree(devInfo);
    cudaFree(d_work);
    cudaFree(d_V);
    cudaFree(d_DV);
    cudaFree(d_AV);
    cusolverDnDestroy(cusolverH);
    cublasDestroy(cublasH);
    cudaStreamDestroy(stream);

    return 0;
}
# nvcc -o t246 t246.cu -lcublas -lcusolver
# compute-sanitizer ./t246
========= COMPUTE-SANITIZER
Original matrix A:
(4, 0) (1, -1)
(1, 1) (3, 0)
Eigenvalues (diagonal of D):
2 5
Eigenvectors (columns of V):
(-0.408248, 0.408248) (-0.57735, 0.57735)
(0.816497, 0) (-0.57735, 0)
A * V:
(-0.816497, 0.816497) (-2.88675, 2.88675)
(1.63299, 0) (-2.88675, 0)
V * D:
(-0.816497, 0.816497) (-2.88675, 2.88675)
(1.63299, 0) (-2.88675, 0)
Verification successful: A * V = V * D
========= ERROR SUMMARY: 0 errors
#

Topic		Replies	Views
Cusolver SVD for general complex matrix GPU-Accelerated Libraries cusolver	4	2102	October 12, 2021
Stream capture of cublas gemm CUDA Programming and Performance	8	674	March 31, 2025
cuBLAS call from kernel in CUDA 10.0 GPU-Accelerated Libraries	9	4838	April 7, 2021
Cuda-memcheck error in cusolverDnCgesvdjBatched function CUDA-MEMCHECK	1	924	September 27, 2021
How to run eigen decomposition of multiple Hermitian matrices in parallel? GPU-Accelerated Libraries cusolver	0	1136	March 4, 2022
cublasZgemm() gives false result for large data and potential bug GPU-Accelerated Libraries	6	1149	October 12, 2021
CUBLAS problem CUDA Programming and Performance	32	19289	March 28, 2012
cusparse Incomplete Cholesky CG - incorrect results GPU-Accelerated Libraries	9	3775	June 16, 2013
Strange behaviour setMatrix 3.2 v 3.1 CUDA Programming and Performance	12	12456	October 22, 2010
Eigen decomposition of Hermitian Matrix using CuSolver does not match the result with matlab GPU-Accelerated Libraries cusolver	8	1716	March 1, 2022

cuBLAS EVD function not satisfy AV = VD

Related topics