I am following the example of eigen decomposition from here, CUDALibrarySamples/cusolver_syevd_example.cu at master · NVIDIA/CUDALibrarySamples · GitHub
I need to do it for Hermatian complex matrix. The problem is the eigen vector is not matching at all with the result with Matlab result.
Does anyone have any idea about it why this mismatch is happening?
I have also tried cusolverdn svd method to get eigen values and vector that is giving another result.
My code is here for convenience,
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include "cusolver_utils.h"
int N = 16;
void BuildMatrix(cuComplex* input);
void main()
{
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
printf("*******************\n");
cuComplex* h_idata = (cuComplex*)malloc(sizeof(cuComplex) * N);
cuComplex* h_eigenVector = (cuComplex*)malloc(sizeof(cuComplex) * N); // eigen vector
float* h_eigenValue = (float*)malloc(sizeof(float) * 4); // eigen Value
BuildMatrix(h_idata);
int count = 0;
for (int i = 0; i < N / 4; i++)
{
for (int j = 0; j < 4; j++)
{
printf("%f + %f\t", h_idata[count].x, h_idata[count].y);
count++;
}
printf("\n");
}
printf("\n*****************\n");
/* step 1: create cusolver handle, bind a stream */
CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));
CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));
// step 2: reserve memory in cuda and copy input data from host to device
cuComplex* d_idata;
float* d_eigenValue = nullptr;
int* d_info = nullptr;
CUDA_CHECK(cudaMalloc((void**)&d_idata, N * sizeof(cuComplex)));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_eigenValue), N * sizeof(float)));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
CUDA_CHECK(cudaMemcpyAsync(d_idata, h_idata, N * sizeof(cuComplex), cudaMemcpyHostToDevice, stream));
// step 3: query working space of syevd
cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvalues and eigenvectors.
cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
int lwork = 0; /* size of workspace */
cuComplex* d_work = nullptr; /* device workspace*/
const int m = 4;
const int lda = m;
cusolverDnCheevd_bufferSize(cusolverH, jobz, uplo, m, d_idata, lda, d_eigenValue, &lwork);
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuComplex) * lwork));
// step 4: compute spectrum
cusolverDnCheevd(cusolverH, jobz, uplo, m, d_idata, lda, d_eigenValue, d_work, lwork, d_info);
CUDA_CHECK(
cudaMemcpyAsync(h_eigenVector, d_idata, N * sizeof(cuComplex), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(
cudaMemcpyAsync(h_eigenValue, d_eigenValue, 4 * sizeof(double), cudaMemcpyDeviceToHost, stream));
int info = 0;
CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
std::printf("after syevd: info = %d\n", info);
if (0 > info)
{
std::printf("%d-th parameter is wrong \n", -info);
exit(1);
}
count = 0;
for (int i = 0; i < N / 4; i++)
{
for (int j = 0; j < 4; j++)
{
printf("%f + %f\t", h_eigenVector[count].x, h_eigenVector[count].y);
count++;
}
printf("\n");
}
printf("\n");
for (int i = 0; i < N / 4; i++)
{
std::cout << h_eigenValue[i] << std::endl;
}
printf("\n*****************\n");
/* free resources */
CUDA_CHECK(cudaFree(d_idata));
CUDA_CHECK(cudaFree(d_eigenValue));
CUDA_CHECK(cudaFree(d_info));
CUDA_CHECK(cudaFree(d_work));
CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));
CUDA_CHECK(cudaStreamDestroy(stream));
CUDA_CHECK(cudaDeviceReset());
}
//0.5560 + 0.0000i - 0.4864 + 0.0548i 0.8592 + 0.2101i - 1.5374 - 0.2069i
//- 0.4864 - 0.0548i 0.4317 + 0.0000i - 0.7318 - 0.2698i 1.3255 + 0.3344i
//0.8592 - 0.2101i - 0.7318 + 0.2698i 1.4099 + 0.0000i - 2.4578 + 0.2609i
//- 1.5374 + 0.2069i 1.3255 - 0.3344i - 2.4578 - 0.2609i 4.3333 + 0.0000i
void BuildMatrix(cuComplex* input)
{
std::vector<float> realVector = { 0.5560, -0.4864, 0.8592, -1.5374, -0.4864, 0.4317, -0.7318, 1.3255,
0.8592, -0.7318, 1.4099, -2.4578, -1.5374, 1.3255, -2.4578, 4.3333 };
std::vector<float> imagVector = { 0, -0.0548, -0.2101, 0.2069, 0.0548, 0.0000, 0.2698, -0.3344,
0.2101, -0.2698, 0, -0.2609, -0.2069, 0.3344, 0.2609, 0 };
for (int i = 0; i < N; i++)
{
input[i].x = realVector.at(i) * std::pow(10, 11);
input[i].y = imagVector.at(i) * std::pow(10, 11);
}
}
I checked with MATLAB, CUDA result does not match with MATLAB. I validated MATLAB result by doing A*EigenVector = EigenValue*EigenVector
. But the CUDA Eigen decomposition result does not validate this eigen decomposition equation.
I raised this issue in their git ( Eigen decomposition for Hermitian complex matrix does not match the result with matlab · Issue #58 · NVIDIA/CUDALibrarySamples · GitHub), but unfortunately did not get any answer yet.