# How to run eigen decomposition of multiple Hermitian matrices in parallel?

I need to do eigen decomposition for a number of small matrices in parallel. Therefore, I need to choose the evd function running in batch. I do not see any function for `cusolverDnCheevd` which is running in Batch. Is there any?

I see a function `cusolverDnCheevjBatched` which runs in batch. It seems it uses `Jacobi algorithm `. Unfortunately, after implementing It is not giving the same result as `cusolverDnCheevd`. Is there anything am I missing?

I followed the example from here,
CUDALibrarySamples/cusolver_syevjBatched_example.cu at master · NVIDIA/CUDALibrarySamples · GitHub

For the same input I modified this example and the code is below. I did not change any parameter for Jacobi.

``````#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cuda_runtime.h>
#include <cusolverDn.h>

#include "cusolver_utils.h"

int N = 16;
void BuildMatrix(cuComplex* input);

int main(int argc, char* argv[]) {
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
syevjInfo_t syevj_params = NULL;

const int m = 4;
const int lda = m;
const int batchSize = 2; // Number of Inputs/Matrices

cuComplex c;
c.x = 0;
c.y = 0;
std::vector<cuComplex> A(lda * m * batchSize, c); /* V = [A0 ; A1] */
std::vector<cuComplex> V(lda * m * batchSize, c); /* V = [V0 ; V1] */
std::vector<float> W(m * batchSize, 0);       /* W = [W0 ; W1] */
std::vector<int> info(batchSize, 0);           /* info = [info0 ; info1] */

cuComplex* d_A = nullptr;    /* lda-by-m-by-batchSize */
float* d_W = nullptr;    /* m-by-batchSize */
int* d_info = nullptr;    /* batchSize */
cuComplex* d_work = nullptr; /* device workspace for syevjBatched */
int lwork = 0;            /* size of workspace */

/* configuration of syevj  */
const double tol = 1.e-7;
const int max_sweeps = 15;
const int sort_eig = 1;                                  /* don't sort eigenvalues */
const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; /* compute eigenvectors */
const cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;

cuComplex* A0 = A.data();
cuComplex* A1 = A.data() + lda * m;
BuildMatrix(A0);
BuildMatrix(A1);

std::printf("A0 = (matlab base-1)\n");
print_matrix(m, m, A0, lda);
std::printf("=====\n");

std::printf("A1 = (matlab base-1)\n");
print_matrix(m, m, A1, lda);
std::printf("=====\n");

/* step 1: create cusolver handle, bind a stream */
CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));

CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));

///* step 2: configuration of syevj */
CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params));

/* default value of tolerance is machine zero */
CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(syevj_params, tol));

/* default value of max. sweeps is 100 */
CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, max_sweeps));

/* disable sorting */
CUSOLVER_CHECK(cusolverDnXsyevjSetSortEig(syevj_params, sort_eig));

/* step 3: copy A to device */
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_A), sizeof(cuComplex) * A.size()));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_W), sizeof(float) * W.size()));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int) * info.size()));

CUDA_CHECK(
cudaMemcpyAsync(d_A, A.data(), sizeof(cuComplex) * A.size(), cudaMemcpyHostToDevice, stream));
/* step 4: query working space of syevj */
CUSOLVER_CHECK(cusolverDnCheevjBatched_bufferSize(cusolverH, jobz, uplo, m, d_A, lda, d_W,
&lwork, syevj_params, batchSize));

CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuComplex) * lwork));

/* step 5: compute eigen-pair   */
CUSOLVER_CHECK(cusolverDnCheevjBatched(cusolverH, jobz, uplo, m, d_A, lda, d_W, d_work, lwork,
d_info, syevj_params, batchSize));

CUDA_CHECK(
cudaMemcpyAsync(V.data(), d_A, sizeof(cuComplex) * A.size(), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(
cudaMemcpyAsync(W.data(), d_W, sizeof(float) * W.size(), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(info.data(), d_info, sizeof(int) * info.size(),
cudaMemcpyDeviceToHost, stream));

CUDA_CHECK(cudaStreamSynchronize(stream));

for (int i = 0; i < batchSize; i++) {
if (0 == info[i]) {
std::printf("matrix %d: syevj converges \n", i);
}
else if (0 > info[i]) {
/* only info shows if some input parameter is wrong.
* If so, the error is CUSOLVER_STATUS_INVALID_VALUE.
*/
std::printf("Error: %d-th parameter is wrong \n", -info[i]);
exit(1);
}
else { /* info = m+1 */
/* if info[i] is not zero, Jacobi method does not converge at i-th matrix. */
std::printf("WARNING: matrix %d, info = %d : sygvj does not converge \n", i, info[i]);
}
}

/* Step 6: show eigenvalues and eigenvectors */
float* W0 = W.data();
float* W1 = W.data() + m;

std::printf("==== \n");
for (int i = 0; i < m; i++) {
std::printf("W0[%d] = %f\n", i, W0[i]);
}
std::printf("==== \n");
for (int i = 0; i < m; i++) {
std::printf("W1[%d] = %f\n", i, W1[i]);
}
std::printf("==== \n");

cuComplex* V0 = V.data();
cuComplex* V1 = V.data() + lda * m;

std::printf("V0 = (matlab base-1)\n");
print_matrix(m, m, V0, lda);
std::printf("V1 = (matlab base-1)\n");
print_matrix(m, m, V1, lda);

/* free resources */
CUDA_CHECK(cudaFree(d_A));
CUDA_CHECK(cudaFree(d_W));
CUDA_CHECK(cudaFree(d_info));
CUDA_CHECK(cudaFree(d_work));

CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params));

CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));

CUDA_CHECK(cudaStreamDestroy(stream));

return EXIT_SUCCESS;
}

//0.5560 + 0.0000i - 0.4864 + 0.0548i   0.8592 + 0.2101i - 1.5374 - 0.2069i
//- 0.4864 - 0.0548i   0.4317 + 0.0000i - 0.7318 - 0.2698i   1.3255 + 0.3344i
//0.8592 - 0.2101i - 0.7318 + 0.2698i   1.4099 + 0.0000i - 2.4578 + 0.2609i
//- 1.5374 + 0.2069i   1.3255 - 0.3344i - 2.4578 - 0.2609i   4.3333 + 0.0000i
void BuildMatrix(cuComplex* input)
{
std::vector<float> realVector = { 0.5560, -0.4864, 0.8592, -1.5374, -0.4864, 0.4317, -0.7318, 1.3255,
0.8592, -0.7318, 1.4099, -2.4578, -1.5374, 1.3255, -2.4578, 4.3333 };
std::vector<float> imagVector = { 0, -0.0548, -0.2101, 0.2069, 0.0548, 0.0000, 0.2698, -0.3344,
0.2101, -0.2698, 0, -0.2609, -0.2069, 0.3344, 0.2609, 0 };

for (int i = 0; i < N; i++)
{
input[i].x = realVector.at(i) * std::pow(10, 11);
input[i].y = imagVector.at(i) * std::pow(10, 11);
}
}

``````

Is there any other way to solve this?
mfatica
mnicely