How to run eigen decomposition of multiple Hermitian matrices in parallel?

This is a follow-up to my earlier post: Eigen decomposition of Hermitian Matrix using CuSolver does not match the result with matlab (GPU-Accelerated Libraries, NVIDIA Developer Forums).

I need to compute the eigendecomposition of a number of small Hermitian matrices in parallel, so I am looking for an EVD routine that runs in batch. I do not see a batched version of cusolverDnCheevd. Is there one?

I do see cusolverDnCheevjBatched, which runs in batch and appears to use the Jacobi algorithm. Unfortunately, after implementing it, it does not give the same result as cusolverDnCheevd. Is there anything I am missing?
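For reference, this is the non-batched path I compare against: a minimal sketch that runs cusolverDnCheevd on one m-by-m matrix, assuming cusolverH, jobz, uplo, m, lda, d_A, d_W, and d_info are set up as in the full listing below (error-check macros come from cusolver_utils.h):

int lwork = 0;
cuComplex* d_work = nullptr;

/* query workspace size for the dense Hermitian eigensolver */
CUSOLVER_CHECK(cusolverDnCheevd_bufferSize(cusolverH, jobz, uplo, m, d_A, lda, d_W, &lwork));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuComplex) * lwork));

/* eigenvalues are written to d_W in ascending order; with
 * jobz = CUSOLVER_EIG_MODE_VECTOR the eigenvectors overwrite d_A */
CUSOLVER_CHECK(cusolverDnCheevd(cusolverH, jobz, uplo, m, d_A, lda, d_W, d_work, lwork, d_info));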

I followed the example from here:
CUDALibrarySamples/cusolver_syevjBatched_example.cu at master · NVIDIA/CUDALibrarySamples · GitHub

I modified that example to use the same input; the code is below. I did not change any of the Jacobi parameters.

#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cuda_runtime.h>
#include <cusolverDn.h>

#include "cusolver_utils.h"

const int N = 16; /* number of entries in each 4x4 input matrix */
void BuildMatrix(cuComplex* input);

int main(int argc, char* argv[]) {
    cusolverDnHandle_t cusolverH = NULL;
    cudaStream_t stream = NULL;
    syevjInfo_t syevj_params = NULL;

    const int m = 4;
    const int lda = m;
    const int batchSize = 2; // Number of Inputs/Matrices

    cuComplex c;
    c.x = 0;
    c.y = 0;
    std::vector<cuComplex> A(lda * m * batchSize, c); /* A = [A0 ; A1] */
    std::vector<cuComplex> V(lda * m * batchSize, c); /* V = [V0 ; V1] */
    std::vector<float> W(m * batchSize, 0);       /* W = [W0 ; W1] */
    std::vector<int> info(batchSize, 0);           /* info = [info0 ; info1] */

    cuComplex* d_A = nullptr;    /* lda-by-m-by-batchSize */
    float* d_W = nullptr;    /* m-by-batchSize */
    int* d_info = nullptr;    /* batchSize */
    cuComplex* d_work = nullptr; /* device workspace for syevjBatched */
    int lwork = 0;            /* size of workspace */

    /* configuration of syevj  */
    const double tol = 1.e-7;
    const int max_sweeps = 15;
    const int sort_eig = 1;                                  /* sort eigenvalues in ascending order */
    const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; /* compute eigenvectors */
    const cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;

    cuComplex* A0 = A.data();
    cuComplex* A1 = A.data() + lda * m;
    BuildMatrix(A0);
    BuildMatrix(A1);

    std::printf("A0 = (matlab base-1)\n");
    print_matrix(m, m, A0, lda);
    std::printf("=====\n");

    std::printf("A1 = (matlab base-1)\n");
    print_matrix(m, m, A1, lda);
    std::printf("=====\n");

    /* step 1: create cusolver handle, bind a stream */
    CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));

    /* step 2: configuration of syevj */
    CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params));

    /* default value of tolerance is machine zero */
    CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(syevj_params, tol));

    /* default value of max. sweeps is 100 */
    CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, max_sweeps));

    /* enable sorting of eigenvalues */
    CUSOLVER_CHECK(cusolverDnXsyevjSetSortEig(syevj_params, sort_eig));

    /* step 3: copy A to device */
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_A), sizeof(cuComplex) * A.size()));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_W), sizeof(float) * W.size()));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int) * info.size()));

    CUDA_CHECK(
        cudaMemcpyAsync(d_A, A.data(), sizeof(cuComplex) * A.size(), cudaMemcpyHostToDevice, stream));
    /* step 4: query working space of syevj */
    CUSOLVER_CHECK(cusolverDnCheevjBatched_bufferSize(cusolverH, jobz, uplo, m, d_A, lda, d_W,
        &lwork, syevj_params, batchSize));

    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuComplex) * lwork));

    /* step 5: compute eigen-pair   */
    CUSOLVER_CHECK(cusolverDnCheevjBatched(cusolverH, jobz, uplo, m, d_A, lda, d_W, d_work, lwork,
        d_info, syevj_params, batchSize));

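    /* when jobz = CUSOLVER_EIG_MODE_VECTOR, the orthonormal eigenvectors overwrite d_A; copy them back as V */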
    CUDA_CHECK(
        cudaMemcpyAsync(V.data(), d_A, sizeof(cuComplex) * A.size(), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(
        cudaMemcpyAsync(W.data(), d_W, sizeof(float) * W.size(), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(info.data(), d_info, sizeof(int) * info.size(),
        cudaMemcpyDeviceToHost, stream));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    for (int i = 0; i < batchSize; i++) {
        if (0 == info[i]) {
            std::printf("matrix %d: syevj converges \n", i);
        }
        else if (0 > info[i]) {
            /* only info[0] shows if some input parameter is wrong.
             * If so, the error is CUSOLVER_STATUS_INVALID_VALUE.
             */
            std::printf("Error: %d-th parameter is wrong \n", -info[i]);
            exit(1);
        }
        else { /* info = m+1 */
              /* if info[i] is not zero, Jacobi method does not converge at i-th matrix. */
            std::printf("WARNING: matrix %d, info = %d : sygvj does not converge \n", i, info[i]);
        }
    }

    /* Step 6: show eigenvalues and eigenvectors */
    float* W0 = W.data();
    float* W1 = W.data() + m;

    std::printf("==== \n");
    for (int i = 0; i < m; i++) {
        std::printf("W0[%d] = %f\n", i, W0[i]);
    }
    std::printf("==== \n");
    for (int i = 0; i < m; i++) {
        std::printf("W1[%d] = %f\n", i, W1[i]);
    }
    std::printf("==== \n");

    cuComplex* V0 = V.data();
    cuComplex* V1 = V.data() + lda * m;

    std::printf("V0 = (matlab base-1)\n");
    print_matrix(m, m, V0, lda);
    std::printf("V1 = (matlab base-1)\n");
    print_matrix(m, m, V1, lda);

    /* free resources */
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_W));
    CUDA_CHECK(cudaFree(d_info));
    CUDA_CHECK(cudaFree(d_work));

    CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params));

    CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));

    CUDA_CHECK(cudaStreamDestroy(stream));

    CUDA_CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}

// Test matrix (MATLAB output; scaled by 1e11 in BuildMatrix):
//  0.5560 + 0.0000i  -0.4864 + 0.0548i   0.8592 + 0.2101i  -1.5374 - 0.2069i
// -0.4864 - 0.0548i   0.4317 + 0.0000i  -0.7318 - 0.2698i   1.3255 + 0.3344i
//  0.8592 - 0.2101i  -0.7318 + 0.2698i   1.4099 + 0.0000i  -2.4578 + 0.2609i
// -1.5374 + 0.2069i   1.3255 - 0.3344i  -2.4578 - 0.2609i   4.3333 + 0.0000i
void BuildMatrix(cuComplex* input)
{
    std::vector<float> realVector = { 0.5560, -0.4864, 0.8592, -1.5374, -0.4864, 0.4317, -0.7318, 1.3255,
                                    0.8592, -0.7318, 1.4099, -2.4578, -1.5374, 1.3255, -2.4578, 4.3333 };
    std::vector<float> imagVector = { 0, -0.0548, -0.2101, 0.2069, 0.0548, 0.0000, 0.2698, -0.3344,
                                     0.2101, -0.2698, 0, -0.2609, -0.2069, 0.3344, 0.2609, 0 };

    for (int i = 0; i < N; i++)
    {
        /* scale the MATLAB data by 1e11, as in the original problem */
        input[i].x = realVector.at(i) * 1.0e11f;
        input[i].y = imagVector.at(i) * 1.0e11f;
    }
}
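To rule out ordering differences when comparing the two solvers, I also sort each batch's eigenpairs on the host before comparing. The helper below is my own sketch, not part of the sample; note also that each eigenvector column is only determined up to a unit-modulus phase factor, so corresponding columns from the two solvers can legitimately differ by a phase even when both are correct.

#include <algorithm>
#include <numeric>

/* Sort the eigenpairs of one m-by-m problem by ascending eigenvalue.
 * W points to m eigenvalues; V is the m-by-m eigenvector matrix
 * (column-major, leading dimension lda). */
void SortEigenpairs(int m, int lda, float* W, cuComplex* V)
{
    std::vector<int> idx(m);
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(), [&](int a, int b) { return W[a] < W[b]; });

    std::vector<float> Wsorted(m);
    std::vector<cuComplex> Vsorted(lda * m);
    for (int j = 0; j < m; j++) {
        Wsorted[j] = W[idx[j]];
        for (int i = 0; i < m; i++) {
            Vsorted[i + j * lda] = V[i + idx[j] * lda];
        }
    }
    std::copy(Wsorted.begin(), Wsorted.end(), W);
    std::copy(Vsorted.begin(), Vsorted.end(), V);
}

After the device-to-host copies this would be called as SortEigenpairs(m, lda, W0, V0) and SortEigenpairs(m, lda, W1, V1).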

Is there any other way to solve this?
@mfatica @mnicely