cuSolver works improperly with cusolverDnSsyevd

Hi, I am trying to use cusolverDnSsyevd to compute the eigenvalues of matrices. The code runs fine with the example input (array A below). However, if I change the input slightly, e.g. to eye(3) or ones(3), most of the output becomes NaN.

My code is as follows. Could somebody help me check where the problem is? Thanks!

#include <cassert>
#include <cuda_runtime.h>
#include <cusolverDn.h>

// Minimal stand-in for the CudaCheck helper (not shown in the original post)
// so the listing compiles on its own.
#define CudaCheck(call) assert((call) == cudaSuccess)

__host__ void Eig(cusolverDnHandle_t cusolverH,
                    const unsigned int dim,
                    float *Sigma, float *working){

    cusolverStatus_t cusolver_status;

    // Workspace layout inside `working`:
    //   working[0 .. dim)  eigenvalues (d_W)
    //   working[dim]       devInfo flag, reinterpreted as int (d_Info)
    //   working[32 ..]     scratch space for syevd (d_work)
    // dim < 31 keeps the first two regions below index 32.
    assert(dim < 31);

    float *d_W = &working[0];
    int *d_Info = (int *)&working[dim];
    float *d_work = &working[32];
    int h_lwork = 0;

    // Query the scratch size syevd needs for this problem.
    cusolver_status = cusolverDnSsyevd_bufferSize(
            cusolverH,
            CUSOLVER_EIG_MODE_VECTOR,
            CUBLAS_FILL_MODE_LOWER,
            dim,
            Sigma,
            dim,
            d_W,
            &h_lwork);
    assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);

    // Compute eigenvalues (into d_W) and eigenvectors (overwriting Sigma).
    cusolver_status = cusolverDnSsyevd(
            cusolverH,
            CUSOLVER_EIG_MODE_VECTOR,
            CUBLAS_FILL_MODE_LOWER,
            dim,
            Sigma,
            dim,
            d_W,
            d_work,
            h_lwork,
            d_Info);
    // syevd executes asynchronously; synchronize before reading d_Info.
    cudaDeviceSynchronize();
    assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);

    int info_gpu = 0;
    CudaCheck(cudaMemcpy(&info_gpu, d_Info, sizeof(int), cudaMemcpyDeviceToHost));
    assert(info_gpu==0);
}

float* mallocOnGpu(const size_t N) {
	float* device_A;
	size_t ABytes = N * sizeof(float);
	CudaCheck(cudaMalloc(&device_A, ABytes));
	return device_A;
}

int main(){

        cusolverStatus_t cusolver_status;
        cusolverDnHandle_t cusolverH;
        cusolver_status = cusolverDnCreate(&cusolverH);
        assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

        // dim must be a compile-time constant for the array initializers below.
        const int dim = 3;

        float *d_mat = mallocOnGpu(dim*dim);
        // Generous workspace; syevd's lwork for a 3x3 matrix is far smaller.
        float *d_d = mallocOnGpu(dim*dim*20096);

        // Symmetric 3x3 test matrix (column-major) with eigenvalues 2, 3, 4.
        float A[dim*dim] = { 3.5, 0.5, 0, 0.5, 3.5, 0, 0, 0, 2.0};

        CudaCheck(cudaMemcpy(
                &d_mat[0], A,
                dim*dim*sizeof(float),
                cudaMemcpyHostToDevice));

        Eig(cusolverH, dim, d_mat, d_d);

        cudaDeviceSynchronize();

/* My result:

d_mat[0] = 0
d_mat[1] = 0
d_mat[2] = 1
d_mat[3] = -0.707107
d_mat[4] = 0.707107
d_mat[5] = 0
d_mat[6] = -0.707107
d_mat[7] = -0.707107
d_mat[8] = 0
d_d[0] = 2
d_d[1] = 3
d_d[2] = 4
*/

        // B = 2 * eye(3): the diagonal input that triggers the NaNs.
        float B[dim*dim] = { 2, 0, 0, 0, 2, 0, 0, 0, 2};

        CudaCheck(cudaMemcpy(
                &d_mat[0], B,
                dim*dim*sizeof(float),
                cudaMemcpyHostToDevice));

        Eig(cusolverH, dim, d_mat, d_d);

        cudaDeviceSynchronize();

/* My result:
d_mat[0] = 1
d_mat[1] = nan
d_mat[2] = nan
d_mat[3] = -nan
d_mat[4] = nan
d_mat[5] = nan
d_mat[6] = -nan
d_mat[7] = nan
d_mat[8] = nan
d_d[0] = 2
d_d[1] = -nan
d_d[2] = -nan
*/

        CudaCheck(cudaFree(d_mat));
        CudaCheck(cudaFree(d_d));
        cusolverDnDestroy(cusolverH);
        return 0;
}

Your code doesn't define or allocate d_Sigma anywhere, so it appears to be quite broken. It seems evident that this is not the code you are actually testing with.

It should work now; I have updated the code above and posted my test results. Please review.

This appears to be a defect in the CUDA 9.2 cuSolver implementation of this function. It applies to the case of a diagonal input matrix. It should be fixed in a future CUDA release. Thanks for reporting.
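In the meantime, one possible workaround is to route affected inputs through the Jacobi-based eigensolver cusolverDnSsyevj (introduced, if I remember correctly, in the CUDA 9 toolkit), which takes a different code path than syevd. Below is a minimal sketch, not exhaustively tested; EigJacobi is a hypothetical helper with the same contract as the Eig function above (eigenvectors overwrite Sigma, eigenvalues go to the caller-provided device array d_W), and for simplicity it allocates its own scratch buffers instead of carving them out of a shared workspace:

#include <cassert>
#include <cuda_runtime.h>
#include <cusolverDn.h>

// Hypothetical fallback using the Jacobi eigensolver (syevj), which does not
// appear to be affected by the diagonal-matrix defect in syevd.
__host__ void EigJacobi(cusolverDnHandle_t cusolverH, int dim,
                        float *Sigma, float *d_W) {
    // Jacobi solver parameters (default tolerance and sweep count).
    syevjInfo_t params = NULL;
    assert(cusolverDnCreateSyevjInfo(&params) == CUSOLVER_STATUS_SUCCESS);

    int *d_Info = NULL;  // convergence / error flag
    assert(cudaMalloc(&d_Info, sizeof(int)) == cudaSuccess);

    int lwork = 0;
    assert(cusolverDnSsyevj_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR,
            CUBLAS_FILL_MODE_LOWER, dim, Sigma, dim, d_W, &lwork,
            params) == CUSOLVER_STATUS_SUCCESS);

    float *d_work = NULL;
    assert(cudaMalloc(&d_work, lwork * sizeof(float)) == cudaSuccess);

    // Eigenvectors overwrite Sigma, eigenvalues land in d_W.
    assert(cusolverDnSsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR,
            CUBLAS_FILL_MODE_LOWER, dim, Sigma, dim, d_W, d_work, lwork,
            d_Info, params) == CUSOLVER_STATUS_SUCCESS);
    assert(cudaDeviceSynchronize() == cudaSuccess);

    int info = 0;
    assert(cudaMemcpy(&info, d_Info, sizeof(int),
            cudaMemcpyDeviceToHost) == cudaSuccess);
    assert(info == 0);  // 0 = converged; >0 = did not converge

    cudaFree(d_work);
    cudaFree(d_Info);
    cusolverDnDestroySyevjInfo(params);
}

For B = 2*eye(3) the correct decomposition is eigenvalues {2, 2, 2} with identity eigenvectors, so that is what EigJacobi(cusolverH, dim, d_mat, d_d) should produce in place of the failing syevd call.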

This should be fixed in CUDA 10.0 and beyond.
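If the same source has to build against both the affected and the fixed toolkits, one option (my suggestion, not an official recommendation) is to branch on the compile-time CUDART_VERSION macro from cuda_runtime_api.h, which is 9020 for CUDA 9.2 and 10000 for CUDA 10.0; EigJacobi is the hypothetical fallback sketched above:

#include <cuda_runtime_api.h>  // defines CUDART_VERSION

// Dispatch wrapper: route around the syevd defect on pre-10.0 toolkits.
__host__ void EigSafe(cusolverDnHandle_t cusolverH, int dim,
                      float *Sigma, float *working) {
#if CUDART_VERSION < 10000
    // CUDA 9.x: syevd can emit NaNs for diagonal inputs; use the Jacobi path.
    // Eigenvalues go to working[0..dim), matching Eig's workspace contract.
    EigJacobi(cusolverH, dim, Sigma, &working[0]);
#else
    // CUDA 10.0+: syevd handles diagonal matrices correctly.
    Eig(cusolverH, dim, Sigma, working);
#endif
}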