dgetrf gets different results between cuBLAS/cuSOLVER and LAPACK?

I am testing out some scenarios where the function dgetrf is returned differently when used with cuBLAS/cuSOLVER compared to writing for LAPACK. For example, I am looking at LU factorization of the following matrix:

[2.0 4.0 1.0 -3.0 0.0]
[-1.0 -2.0 2.0 4.0 0.0]
[4.0 2.0 -3.0 5.0 0.0]
[5.0 -4.0 -3.0 1.0 0.0]
[0.0 0.0 0.0 0.0 0.0]

I test out getting the results using dgetrf functions from cuBLAS/cuSOLVER as followed (the code is far from pretty but it should do the job…)

#include <cblas.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>

int main(int argc, char** argv){

    const int matrixSize = 5;

    int i, j;
    double arrA[matrixSize][matrixSize] = {
        {2.0, 4.0, 1.0, -3.0, 0.0},
        {-1.0, -2.0, 2.0, 4.0, 0.0},
        {4.0, 2.0, -3.0, 5.0, 0.0},
        {5.0, -4.0, -3.0, 1.0, 0.0},
        {0.0, 0.0, 0.0, 0.0, 0.0}
    };

    double *arrADev, *workArray;
    double **matrixArray;
    int *pivotArray;
    int *infoArray;
    double flat[matrixSize*matrixSize] = {0};
    cublasHandle_t cublasHandle;
    cublasStatus_t cublasStatus;
    cudaError_t error;

    cudaError cudaStatus;
    cusolverStatus_t cusolverStatus;
    cusolverDnHandle_t cusolverHandle;

    double *matrices[2];

error = cudaMalloc(&arrADev,  sizeof(double) * matrixSize*matrixSize);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

    error = cudaMalloc(&matrixArray,  sizeof(double*) * 2);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

    error = cudaMalloc(&pivotArray,  sizeof(int) * matrixSize*matrixSize);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

    error = cudaMalloc(&infoArray,  sizeof(int) * matrixSize*matrixSize);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

    cublasStatus = cublasCreate(&cublasHandle);
    if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);

    //maps matrix to flat vector
    for(i=0; i<matrixSize; i++){
        for(j=0; j<matrixSize; j++){
            flat[i+j*matrixSize] = arrA[i][j];
        }
    }

    //copy matrix A to device
    cublasStatus = cublasSetMatrix(matrixSize, matrixSize, sizeof(double), flat, matrixSize, arrADev, matrixSize);
    if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);

    //save matrix address
    matrices[0] = arrADev;

    //copy matrices references to device
    error = cudaMemcpy(matrixArray, matrices, sizeof(double*)*1, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

    int Lwork;
    // calculate buffer size for cuSOLVER LU factorization
    cusolverStatus = cusolverDnDgetrf_bufferSize(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, &Lwork);
    cudaStatus = cudaMalloc((void**)&workArray, Lwork*sizeof(double));

    // cuBLAS LU factorization
    cublasStatus = cublasDgetrfBatched(cublasHandle, matrixSize, matrixArray, matrixSize, pivotArray, infoArray, 1);
    if (cublasStatus == CUBLAS_STATUS_SUCCESS)
        printf("cuBLAS DGETRF SUCCESSFUL! \n");
    else
        printf("cuBLAS DGETRF UNSUCCESSFUL! \n");

    // cuSOLVER LU factorization
    cusolverStatus = cusolverDnCreate(&cusolverHandle);
    cusolverStatus = cusolverDnDgetrf(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, workArray, pivotArray, infoArray);
    if (cusolverStatus == CUSOLVER_STATUS_SUCCESS)
        printf("cuSOLVER DGETRF SUCCESSFUL! \n");
    else
        printf("cuSOLVER DGETRF UNSUCCESSFUL! \n");

    return 0;
}

The output from the code above is

cuBLAS DGETRF SUCCESSFUL!
cuSOLVER DGETRF SUCCESSFUL!

Then when I try to do the same with LAPACK:

#include <iostream>
#include <vector>

using namespace std;

extern "C" void dgetrf_(int* dim1, int* dim2, double* a, int* lda, int* ipiv, int* info);
extern "C" void dgetrs_(char *TRANS, int *N, int *NRHS, double *A, int *LDA, int *IPIV, double *B, int *LDB, int *INFO );

int main()
{
   char trans = 'N';
   int dim = 5;
   int LDA = dim;
   int info;

   vector<double> a,b;

   a.push_back(2.0); a.push_back(4.0); a.push_back(1.0); a.push_back(-3.0); a.push_back(0.0);
   a.push_back(-1.0); a.push_back(-2.0); a.push_back(2.0); a.push_back(4.0); a.push_back(0.0);
   a.push_back(4.0); a.push_back(2.0); a.push_back(-3.0); a.push_back(5.0); a.push_back(0.0);
   a.push_back(5.0); a.push_back(-4.0); a.push_back(-3.0); a.push_back(1.0); a.push_back(0.0);
   a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0);

   int ipiv[5];
   dgetrf_(&dim, &dim, &*a.begin(), &LDA, ipiv, &info);
   if (info == 0)
       printf("dgetrf successful\n");
   else
       printf("dgetrf unsuccessful\n");

   return 0;
}

The output I get is

dgetrf unsuccessful

I understand that they are two different libraries, but is this behaviour expected?

Mind telling us the returned info value, or are we expected to all run the code ourselves?

@tera I got error code 5 for the LAPACK code. The cuBLAS/cuSOLVER are successful. Let me know if that helps.

https://stackoverflow.com/questions/58800010/lu-factorization-receives-different-results-between-lapack-and-cublas-cusolver

Thank you very much!