Incorrect inversion for matrix with CUBLAS

I’m looking how to implement an inversion of a matrix in GPU .I try to do it with a single thread per block based on the Gauss-Jordan elimination algorithm (with cuda) but I need a matrix inversion which assigned multiple threads per block. so ,I am trying to run it Using Cublas . i found this code but when i execute it , many errors are display and i can’t understand these errors .noted that I am a novice student in CUDA.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#define cudacall(call)                                                                                                          \
    do                                                                                                                          \
    {                                                                                                                           \
        cudaError_t err = (call);                                                                                               \
        if(cudaSuccess != err)                                                                                                  \
        {                                                                                                                       \
            fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err));    \
            cudaDeviceReset();                                                                                                  \
            exit(EXIT_FAILURE);                                                                                                 \
        }                                                                                                                       \
    }                                                                                                                           \
    while (0)

#define cublascall(call)                                                                                        \
    do                                                                                                          \
    {                                                                                                           \
        cublasStatus_t status = (call);                                                                         \
        if(CUBLAS_STATUS_SUCCESS != status)                                                                     \
        {                                                                                                       \
            fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status);     \
            cudaDeviceReset();                                                                                  \
            exit(EXIT_FAILURE);                                                                                 \
        }                                                                                                       \
                                                                                                                \
    }                                                                                                           \
    while(0)


void invert_device(float* src_d, float* dst_d, int n)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    int batchSize = 1;

    int *P, *INFO;

    cudacall(cudaMalloc<int>(&P,17 * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));

    int lda = 17;

    float *A[] = { src_d };
    float** A_d;
    cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
    cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));

    cublascall(cublasSgetrfBatched(handle,17,A_d,lda,P,INFO,batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));

    if(INFOh == 17)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    float* C[] = { dst_d };
    float** C_d;
    cudacall(cudaMalloc<float*>(&C_d,sizeof(C)));
    cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice));

    cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,n,INFO,batchSize));

    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));

    if(INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}

void invert(float* src, float* dst, int n)
{
    float* src_d, *dst_d;

    cudacall(cudaMalloc<float>(&src_d,17 * 17 * sizeof(float)));
    cudacall(cudaMemcpy(src_d,src,17 * 17 * sizeof(float),cudaMemcpyHostToDevice));
    cudacall(cudaMalloc<float>(&dst_d,n * n * sizeof(float)));

    invert_device(src_d,dst_d,n);

    cudacall(cudaMemcpy(dst,dst_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));

    cudaFree(src_d), cudaFree(dst_d);
}

void test_invert()
{
    const int n = 3;

    //Random matrix with full pivots
/*    float full_pivots[n*n] = { 0.5, 3, 4,
                                1, 3, 10,
                                4 , 9, 16 };

    //Almost same as above matrix with first pivot zero
    float zero_pivot[n*n] = { 0, 3, 4,
                              1, 3, 10,
                              4 , 9, 16 };

    float zero_pivot_col_major[n*n] = { 0, 1, 4,
                                        3, 3, 9,
                                        4 , 10, 16 };
*/
    float zero_pivot_war[17*17] = {
                                        0,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                                        1,3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                                        4,9,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                                        0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
                                        0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
                                        0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
                                        0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
                                        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
                                        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 };
/*
    float another_zero_pivot[n*n] = { 0, 3, 4,
                                      1, 5, 6,
                                      9, 8, 2 };

    float another_full_pivot[n * n] = { 22, 3, 4,
                                        1, 5, 6,
                                        9, 8, 2 };

    float singular[n*n] = {1,2,3,
                           4,5,6,
                           7,8,9};
*/
    float result[n*n];

    //Select matrix by setting "a"
    float* a = zero_pivot_war;

    fprintf(stdout, "Input:\n\n");
    for(int i=0; i<n; i++)
    {
        for(int j=0; j<n; j++)
            fprintf(stdout,"%f\t",a[i*17+j]);
        fprintf(stdout,"\n");
    }

    fprintf(stdout,"\n\n");

    invert(a,result,n);

    fprintf(stdout, "Inverse:\n\n");
    for(int i=0; i<n; i++)
    {
        for(int j=0; j<n; j++)
            fprintf(stdout,"%f\t",result[i*n+j]);
        fprintf(stdout,"\n");
    }

}

int main()
{
    test_invert();

//    int n;  scanf("%d",&n);
    return 0;
}

Encountered compilation error as following when tried it with CUDA 7.5. Any suggestion?

===
test.cu: In function ‘void invert_device(float*, float*, int)’:
test.cu:70:97: error: invalid conversion from ‘float**’ to ‘const float**’ [-fpermissive]
cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,n,INFO,batchSize));
^
/usr/local/cuda-7.5/bin/…//include/cublas_api.h:2202:27: error: initializing argument 3 of ‘cublasStatus_t cublasSgetriBatched(cublasHandle_t, int, const float**, int, const int*, float**, int, int*, int)’ [-fpermissive]
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetriBatched(cublasHandle_t handle,
^

Try out the MAGMA library (http://icl.cs.utk.edu/magma/) which provides also a matrix inversion routine. Working example (code) can be found at page 144 of https://developer.nvidia.com/sites/default/files/akamai/cuda/files/Misc/mygpu.pdf
Routine is called ‘magma_dgetri_gpu’ (double precision variant)

Ranya, did you get a solution to this that uses cublas? I’m trying to compile code also that gives this error. I inserted a cast for the third argument of cublasSgetriBatched( handle, n, (const float **) Data, … ) but I’m not sure if this is OK?

Could anyone comment on this? I don’t believe there was a problem in past… I recently upgraded to Cuda 8.0.

Yes, in more recent versions of CUDA/CUBLAS, it is necessary that the argument be const-qualified.

This change was noticeable I believe somewhere around the CUDA 6 timeframe.

The code posted by OP may have come from here:

http://stackoverflow.com/questions/22887167/cublas-incorrect-inversion-for-matrix-with-zero-pivot

The zero-pivot issue described there has been fixed in CUBLAS, as mentioned there. Note that this is not a particularly high-performance matrix inversion method for large matrices, as mentioned in the CUBLAS documentation.