# Matrix inverse usng linear system solver through cublas , cublasCreate exception or something else

I’m trying to inverse a matrix using linear equation solver through cublas CUDA library.

The original equation is in form of:

Ax = B = I

I - identity matrix
A - The matrix I’m trying to inverse
x - (hopefully) the inverse(A) matrix

I’d like to perform LU decomposition, receive the following:

LUx = B

L - is a lower triangle matrix
U - is a upper triangle matrix

Here is a good example who can show what I’m trying to do:

For the sake of discussion, let’s assume I already have LU decomposition of A. (A = LU)
I’m trying to find the inverse in a two steps series:

Ax = B = I

LUx = B = I

1st step: Declare y:

Ly = I

solve 1st linear equation

2nd step, Solve the following linear equation

Ux = y

find X = inverse(A) - Bingo!@!

Albeit, for now I have no idea what am I doing wrong here…

There might be two assumptions, either I’m not using cublas properly or cublas throws an exception for no reason. keep reading :)

See my code attached , There is matlab code at the end:

``````     #include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
``````

//#include “cublas.h”
#include “cublas_v2.h”

int main()
{

``````cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;

//cublasInit();

stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS)
{
printf ("CUBLAS initialization failed\n");
return -1;
}

unsigned int size = 3;
// Allowcate device matrixes
float* p_h_LowerTriangle;
float* p_d_LowerTriangle;

p_h_LowerTriangle = new float;
p_h_LowerTriangle[0] = 1.f;
p_h_LowerTriangle[1] = 0.f;
p_h_LowerTriangle[2] = 0.f;
p_h_LowerTriangle[3] = 2.56f;
p_h_LowerTriangle[4] = 1.f;
p_h_LowerTriangle[5] = 0.f;
p_h_LowerTriangle[6] = 5.76f;
p_h_LowerTriangle[7] = 3.5f;
p_h_LowerTriangle[8] = 1.f;

float* p_h_UpperTriangle;
float* p_d_UpperTriangle;

p_h_UpperTriangle = new float;
p_h_UpperTriangle[0] = 25.f;
p_h_UpperTriangle[1] = 5.f;
p_h_UpperTriangle[2] = 1.f;
p_h_UpperTriangle[3] = 0.f;
p_h_UpperTriangle[4] = -4.8f;
p_h_UpperTriangle[5] = -1.56f;
p_h_UpperTriangle[6] = 0.f;
p_h_UpperTriangle[7] = 0.f;
p_h_UpperTriangle[8] = 0.7f;

float* p_h_IdentityMatrix;
float* p_d_IdentityMatrix;

p_h_IdentityMatrix = new float;
p_h_IdentityMatrix[0] = 1.f;
p_h_IdentityMatrix[1] = 0.f;
p_h_IdentityMatrix[2] = 0.f;
p_h_IdentityMatrix[3] = 0.f;
p_h_IdentityMatrix[4] = 1.f;
p_h_IdentityMatrix[5] = 0.f;
p_h_IdentityMatrix[6] = 0.f;
p_h_IdentityMatrix[7] = 0.f;
p_h_IdentityMatrix[8] = 1.f;

cudaMalloc ((void**)&p_d_LowerTriangle, size*size*sizeof(float));
cudaMalloc ((void**)&p_d_UpperTriangle, size*size*sizeof(float));
cudaMalloc ((void**)&p_d_IdentityMatrix, size*size*sizeof(float));

float* p_d_tempMatrix;
cudaMalloc ((void**)&p_d_tempMatrix, size*size*sizeof(float));

stat = cublasSetMatrix(size,size,sizeof(float),p_h_LowerTriangle,size,p_d_LowerTriangle,size);
stat = cublasSetMatrix(size,size,sizeof(float),p_h_UpperTriangle,size,p_d_UpperTriangle,size);
stat = cublasSetMatrix(size,size,sizeof(float),p_h_IdentityMatrix,size,p_d_IdentityMatrix,size);

// 1st phase - solve Ly = b
const float alpha  = 1.f;

// Function solves the triangulatr linear system with multiple right hand sides, function overrides b as a result
stat = 	cublasStrsm(handle,
cublasSideMode_t::CUBLAS_SIDE_LEFT,
cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
cublasOperation_t::CUBLAS_OP_N,
CUBLAS_DIAG_UNIT,
size,
size,
&alpha,
p_d_LowerTriangle,
size,
p_d_IdentityMatrix,
size);

////////////////////////////////////
// TODO: printf 1st phase the results

cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);

stat =cublasGetMatrix(size,size,sizeof(float),p_d_IdentityMatrix,size,p_h_IdentityMatrix,size);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_UpperTriangle,size,p_h_UpperTriangle,size);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_LowerTriangle,size,p_h_LowerTriangle,size);

////////////////////////////////////

// 2nd phase - solve Ux = b
stat = 	cublasStrsm(handle,
cublasSideMode_t::CUBLAS_SIDE_LEFT,
cublasFillMode_t::CUBLAS_FILL_MODE_UPPER,
cublasOperation_t::CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT,
size,
size,
&alpha,
p_d_UpperTriangle,
size,
p_d_IdentityMatrix,
size);

// TODO print the results

////////////////////////////////////
cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
////////////////////////////////////

cublasDestroy(handle);
//cublasShutdown();

cudaFree(p_d_LowerTriangle);
cudaFree(p_d_UpperTriangle);
cudaFree(p_d_IdentityMatrix);

free(p_h_LowerTriangle);
free(p_h_UpperTriangle);
free(p_h_IdentityMatrix);
``````

return 0;
}

Matlab code - works perfect

/////////////////////////// MATLAB

clear all

``````   UpperMatrix  = [ 25 5 1  ;  0 -4.8  -1.56  ;  0 0 0.7 ]

LowerMatrix  = [1 0 0 ; 2.56 1 0 ; 5.76 3.5 1  ]

IdentityMatrix = eye(3)

% 1st phase solution
otps1.LT = true;
y = linsolve(LowerMatrix,IdentityMatrix,otps1)

%2nd phase solution
otps2.UT = true;
y = linsolve(UpperMatrix,y,otps2)
``````

////////////////////////////////MATLAB output

UpperMatrix =

25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000

LowerMatrix =

``````1.0000         0         0
2.5600    1.0000         0
5.7600    3.5000    1.0000
``````

IdentityMatrix =

`````` 1     0     0
0     1     0
0     0     1
``````

y =

``````1.0000         0         0
``````

-2.5600 1.0000 0
3.2000 -3.5000 1.0000

y =

``````0.0476   -0.0833    0.0357
``````

-0.9524 1.4167 -0.4643
4.5714 -5.0000 1.4286

UpperMatrix =

25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000

LowerMatrix =

``````1.0000         0         0
2.5600    1.0000         0
5.7600    3.5000    1.0000
``````

IdentityMatrix =

`````` 1     0     0
0     1     0
0     0     1
``````

y =

``````1.0000         0         0
``````

-2.5600 1.0000 0
3.2000 -3.5000 1.0000

Inverse_LU_UT

UpperMatrix =

25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000

LowerMatrix =

``````1.0000         0         0
2.5600    1.0000         0
5.7600    3.5000    1.0000
``````

IdentityMatrix =

`````` 1     0     0
0     1     0
0     0     1
``````

y =

``````1.0000         0         0
``````

-2.5600 1.0000 0
3.2000 -3.5000 1.0000

y =

``````0.0476   -0.0833    0.0357
``````

-0.9524 1.4167 -0.4643
4.5714 -5.0000 1.4286

I have no idea what am I doing wrong here, I suspect the cublasCreate operation,

Every time I hit the command:

``````  cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
``````

stat and handle variables are both valid.

BUT VS10 output several of error messages specify the following:

First chance exception at 0x… microsoft C++ exception cudaError_enum at memory location 0x…

Some may say it’s an internal cublas error message, handled by the library itself.

Help would be very appreciated :)

To find the inverse of a lower triangular matrix ‘L’ you would use cuBLAS Strsm();

http://docs.nvidia.com/cuda/cublas/index.html#topic_8_7

Here is an example where D_L is the Cholesky factor ‘L’ and D_B starts off as the identity matrix;

``````cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if(ret!=CUBLAS_STATUS_SUCCESS){printf("error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

wTimerRes = 0;
init = InitMMTimer(wTimerRes);
startTime = timeGetTime();

err=cudaMemcpy(D_L,L,nCols*nCols*sizeof(float),_HTD);
if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
err=cudaMemcpy(D_B,B,nCols*nCols*sizeof(float),_HTD);
if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

ret=cublasStrsm_v2(handle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_UPPER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,nCols,nCols,&t_alphA,D_L,nCols,D_B,nCols);
if(ret!=CUBLAS_STATUS_SUCCESS){printf("error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

err=cudaMemcpy(B,D_B,nCols*nCols*sizeof(float),_DTH);
if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

endTime = timeGetTime();
gtime=endTime-startTime;
cout<<"GPU timing(including all device-host & host-device copies): "<<float(gtime)/1000.0f<<" seconds.\n";
DestroyMMTimer(wTimerRes, init);
``````

After this call D_B will have the value of inverse(L). For a 4092x4092 ‘L’ matrix it takes about 0.1 seconds for the whole process.