Matrix inverse usng linear system solver through cublas , cublasCreate exception or something else

I’m trying to inverse a matrix using linear equation solver through cublas CUDA library.

The original equation is in form of:

Ax = B = I

I - identity matrix
A - The matrix I’m trying to inverse
x - (hopefully) the inverse(A) matrix

I’d like to perform LU decomposition, receive the following:

LUx = B

L - is a lower triangle matrix
U - is a upper triangle matrix

Here is a good example who can show what I’m trying to do:

For the sake of discussion, let’s assume I already have LU decomposition of A. (A = LU)
I’m trying to find the inverse in a two steps series:

Ax = B = I

LUx = B = I

1st step: Declare y:

Ly = I

solve 1st linear equation

2nd step, Solve the following linear equation

Ux = y

find X = inverse(A) - Bingo!@!


Albeit, for now I have no idea what am I doing wrong here…

There might be two assumptions, either I’m not using cublas properly or cublas throws an exception for no reason. keep reading :)


See my code attached , There is matlab code at the end:


     #include "cuda_runtime.h" 
     #include "device_launch_parameters.h"

     #include <stdio.h>

//include “cublas.h”
include “cublas_v2.h”

int main()
{

cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;

//cublasInit();

stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS)
{
	printf ("CUBLAS initialization failed\n");
	return -1;
}

unsigned int size = 3; 
// Allowcate device matrixes 
float* p_h_LowerTriangle; 
float* p_d_LowerTriangle; 

p_h_LowerTriangle = new float;
p_h_LowerTriangle[0] = 1.f;  
p_h_LowerTriangle[1] = 0.f;
p_h_LowerTriangle[2] = 0.f;
p_h_LowerTriangle[3] = 2.56f;
p_h_LowerTriangle[4] = 1.f; 
p_h_LowerTriangle[5] = 0.f; 
p_h_LowerTriangle[6] = 5.76f; 
p_h_LowerTriangle[7] = 3.5f;
p_h_LowerTriangle[8] = 1.f;

float* p_h_UpperTriangle; 
float* p_d_UpperTriangle; 

p_h_UpperTriangle = new float;
p_h_UpperTriangle[0] = 25.f;  
p_h_UpperTriangle[1] = 5.f;
p_h_UpperTriangle[2] = 1.f;
p_h_UpperTriangle[3] = 0.f;
p_h_UpperTriangle[4] = -4.8f; 
p_h_UpperTriangle[5] = -1.56f; 
p_h_UpperTriangle[6] = 0.f; 
p_h_UpperTriangle[7] = 0.f;
p_h_UpperTriangle[8] = 0.7f;

float* p_h_IdentityMatrix; 
float* p_d_IdentityMatrix; 

p_h_IdentityMatrix = new float;
p_h_IdentityMatrix[0] = 1.f;  
p_h_IdentityMatrix[1] = 0.f;
p_h_IdentityMatrix[2] = 0.f;
p_h_IdentityMatrix[3] = 0.f;
p_h_IdentityMatrix[4] = 1.f; 
p_h_IdentityMatrix[5] = 0.f; 
p_h_IdentityMatrix[6] = 0.f; 
p_h_IdentityMatrix[7] = 0.f;
p_h_IdentityMatrix[8] = 1.f;

cudaMalloc ((void**)&p_d_LowerTriangle, size*size*sizeof(float));
cudaMalloc ((void**)&p_d_UpperTriangle, size*size*sizeof(float));
cudaMalloc ((void**)&p_d_IdentityMatrix, size*size*sizeof(float));

float* p_d_tempMatrix; 
cudaMalloc ((void**)&p_d_tempMatrix, size*size*sizeof(float));


stat = cublasSetMatrix(size,size,sizeof(float),p_h_LowerTriangle,size,p_d_LowerTriangle,size);
stat = cublasSetMatrix(size,size,sizeof(float),p_h_UpperTriangle,size,p_d_UpperTriangle,size);
stat = cublasSetMatrix(size,size,sizeof(float),p_h_IdentityMatrix,size,p_d_IdentityMatrix,size);

cudaDeviceSynchronize();

// 1st phase - solve Ly = b 
const float alpha  = 1.f;

// Function solves the triangulatr linear system with multiple right hand sides, function overrides b as a result 
stat = 	cublasStrsm(handle,
	cublasSideMode_t::CUBLAS_SIDE_LEFT, 
	cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
	cublasOperation_t::CUBLAS_OP_N,
	CUBLAS_DIAG_UNIT,
	size,
	size,
	&alpha,
	p_d_LowerTriangle,
	size,
	p_d_IdentityMatrix,
	size);

////////////////////////////////////
// TODO: printf 1st phase the results 
cudaDeviceSynchronize();

cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);

stat =cublasGetMatrix(size,size,sizeof(float),p_d_IdentityMatrix,size,p_h_IdentityMatrix,size);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_UpperTriangle,size,p_h_UpperTriangle,size);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_LowerTriangle,size,p_h_LowerTriangle,size);

////////////////////////////////////

// 2nd phase - solve Ux = b
stat = 	cublasStrsm(handle,
	cublasSideMode_t::CUBLAS_SIDE_LEFT, 
	cublasFillMode_t::CUBLAS_FILL_MODE_UPPER,
	cublasOperation_t::CUBLAS_OP_N,
	CUBLAS_DIAG_NON_UNIT,
	size,
	size,
	&alpha,
	p_d_UpperTriangle,
	size,
	p_d_IdentityMatrix,
	size);

// TODO print the results 
cudaDeviceSynchronize();

////////////////////////////////////
cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
////////////////////////////////////

cublasDestroy(handle);
//cublasShutdown();

cudaFree(p_d_LowerTriangle);
cudaFree(p_d_UpperTriangle);
cudaFree(p_d_IdentityMatrix);

free(p_h_LowerTriangle);
free(p_h_UpperTriangle);
free(p_h_IdentityMatrix);

return 0;
}


Matlab code - works perfect


/////////////////////////// MATLAB

clear all

   UpperMatrix  = [ 25 5 1  ;  0 -4.8  -1.56  ;  0 0 0.7 ]

   LowerMatrix  = [1 0 0 ; 2.56 1 0 ; 5.76 3.5 1  ]

   IdentityMatrix = eye(3)

    % 1st phase solution 
    otps1.LT = true;
    y = linsolve(LowerMatrix,IdentityMatrix,otps1)

    %2nd phase solution 
    otps2.UT = true;
    y = linsolve(UpperMatrix,y,otps2)

////////////////////////////////MATLAB output

UpperMatrix =

25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000

LowerMatrix =

1.0000         0         0
2.5600    1.0000         0
5.7600    3.5000    1.0000

IdentityMatrix =

 1     0     0
 0     1     0
 0     0     1

y =

1.0000         0         0

-2.5600 1.0000 0
3.2000 -3.5000 1.0000

y =

0.0476   -0.0833    0.0357

-0.9524 1.4167 -0.4643
4.5714 -5.0000 1.4286

UpperMatrix =

25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000

LowerMatrix =

1.0000         0         0
2.5600    1.0000         0
5.7600    3.5000    1.0000

IdentityMatrix =

 1     0     0
 0     1     0
 0     0     1

y =

1.0000         0         0

-2.5600 1.0000 0
3.2000 -3.5000 1.0000

Inverse_LU_UT

UpperMatrix =

25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000

LowerMatrix =

1.0000         0         0
2.5600    1.0000         0
5.7600    3.5000    1.0000

IdentityMatrix =

 1     0     0
 0     1     0
 0     0     1

y =

1.0000         0         0

-2.5600 1.0000 0
3.2000 -3.5000 1.0000

y =

0.0476   -0.0833    0.0357

-0.9524 1.4167 -0.4643
4.5714 -5.0000 1.4286


I have no idea what am I doing wrong here, I suspect the cublasCreate operation,

Every time I hit the command:

  cublasStatus_t stat;
  cublasHandle_t handle;
  stat = cublasCreate(&handle); 

stat and handle variables are both valid.

BUT VS10 output several of error messages specify the following:

First chance exception at 0x… microsoft C++ exception cudaError_enum at memory location 0x…

Some may say it’s an internal cublas error message, handled by the library itself.

I’m ted sad right now,

Help would be very appreciated :)

To find the inverse of a lower triangular matrix ‘L’ you would use cuBLAS Strsm();

http://docs.nvidia.com/cuda/cublas/index.html#topic_8_7

Here is an example where D_L is the Cholesky factor ‘L’ and D_B starts off as the identity matrix;

cublasHandle_t handle;
		cublasStatus_t ret;
		ret = cublasCreate(&handle);
		if(ret!=CUBLAS_STATUS_SUCCESS){printf("error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

		wTimerRes = 0;
		init = InitMMTimer(wTimerRes);
		startTime = timeGetTime();

		err=cudaMemcpy(D_L,L,nCols*nCols*sizeof(float),_HTD);
		if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
		err=cudaMemcpy(D_B,B,nCols*nCols*sizeof(float),_HTD);
		if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

		ret=cublasStrsm_v2(handle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_UPPER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,nCols,nCols,&t_alphA,D_L,nCols,D_B,nCols);
		if(ret!=CUBLAS_STATUS_SUCCESS){printf("error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

		err=cudaMemcpy(B,D_B,nCols*nCols*sizeof(float),_DTH);
		if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

		endTime = timeGetTime();
		gtime=endTime-startTime;
		cout<<"GPU timing(including all device-host & host-device copies): "<<float(gtime)/1000.0f<<" seconds.\n";
		DestroyMMTimer(wTimerRes, init);

After this call D_B will have the value of inverse(L). For a 4092x4092 ‘L’ matrix it takes about 0.1 seconds for the whole process.