I’m trying to inverse a matrix using linear equation solver through cublas CUDA library.
The original equation is in form of:
Ax = B = I
I - identity matrix
A - The matrix I’m trying to inverse
x - (hopefully) the inverse(A) matrix
I’d like to perform LU decomposition, receive the following:
LUx = B
L - is a lower triangle matrix
U - is a upper triangle matrix
Here is a good example who can show what I’m trying to do:
For the sake of discussion, let’s assume I already have LU decomposition of A. (A = LU)
I’m trying to find the inverse in a two steps series:
Ax = B = I
LUx = B = I
1st step: Declare y:
Ly = I
solve 1st linear equation
2nd step, Solve the following linear equation
Ux = y
find X = inverse(A) - Bingo!@!
Albeit, for now I have no idea what am I doing wrong here…
There might be two assumptions, either I’m not using cublas properly or cublas throws an exception for no reason. keep reading :)
See my code attached , There is matlab code at the end:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
//include “cublas.h”
include “cublas_v2.h”
int main()
{
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
//cublasInit();
stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS)
{
printf ("CUBLAS initialization failed\n");
return -1;
}
unsigned int size = 3;
// Allowcate device matrixes
float* p_h_LowerTriangle;
float* p_d_LowerTriangle;
p_h_LowerTriangle = new float;
p_h_LowerTriangle[0] = 1.f;
p_h_LowerTriangle[1] = 0.f;
p_h_LowerTriangle[2] = 0.f;
p_h_LowerTriangle[3] = 2.56f;
p_h_LowerTriangle[4] = 1.f;
p_h_LowerTriangle[5] = 0.f;
p_h_LowerTriangle[6] = 5.76f;
p_h_LowerTriangle[7] = 3.5f;
p_h_LowerTriangle[8] = 1.f;
float* p_h_UpperTriangle;
float* p_d_UpperTriangle;
p_h_UpperTriangle = new float;
p_h_UpperTriangle[0] = 25.f;
p_h_UpperTriangle[1] = 5.f;
p_h_UpperTriangle[2] = 1.f;
p_h_UpperTriangle[3] = 0.f;
p_h_UpperTriangle[4] = -4.8f;
p_h_UpperTriangle[5] = -1.56f;
p_h_UpperTriangle[6] = 0.f;
p_h_UpperTriangle[7] = 0.f;
p_h_UpperTriangle[8] = 0.7f;
float* p_h_IdentityMatrix;
float* p_d_IdentityMatrix;
p_h_IdentityMatrix = new float;
p_h_IdentityMatrix[0] = 1.f;
p_h_IdentityMatrix[1] = 0.f;
p_h_IdentityMatrix[2] = 0.f;
p_h_IdentityMatrix[3] = 0.f;
p_h_IdentityMatrix[4] = 1.f;
p_h_IdentityMatrix[5] = 0.f;
p_h_IdentityMatrix[6] = 0.f;
p_h_IdentityMatrix[7] = 0.f;
p_h_IdentityMatrix[8] = 1.f;
cudaMalloc ((void**)&p_d_LowerTriangle, size*size*sizeof(float));
cudaMalloc ((void**)&p_d_UpperTriangle, size*size*sizeof(float));
cudaMalloc ((void**)&p_d_IdentityMatrix, size*size*sizeof(float));
float* p_d_tempMatrix;
cudaMalloc ((void**)&p_d_tempMatrix, size*size*sizeof(float));
stat = cublasSetMatrix(size,size,sizeof(float),p_h_LowerTriangle,size,p_d_LowerTriangle,size);
stat = cublasSetMatrix(size,size,sizeof(float),p_h_UpperTriangle,size,p_d_UpperTriangle,size);
stat = cublasSetMatrix(size,size,sizeof(float),p_h_IdentityMatrix,size,p_d_IdentityMatrix,size);
cudaDeviceSynchronize();
// 1st phase - solve Ly = b
const float alpha = 1.f;
// Function solves the triangulatr linear system with multiple right hand sides, function overrides b as a result
stat = cublasStrsm(handle,
cublasSideMode_t::CUBLAS_SIDE_LEFT,
cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
cublasOperation_t::CUBLAS_OP_N,
CUBLAS_DIAG_UNIT,
size,
size,
&alpha,
p_d_LowerTriangle,
size,
p_d_IdentityMatrix,
size);
////////////////////////////////////
// TODO: printf 1st phase the results
cudaDeviceSynchronize();
cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_IdentityMatrix,size,p_h_IdentityMatrix,size);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_UpperTriangle,size,p_h_UpperTriangle,size);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_LowerTriangle,size,p_h_LowerTriangle,size);
////////////////////////////////////
// 2nd phase - solve Ux = b
stat = cublasStrsm(handle,
cublasSideMode_t::CUBLAS_SIDE_LEFT,
cublasFillMode_t::CUBLAS_FILL_MODE_UPPER,
cublasOperation_t::CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT,
size,
size,
&alpha,
p_d_UpperTriangle,
size,
p_d_IdentityMatrix,
size);
// TODO print the results
cudaDeviceSynchronize();
////////////////////////////////////
cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
////////////////////////////////////
cublasDestroy(handle);
//cublasShutdown();
cudaFree(p_d_LowerTriangle);
cudaFree(p_d_UpperTriangle);
cudaFree(p_d_IdentityMatrix);
free(p_h_LowerTriangle);
free(p_h_UpperTriangle);
free(p_h_IdentityMatrix);
return 0;
}
Matlab code - works perfect
/////////////////////////// MATLAB
clear all
UpperMatrix = [ 25 5 1 ; 0 -4.8 -1.56 ; 0 0 0.7 ]
LowerMatrix = [1 0 0 ; 2.56 1 0 ; 5.76 3.5 1 ]
IdentityMatrix = eye(3)
% 1st phase solution
otps1.LT = true;
y = linsolve(LowerMatrix,IdentityMatrix,otps1)
%2nd phase solution
otps2.UT = true;
y = linsolve(UpperMatrix,y,otps2)
////////////////////////////////MATLAB output
UpperMatrix =
25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000
LowerMatrix =
1.0000 0 0
2.5600 1.0000 0
5.7600 3.5000 1.0000
IdentityMatrix =
1 0 0
0 1 0
0 0 1
y =
1.0000 0 0
-2.5600 1.0000 0
3.2000 -3.5000 1.0000
y =
0.0476 -0.0833 0.0357
-0.9524 1.4167 -0.4643
4.5714 -5.0000 1.4286
UpperMatrix =
25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000
LowerMatrix =
1.0000 0 0
2.5600 1.0000 0
5.7600 3.5000 1.0000
IdentityMatrix =
1 0 0
0 1 0
0 0 1
y =
1.0000 0 0
-2.5600 1.0000 0
3.2000 -3.5000 1.0000
Inverse_LU_UT
UpperMatrix =
25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000
LowerMatrix =
1.0000 0 0
2.5600 1.0000 0
5.7600 3.5000 1.0000
IdentityMatrix =
1 0 0
0 1 0
0 0 1
y =
1.0000 0 0
-2.5600 1.0000 0
3.2000 -3.5000 1.0000
y =
0.0476 -0.0833 0.0357
-0.9524 1.4167 -0.4643
4.5714 -5.0000 1.4286
I have no idea what am I doing wrong here, I suspect the cublasCreate operation,
Every time I hit the command:
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
stat and handle variables are both valid.
BUT VS10 output several of error messages specify the following:
First chance exception at 0x… microsoft C++ exception cudaError_enum at memory location 0x…
Some may say it’s an internal cublas error message, handled by the library itself.
I’m ted sad right now,
Help would be very appreciated :)