I’m programming the Gauss-Jordan method for matrix inversion, but when I ran a timing test, it was slower than the CPU. Can someone help me?
Here is my code:
.cu :
//kernels
// Overwrites a tile of up to 128 rows with identity-matrix rows.
// Launch: 128 blocks (one per tile row j), 128 threads per block; each
// thread writes a contiguous chunk of i = d_size/128 + 1 columns.
// Device symbols (set via cudaMemcpyToSymbol by the host wrapper):
//   d_size  - number of columns in the matrix
//   d_rsize - number of valid rows in this tile
//   start   - global row index of the tile's first row
__global__ void Kernel_Identity128(float *Mat)
{
    int i = d_size / 128 + 1;   // columns handled per thread
    int j = blockIdx.x;         // tile-local row index
    int k = threadIdx.x;
    int n;
    if (j >= d_rsize)
        return;                 // block past the last valid row of the tile
    if (k * i >= d_size)        // thread's chunk starts at/after the row end
        return;                 // (original used '>'; the '==' case did no work anyway)
    n = 0;
    while (n < i && i * k + n < d_size)
    {
        // Global row index is start + j: 1 on the diagonal, 0 elsewhere.
        if (i * k + n == start + j)
            Mat[j * d_size + i * k + n] = 1;
        else
            Mat[j * d_size + i * k + n] = 0;
        n++;
    }
}
// One Gauss-Jordan elimination step on a <=128-row tile of the augmented
// system [Mat | InvMat]: for every row except the pivot row, adds
// apiv * (pivot row) so that the pivot column becomes zero.
// Launch: 128 blocks (one per tile row j), 128 threads; each thread owns a
// chunk of i = (2*d_size)/128 + 1 positions of the 2*d_size-wide augmented row.
// Device symbols: d_size (columns), d_rsize (rows in tile),
// start (global index of tile's first row), d_pivoterow (current pivot).
__global__ void Kernel_Inverse128(float *Mat, float* InvMat, float* Piv, float* InvPiv)
{
    int i = (2 * d_size) / 128 + 1;   // augmented-row elements per thread
    int j = blockIdx.x;               // tile-local row
    int k = threadIdx.x;
    int k_size = d_size;
    int k_pivoterow = d_pivoterow;
    if (j >= d_rsize)
        return;
    if (k * i >= 2 * k_size)
        return;     // original used '> k_size', which also killed the threads
                    // that own the InvMat (right) half of the augmented row
    if (start + j == k_pivoterow)
        return;     // the pivot row itself is left untouched
    // Elimination factor for this row. The original kept this in a
    // __shared__ array guarded by __syncthreads(), but the early returns
    // above are thread-divergent, so part of the block skipped the barrier
    // (undefined behavior). Each thread only ever read its own slot, so a
    // plain register is equivalent and needs no barrier at all.
    float apiv = -1.0f * Mat[j * k_size + k_pivoterow] / Piv[k_pivoterow];
    // Walk the chunk element by element. The original gated two while-loops
    // on the chunk's FIRST index, which (a) skipped the tail of a chunk
    // straddling the Mat/InvMat boundary and (b) aborted a whole chunk as
    // soon as one column was <= the pivot column.
    for (int n = 0; n < i; n++)
    {
        int col = i * k + n;
        if (col < k_size)
        {
            if (col == k_pivoterow)
                Mat[j * k_size + col] = 0;
            else if (col > k_pivoterow)
                // Columns < pivot are already zero in Piv (Gauss-Jordan
                // invariant), so updating them would be a no-op.
                Mat[j * k_size + col] = apiv * Piv[col] + Mat[j * k_size + col];
        }
        else if (col < 2 * k_size)
        {
            // Right half of the augmented row. InvPiv holds only k_size
            // floats (see the host-side cudaMalloc), so index it relative
            // to the boundary; the original read InvPiv[col] out of bounds.
            InvMat[j * k_size + (col - k_size)] =
                apiv * InvPiv[col - k_size] + InvMat[j * k_size + (col - k_size)];
        }
    }
}
// Final normalization pass: divides each row of Mat and InvMat by Mat's
// diagonal element, so Mat becomes the identity and InvMat the inverse.
// Launch: 128 blocks (one per tile row j), 128 threads; each thread scales
// a chunk of i = d_size/128 + 1 columns.
// Device symbols: d_size (columns), d_rsize (rows in tile),
// start (global index of the tile's first row).
__global__ void Kernel_NormInv128(float *Mat, float* InvMat)
{
    int i = d_size / 128 + 1;   // columns handled per thread
    int j = blockIdx.x;         // tile-local row
    int k = threadIdx.x;
    int n;
    int k_size = d_size;
    if (j >= d_rsize)
        return;
    if (k * i >= k_size)
        return;
    // Diagonal element of this row. The global row index is start + j, so
    // the diagonal sits in column start + j; the original read column j,
    // which is only correct for the first 128-row tile (start == 0).
    // As in Kernel_Inverse128, the __shared__ apiv[] + __syncthreads() was
    // replaced by a register: every thread only read its own slot, and the
    // divergent early return above made the barrier undefined behavior.
    float apiv = Mat[j * k_size + start + j];
    n = 0;
    while (n < i && i * k + n < k_size)
    {
        Mat[j * k_size + i * k + n] = Mat[j * k_size + i * k + n] / apiv;
        InvMat[j * k_size + i * k + n] = InvMat[j * k_size + i * k + n] / apiv;
        n++;
    }
}
//externs
// Host wrapper: fills a <=128-row tile of d_Mat with identity rows.
//   d_Mat - device buffer holding the tile (row-major, Size columns)
//   start - global row index of the tile's first row
//   rsize - number of valid rows in the tile (<= 128)
//   Size  - number of columns of the matrix
// Returns the launch/execution status of the kernel.
extern "C"
cudaError_t CUDA_Identity128(float* d_Mat, int start, int rsize, int Size)
{
    dim3 Db = dim3(128, 1);   // threads: one column chunk each
    dim3 Dg = dim3(128, 1);   // blocks: one per possible tile row
    // Push launch parameters into device constant memory. The string-named
    // cudaMemcpyToSymbol form is kept because the 'start' parameter shadows
    // the device symbol of the same name; NOTE(review): this form was
    // removed in CUDA 5.0 -- pass the symbol itself (and rename the
    // parameter) when upgrading the toolkit.
    cutilSafeCall(cudaMemcpyToSymbol("d_size", &Size, sizeof(int)));
    cutilSafeCall(cudaMemcpyToSymbol("start", &start, sizeof(int)));
    cutilSafeCall(cudaMemcpyToSymbol("d_rowsize", &Size, sizeof(int)));
    cutilSafeCall(cudaMemcpyToSymbol("d_rsize", &rsize, sizeof(int)));
    Kernel_Identity128<<<Dg, Db>>>(d_Mat);
    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
    // direct replacement. Synchronizing after every tile serializes the
    // copy/compute pipeline -- a significant cost in the inversion loop.
    cutilSafeCall(cudaDeviceSynchronize());
    return cudaGetLastError();
}
// Host wrapper: normalizes a <=128-row tile of the pair (d_Mat, d_InvMat)
// by dividing each row by d_Mat's diagonal element.
//   Size  - number of columns; start - global index of the tile's first
//   row; rsize - number of valid rows in the tile (<= 128).
// Returns the launch/execution status of the kernel.
extern "C"
cudaError_t CUDA_NormInv128(float* d_Mat, float* d_InvMat, int Size, int start, int rsize)
{
    dim3 Db = dim3(128, 1);   // threads: one column chunk each
    dim3 Dg = dim3(128, 1);   // blocks: one per possible tile row
    // Push launch parameters into device constant memory. String-named
    // symbol lookup kept because the 'start' parameter shadows the device
    // symbol; NOTE(review): removed in CUDA 5.0 -- migrate when upgrading.
    cutilSafeCall(cudaMemcpyToSymbol("d_size", &Size, sizeof(int)));
    cutilSafeCall(cudaMemcpyToSymbol("start", &start, sizeof(int)));
    cutilSafeCall(cudaMemcpyToSymbol("d_rowsize", &Size, sizeof(int)));
    cutilSafeCall(cudaMemcpyToSymbol("d_rsize", &rsize, sizeof(int)));
    Kernel_NormInv128<<<Dg, Db>>>(d_Mat, d_InvMat);
    // cudaThreadSynchronize() is deprecated; use cudaDeviceSynchronize().
    cutilSafeCall(cudaDeviceSynchronize());
    return cudaGetLastError();
}
// Host wrapper: runs one Gauss-Jordan elimination step (pivot row 'piv')
// on a <=128-row tile of the augmented pair (d_Mat, d_InvMat).
//   d_Piv / d_InvPiv - the pivot row of A and of the inverse (Size floats each)
//   start - global index of the tile's first row; piv - pivot row index;
//   Size  - number of columns; rsize - valid rows in the tile (<= 128).
// Returns the launch/execution status of the kernel.
extern "C"
cudaError_t CUDA_Inverse128(float* d_Mat, float* d_InvMat, float* d_Piv, float* d_InvPiv, int start, int piv, int Size, int rsize)
{
    dim3 Db = dim3(128, 1);   // threads: one augmented-row chunk each
    dim3 Dg = dim3(128, 1);   // blocks: one per possible tile row
    // Push launch parameters into device constant memory. String-named
    // symbol lookup kept because the 'start' parameter shadows the device
    // symbol; NOTE(review): removed in CUDA 5.0 -- migrate when upgrading.
    cutilSafeCall(cudaMemcpyToSymbol("d_size", &Size, sizeof(int)));
    cutilSafeCall(cudaMemcpyToSymbol("start", &start, sizeof(int)));
    cutilSafeCall(cudaMemcpyToSymbol("d_rowsize", &Size, sizeof(int)));
    cutilSafeCall(cudaMemcpyToSymbol("d_rsize", &rsize, sizeof(int)));
    cutilSafeCall(cudaMemcpyToSymbol("d_pivoterow", &piv, sizeof(int)));
    Kernel_Inverse128<<<Dg, Db>>>(d_Mat, d_InvMat, d_Piv, d_InvPiv);
    // cudaThreadSynchronize() is deprecated; use cudaDeviceSynchronize().
    // This wrapper is called once per tile per pivot row -- the forced
    // sync here (plus the host<->device copies around it) dominates the
    // runtime and is the main reason the GPU path loses to the CPU.
    cutilSafeCall(cudaDeviceSynchronize());
    return cudaGetLastError();
}
.cpp
// Turns matrix A into the identity matrix using the GPU, processing it in
// tiles of up to 128 rows: upload a tile, overwrite it with identity rows
// on the device, download it back.
// Returns S_FALSE if no device is configured or a kernel fails; S_OK otherwise.
HRESULT Cuda_Math::Identity_Matrix_CUDA128(CMatrix_<float>* A)
{
    if (!m_DevSet)
    {
        return S_FALSE;
    }
    float* d_Mat = NULL;
    UINT start = 0;
    while (start < A->GetRowSize())
    {
        // Uploads up to 128 rows starting at 'start'; returns the number
        // of rows actually copied (the last tile may be short).
        UINT rsize = A->CUDA_memcpy_to_Array(&d_Mat, start, 128);
        // The original discarded the kernel's error code; propagate it so
        // a failed launch is not silently reported as success.
        if (CUDA_Identity128(d_Mat, start, rsize, A->GetColumnSize()) != cudaSuccess)
        {
            cutilSafeCall(cudaFree(d_Mat));
            return S_FALSE;
        }
        // Copy the tile back to the host ('false' = device-to-host); the
        // return value is not needed here (dead assignment removed).
        A->CUDA_memcpy_to_Array(&d_Mat, start, 128, false);
        start += 128;
    }
    cutilSafeCall(cudaFree(d_Mat));
    return S_OK;
}
// Inverts A into B by Gauss-Jordan elimination with partial pivoting.
// A is reduced in place (tile by tile on the GPU) while B, initialized to
// the identity, accumulates A^-1.
// Returns S_FALSE if no device is configured or A is singular; S_OK otherwise.
HRESULT Cuda_Math::Inverse_Matrix_CUDA128(CMatrix_<float>* A, CMatrix_<float>* B)
{
    if (!m_DevSet)
    {
        return S_FALSE;
    }
    float* d_Mat = NULL;
    float* d_InvMat = NULL;
    float* d_Piv = NULL;
    float* d_InvPiv = NULL;
    UINT i, j;
    UINT rsize;
    UINT start;
    // B starts as the identity and is transformed into the inverse.
    Identity_Matrix_CUDA128(B);
    // Device scratch: one 128-row tile of A and of B, plus one pivot row of
    // each. Allocated once and reused for every elimination step.
    cutilSafeCall(cudaMalloc((void**)&d_Mat, (A->GetColumnSize() * 128) * sizeof(float)));
    cutilSafeCall(cudaMalloc((void**)&d_InvMat, (A->GetColumnSize() * 128) * sizeof(float)));
    cutilSafeCall(cudaMalloc((void**)&d_Piv, (A->GetColumnSize()) * sizeof(float)));
    cutilSafeCall(cudaMalloc((void**)&d_InvPiv, (A->GetColumnSize()) * sizeof(float)));
    for (i = 0; i < A->GetRowSize(); ++i)
    {
        if ((*A)[i][i] == 0)
        {
            // Partial pivoting: search below row i for a nonzero entry.
            // The bounds test now precedes the element access; the original
            // dereferenced (*A)[j][i] BEFORE checking j, reading past the
            // last row whenever column i had no nonzero pivot left.
            j = i + 1;
            while (j < A->GetRowSize() && (*A)[j][i] == 0)
                j++;
            if (j >= A->GetRowSize())
            {
                // Singular matrix: no usable pivot in column i.
                cutilSafeCall(cudaFree(d_Mat));
                cutilSafeCall(cudaFree(d_InvMat));
                cutilSafeCall(cudaFree(d_Piv));
                cutilSafeCall(cudaFree(d_InvPiv));
                return S_FALSE;
            }
            A->InterchangeRows(i, j);
            B->InterchangeRows(i, j);
        }
        // Upload the pivot row of A and of B.
        A->Row_Memcpy_CUDA(d_Piv, i);
        B->Row_Memcpy_CUDA(d_InvPiv, i);
        //CUDA_PivNorm128(d_Piv,d_InvPiv,i,A->GetColumnSize());
        //A->Row_Memcpy_CUDA(d_Piv,i,false);
        //B->Row_Memcpy_CUDA(d_InvPiv,i,false);
        // Eliminate column i from every other row, 128 rows per pass.
        // NOTE(review): this uploads and downloads BOTH matrices in full
        // once per pivot row -- O(rows) whole-matrix PCIe round trips plus a
        // device sync per tile. That transfer cost, not the kernels, is why
        // this runs slower than the CPU; keeping A and B resident on the
        // device for the entire elimination would remove it.
        start = 0;
        while (start < A->GetRowSize())
        {
            rsize = A->CUDA_memcpy_to_Array(&d_Mat, start, 128);
            B->CUDA_memcpy_to_Array(&d_InvMat, start, 128);
            CUDA_Inverse128(d_Mat, d_InvMat, d_Piv, d_InvPiv, start, i, A->GetColumnSize(), rsize);
            A->CUDA_memcpy_to_Array(&d_Mat, start, 128, false);
            B->CUDA_memcpy_to_Array(&d_InvMat, start, 128, false);
            start += 128;
        }
    }
    // Final pass: scale every row so A's diagonal becomes 1 and B holds A^-1.
    start = 0;
    while (start < A->GetRowSize())
    {
        rsize = A->CUDA_memcpy_to_Array(&d_Mat, start, 128);
        B->CUDA_memcpy_to_Array(&d_InvMat, start, 128);
        CUDA_NormInv128(d_Mat, d_InvMat, A->GetColumnSize(), start, rsize);
        A->CUDA_memcpy_to_Array(&d_Mat, start, 128, false);
        B->CUDA_memcpy_to_Array(&d_InvMat, start, 128, false);
        start += 128;
    }
    //B->CUDA_memcpy_to_Array(&d_InvMat,false);
    cutilSafeCall(cudaFree(d_Mat));
    cutilSafeCall(cudaFree(d_InvMat));
    cutilSafeCall(cudaFree(d_Piv));
    cutilSafeCall(cudaFree(d_InvPiv));
    return S_OK;
}
CUDA_memcpy_to_Array(&Mat,start,128)
This function copies up to 128 rows from a matrix into a linear array, just calling CUDA_memcpy over and over.
I hope to get an answer soon.
Thanks for your answers.