Trouble with matrix inversion: GPU takes more time than CPU

I’m programming the Gauss-Jordan method for matrix inversion, but when I ran a timing test it was slower than the CPU. Can someone help me?

Here is my code:

.cu :

//kernels

// Device-side parameters (these must be __constant__ symbols in the .cu file
// for the cudaMemcpyToSymbol calls below to work; reproduced here so the
// listing is self-contained)
__constant__ int d_size;      // number of columns of the matrix
__constant__ int d_rowsize;   // row size (equal to d_size for square matrices)
__constant__ int d_rsize;     // number of valid rows in the current 128-row chunk
__constant__ int start;       // global index of the first row of the chunk
__constant__ int d_pivoterow; // index of the current pivot row/column

// Builds a 128-row chunk of the identity matrix: one block per row, each
// thread filling a contiguous run of i columns.
__global__ void Kernel_Identity128(float *Mat)
{
	int i = d_size / 128 + 1;   // columns handled per thread
	int j = blockIdx.x;         // local row index inside the chunk
	int k = threadIdx.x;
	int n;

	if(j >= d_rsize)
		return;
	if(k*i > d_size)
		return;

	n = 0;
	while(n < i && i*k + n < d_size)
	{
		// the diagonal element of global row (start + j) lies at column start + j
		if(i*k + n == start + j)
			Mat[j*d_size + i*k + n] = 1;
		else
			Mat[j*d_size + i*k + n] = 0;
		n++;
	}
}

// Applies one Gauss-Jordan elimination step to a 128-row chunk: one block per
// row, each thread updating a contiguous run of i columns of the augmented
// matrix [Mat | InvMat]. Piv/InvPiv hold the two halves of the pivot row.
__global__ void Kernel_Inverse128(float *Mat, float* InvMat, float* Piv, float* InvPiv)
{
	int i = (2*d_size) / 128 + 1;  // augmented columns handled per thread
	int j = blockIdx.x;
	int k = threadIdx.x;
	int n;
	int k_size = d_size;
	int k_pivoterow = d_pivoterow;
	__shared__ float apiv[128];

	if(j >= d_rsize)
		return;
	if(k*i > 2*k_size)             // threads must cover the full augmented width
		return;
	if(start + j == k_pivoterow)   // the pivot row itself is left unchanged here
		return;

	// elimination factor for this row; each thread reads only the slot it
	// wrote itself, so no __syncthreads() is needed (and none may appear
	// after the divergent returns above)
	apiv[k] = -1.0f * Mat[j*k_size + k_pivoterow] / Piv[k_pivoterow];

	n = 0;
	while(n < i)
	{
		int col = i*k + n;         // column index in the augmented matrix
		if(col < k_size)
		{
			if(col == k_pivoterow)
				Mat[j*k_size + col] = 0;
			else if(col > k_pivoterow)   // columns left of the pivot are already zero
				Mat[j*k_size + col] = apiv[k]*Piv[col] + Mat[j*k_size + col];
		}
		else if(col < 2*k_size)
		{
			InvMat[j*k_size + col - k_size] = apiv[k]*InvPiv[col - k_size] + InvMat[j*k_size + col - k_size];
		}
		n++;
	}
}

// Divides every row of the chunk by its pivot (diagonal) element, so the
// left half ends up as the identity and InvMat holds the inverse.
__global__ void Kernel_NormInv128(float *Mat, float* InvMat)
{
	int i = d_size / 128 + 1;
	int j = blockIdx.x;
	int k = threadIdx.x;
	int n;
	int k_size = d_size;
	__shared__ float apiv[128];

	if(j >= d_rsize)
		return;
	if(k*i > k_size)
		return;

	// the diagonal of global row (start + j) lies at column start + j, not j;
	// each thread reads only the slot it wrote itself, so no __syncthreads()
	// is needed (and none may follow the divergent returns above)
	apiv[k] = Mat[j*k_size + start + j];

	n = 0;
	while(n < i && i*k + n < k_size)
	{
		Mat[j*k_size + i*k + n] = Mat[j*k_size + i*k + n] / apiv[k];
		InvMat[j*k_size + i*k + n] = InvMat[j*k_size + i*k + n] / apiv[k];
		n++;
	}
}

//externs

// Host wrappers: each uploads the scalar parameters to constant memory and
// launches the corresponding kernel on one 128-row chunk.

extern "C"
cudaError_t CUDA_Identity128(float* d_Mat, int start, int rsize, int Size)
{
	cudaError_t error;
	dim3 Db = dim3( 128, 1 );
	dim3 Dg = dim3( 128, 1 );
	int h_size = Size;
	int h_start = start;
	int h_rsize = rsize;
	cutilSafeCall(cudaMemcpyToSymbol("d_size", &h_size, sizeof(int)));
	cutilSafeCall(cudaMemcpyToSymbol("start", &h_start, sizeof(int)));
	cutilSafeCall(cudaMemcpyToSymbol("d_rowsize", &h_size, sizeof(int)));
	cutilSafeCall(cudaMemcpyToSymbol("d_rsize", &h_rsize, sizeof(int)));
	Kernel_Identity128<<<Dg, Db>>>(d_Mat);
	cutilSafeCall(cudaThreadSynchronize());
	error = cudaGetLastError();
	return error;
}

extern "C"
cudaError_t CUDA_NormInv128(float* d_Mat, float* d_InvMat, int Size, int start, int rsize)
{
	cudaError_t error;
	dim3 Db = dim3( 128, 1 );
	dim3 Dg = dim3( 128, 1 );
	int h_size = Size;
	int h_start = start;
	int h_rsize = rsize;
	cutilSafeCall(cudaMemcpyToSymbol("d_size", &h_size, sizeof(int)));
	cutilSafeCall(cudaMemcpyToSymbol("start", &h_start, sizeof(int)));
	cutilSafeCall(cudaMemcpyToSymbol("d_rowsize", &h_size, sizeof(int)));
	cutilSafeCall(cudaMemcpyToSymbol("d_rsize", &h_rsize, sizeof(int)));
	Kernel_NormInv128<<<Dg, Db>>>(d_Mat, d_InvMat);
	cutilSafeCall(cudaThreadSynchronize());
	error = cudaGetLastError();
	return error;
}

extern "C"
cudaError_t CUDA_Inverse128(float* d_Mat, float* d_InvMat, float* d_Piv, float* d_InvPiv, int start, int piv, int Size, int rsize)
{
	cudaError_t error;
	dim3 Db = dim3( 128, 1 );
	dim3 Dg = dim3( 128, 1 );
	int h_size = Size;
	int h_start = start;
	int h_rsize = rsize;
	int h_piv = piv;
	cutilSafeCall(cudaMemcpyToSymbol("d_size", &h_size, sizeof(int)));
	cutilSafeCall(cudaMemcpyToSymbol("start", &h_start, sizeof(int)));
	cutilSafeCall(cudaMemcpyToSymbol("d_rowsize", &h_size, sizeof(int)));
	cutilSafeCall(cudaMemcpyToSymbol("d_rsize", &h_rsize, sizeof(int)));
	cutilSafeCall(cudaMemcpyToSymbol("d_pivoterow", &h_piv, sizeof(int)));
	Kernel_Inverse128<<<Dg, Db>>>(d_Mat, d_InvMat, d_Piv, d_InvPiv);
	cutilSafeCall(cudaThreadSynchronize());
	error = cudaGetLastError();
	return error;
}

.cpp :

HRESULT Cuda_Math::Identity_Matrix_CUDA128(CMatrix_<float>* A)
{
	UINT rsize;
	UINT start;

	if(!m_DevSet)
	{
		return S_FALSE;
	}

	float* d_Mat = NULL;

	// build the identity 128 rows at a time: upload a chunk, fill it on the
	// device, copy it back
	start = 0;
	while(start < A->GetRowSize())
	{
		rsize = A->CUDA_memcpy_to_Array(&d_Mat, start, 128);
		CUDA_Identity128(d_Mat, start, rsize, A->GetColumnSize());
		rsize = A->CUDA_memcpy_to_Array(&d_Mat, start, 128, false);
		start += 128;
	}
	cutilSafeCall(cudaFree(d_Mat));
	return S_OK;
}

HRESULT Cuda_Math::Inverse_Matrix_CUDA128(CMatrix_<float>* A, CMatrix_<float>* B)
{
	if(!m_DevSet)
	{
		return S_FALSE;
	}

	float* d_Mat = NULL;
	float* d_InvMat = NULL;
	float* d_Piv;
	float* d_InvPiv;
	UINT i, j;
	UINT rsize;
	UINT start;

	Identity_Matrix_CUDA128(B);   // start from B = I
	cutilSafeCall( cudaMalloc( (void**)&d_Mat, (A->GetColumnSize() * 128)*sizeof(float) ) );
	cutilSafeCall( cudaMalloc( (void**)&d_InvMat, (A->GetColumnSize() * 128)*sizeof(float) ) );
	cutilSafeCall( cudaMalloc( (void**)&d_Piv, (A->GetColumnSize())*sizeof(float) ) );
	cutilSafeCall( cudaMalloc( (void**)&d_InvPiv, (A->GetColumnSize())*sizeof(float) ) );

	for(i = 0; i < A->GetRowSize(); ++i)
	{
		// partial pivoting: if the diagonal is zero, swap in a row below
		// with a nonzero entry in column i (bounds checked before the read)
		if((*A)[i][i] == 0)
		{
			j = i + 1;
			while(j < A->GetRowSize() && (*A)[j][i] == 0)
				j++;
			if(j >= A->GetRowSize())   // no usable pivot: matrix is singular
			{
				cutilSafeCall(cudaFree(d_Mat));
				cutilSafeCall(cudaFree(d_InvMat));
				cutilSafeCall(cudaFree(d_Piv));
				cutilSafeCall(cudaFree(d_InvPiv));
				return S_FALSE;
			}
			A->InterchangeRows(i, j);
			B->InterchangeRows(i, j);
		}

		// upload the pivot row of A and of B
		A->Row_Memcpy_CUDA(d_Piv, i);
		B->Row_Memcpy_CUDA(d_InvPiv, i);
		//CUDA_PivNorm128(d_Piv,d_InvPiv,i,A->GetColumnSize());
		//A->Row_Memcpy_CUDA(d_Piv,i,false);
		//B->Row_Memcpy_CUDA(d_InvPiv,i,false);

		// eliminate column i from every other row, 128 rows at a time
		start = 0;
		while(start < A->GetRowSize())
		{
			rsize = A->CUDA_memcpy_to_Array(&d_Mat, start, 128);
			B->CUDA_memcpy_to_Array(&d_InvMat, start, 128);
			CUDA_Inverse128(d_Mat, d_InvMat, d_Piv, d_InvPiv, start, i, A->GetColumnSize(), rsize);
			A->CUDA_memcpy_to_Array(&d_Mat, start, 128, false);
			B->CUDA_memcpy_to_Array(&d_InvMat, start, 128, false);
			start += 128;
		}
	}

	// finally divide each row by its pivot so A becomes I and B the inverse
	start = 0;
	while(start < A->GetRowSize())
	{
		rsize = A->CUDA_memcpy_to_Array(&d_Mat, start, 128);
		B->CUDA_memcpy_to_Array(&d_InvMat, start, 128);
		CUDA_NormInv128(d_Mat, d_InvMat, A->GetColumnSize(), start, rsize);
		A->CUDA_memcpy_to_Array(&d_Mat, start, 128, false);
		B->CUDA_memcpy_to_Array(&d_InvMat, start, 128, false);
		start += 128;
	}
	//B->CUDA_memcpy_to_Array(&d_InvMat,false);

	cutilSafeCall(cudaFree(d_Mat));
	cutilSafeCall(cudaFree(d_InvMat));
	cutilSafeCall(cudaFree(d_Piv));
	cutilSafeCall(cudaFree(d_InvPiv));
	return S_OK;
}

CUDA_memcpy_to_Array(&Mat, start, 128)

This function copies up to 128 rows of a matrix into a linear device array, just calling cudaMemcpy over and over; it returns the number of rows actually copied, and the trailing bool in the calls above selects the copy direction (false copies back from the device).
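
For reference, a minimal sketch of what such a helper might look like (the signature, the per-row pointer layout, and the allocate-on-first-use behavior are my assumptions based on how it is called above, not the real CMatrix_ code):

// Hypothetical sketch: copies up to `count` rows starting at `start` between
// a host matrix stored as one pointer per row (h_rows, each row `cols` floats)
// and a device buffer, one cudaMemcpy per row. Returns the number of rows
// copied (the rsize used above); `toDevice` selects the copy direction.
int CUDA_memcpy_to_Array_sketch(float** d_Mat, float** h_rows, int rows,
                                int cols, int start, int count, bool toDevice)
{
    int n = (start + count <= rows) ? count : rows - start;
    if (*d_Mat == NULL)   // allocate the chunk buffer on first use
        cudaMalloc((void**)d_Mat, (size_t)count * cols * sizeof(float));
    for (int r = 0; r < n; ++r)
    {
        if (toDevice)
            cudaMemcpy(*d_Mat + (size_t)r * cols, h_rows[start + r],
                       cols * sizeof(float), cudaMemcpyHostToDevice);
        else
            cudaMemcpy(h_rows[start + r], *d_Mat + (size_t)r * cols,
                       cols * sizeof(float), cudaMemcpyDeviceToHost);
    }
    return n;
}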

I hope to get an answer soon.

Thanks for your answers.

Matrices of dimension 128 are small; you will have to batch the inversions to get good performance and fully exploit the parallelism of the GPU. We have posted batched matrix inversion (and solver) code on the registered developer website. It uses a one-matrix-per-threadblock approach and makes good use of shared memory, but this also means the maximum matrix size with the code as-is is limited by the shared memory size. On sm_2x devices, if you instantiate the code for elements of type float, you should be able to handle matrices up to about dimension 110.
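
To illustrate the one-matrix-per-threadblock idea, here is a minimal sketch (not the posted code: it skips partial pivoting, assumes a nonzero diagonal, and uses a simple augmented [M | I] layout in shared memory, so its size limit is lower than the figure above):

// One matrix per block: Gauss-Jordan on an augmented [M | I] system kept
// entirely in dynamic shared memory. A holds `batch` row-major n x n matrices.
__global__ void gj_inverse_batched(const float* A, float* Ainv, int n, int batch)
{
    extern __shared__ float smem[];
    float* S   = smem;              // n x 2n augmented matrix
    float* fac = smem + n * 2 * n;  // n per-row elimination factors
    int m = blockIdx.x;
    if (m >= batch) return;         // uniform per block, safe before barriers

    // load M and the identity into shared memory
    for (int idx = threadIdx.x; idx < n * n; idx += blockDim.x) {
        int r = idx / n, c = idx % n;
        S[r * 2 * n + c]     = A[(size_t)m * n * n + idx];
        S[r * 2 * n + n + c] = (r == c) ? 1.0f : 0.0f;
    }
    __syncthreads();

    for (int p = 0; p < n; ++p) {
        float piv = S[p * 2 * n + p];   // no pivoting: assumes nonzero diagonal
        __syncthreads();
        for (int c = threadIdx.x; c < 2 * n; c += blockDim.x)
            S[p * 2 * n + c] /= piv;    // normalize the pivot row
        __syncthreads();
        for (int r = threadIdx.x; r < n; r += blockDim.x)
            fac[r] = S[r * 2 * n + p];  // save column p before overwriting it
        __syncthreads();
        for (int idx = threadIdx.x; idx < n * 2 * n; idx += blockDim.x) {
            int r = idx / (2 * n), c = idx % (2 * n);
            if (r != p)                 // eliminate column p from other rows
                S[r * 2 * n + c] -= fac[r] * S[p * 2 * n + c];
        }
        __syncthreads();
    }

    // the right half of the augmented system now holds the inverse
    for (int idx = threadIdx.x; idx < n * n; idx += blockDim.x) {
        int r = idx / n, c = idx % n;
        Ainv[(size_t)m * n * n + idx] = S[r * 2 * n + n + c];
    }
}

A launch would look like gj_inverse_batched<<<batch, 128, n*(2*n+1)*sizeof(float)>>>(dA, dAinv, n, batch); the third launch argument (dynamic shared memory) is what bounds the maximum n.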

For inversion of very small matrices (say 2x2 to maybe 10x10), it is best to use a one-matrix-per-thread approach and keep all data in registers; this also implies using bigger batches to exploit the full parallelism of the GPU.
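
As a toy sketch of the one-matrix-per-thread approach (again just an illustration, not the posted code), a batched 2x2 inversion where each thread keeps its whole matrix in registers:

// One 2x2 matrix per thread, inverted via the adjugate over the determinant.
__global__ void inverse2x2_batched(const float* A, float* Ainv, int batch)
{
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    if (t >= batch) return;
    const float* M = A + 4 * t;            // row-major 2x2 matrix for this thread
    float a = M[0], b = M[1], c = M[2], d = M[3];
    float rdet = 1.0f / (a * d - b * c);   // assumes the matrix is invertible
    float* R = Ainv + 4 * t;
    R[0] =  d * rdet;  R[1] = -b * rdet;
    R[2] = -c * rdet;  R[3] =  a * rdet;
}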

Actually, it is supposed to upload 128 rows of the matrix at a time and apply the pivot row to them, until all rows have been pivoted; then it selects a new pivot and repeats the process. It also works for larger matrices; I’m trying to invert very large matrices using a low-capacity GPU.

Sorry, I misunderstood what you are trying to accomplish. I am not aware of any code I could point you at for the inversion of matrices larger than what is handled by the batched matrix inversion code that was posted to the registered developer website (which uses in-place Gauss-Jordan).

I was trying to view the code, but it seems I’m not registered here: https://nvdeveloper.nvidia.com
But I’m already registered in the NVIDIA Developer Zone. Where do I make an account to get access?