Kernel works just for small matrices

I have created a Kernel in order to get LU decomposition, I have run it for small matrices, but for matrices bigger than 128, it simplies stop to work :-S. Does anyone have an idea why it could happen??

Can you post some code?

Possible reasons:

  1. Out of memory

  2. Resources (shared memory, register file) needed for the threads in a block is out of limit.

  3. Kernel timeout. (There’s a time limit on the kernel launched in windows, which, if exceeded, OS will forcefully kill the kernel)

Hence, it is always recommended to use the following after every cuda* call…

cudaError err_t = cudaGetLastError();

if(err_t != cudaSuccess) {

   fprintf(stdout, "Function failed at %d. Reason: %s\n", __LINE__, cudaGetErrorString(err_t);

}

That said, it is always advisable to post some code for a better understanding… :)

I already check those line in order to check if there is an error, But I got nothing. My first aproach was just using CUBLAS functions, but it was really slow, so that is why i decided to mix CUBLAS and some of my code, Could be that a problem??

Thanks in advance

int LU_Dec (float *A, float *L, float *U, float *piv, unsigned int h_A, unsigned int w_A)

{

cublasStatus status;

 status = cublasInit();

if (status != CUBLAS_STATUS_SUCCESS) {

    fprintf (stderr, "!!!! CUBLAS initialization error\n");

    return EXIT_FAILURE;

}

float* d_L = 0;

float* d_U = 0;

status = cublasAlloc(w_A*h_A, sizeof(d_L[0]), (void**)&d_L);

if (status != CUBLAS_STATUS_SUCCESS) {

    fprintf (stderr, "!!!! device memory allocation error (L)\n");

    return EXIT_FAILURE;

}

status = cublasAlloc(w_A*h_A, sizeof(d_U[0]), (void**)&d_U);

if (status != CUBLAS_STATUS_SUCCESS) {

    fprintf (stderr, "!!!! device memory allocation error (U)\n");

    return EXIT_FAILURE;

}

// Copy table A in U

status = cublasSetVector(w_A*h_A, sizeof(float), A, 1, d_U, 1);

if (status != CUBLAS_STATUS_SUCCESS) {

    fprintf (stderr, "!!!! device access error (write A)\n");

    return EXIT_FAILURE;

}

status = cublasSetVector(w_A*h_A, sizeof(float), L, 1, d_L, 1);

if (status != CUBLAS_STATUS_SUCCESS) {

    fprintf (stderr, "!!!! device access error (write A)\n");

    return EXIT_FAILURE;

}

for (unsigned int i = 0; i < h_A; i++){   //for all the rows

	int pivotRow = cublasIsamax(h_A - i, d_U + i + (i * w_A), w_A); //Maximum of the relative vector

	pivotRow = pivotRow + i - 1;		  //Correct row number to change

	

	//printf("%d ", pivotRow);

	dim3 dimBlock( BLOCK_SIZE, BLOCK_SIZE );

	dim3 dimGrid(h_A / dimBlock.x, w_A / dimBlock.y);

	//Call the Kernel



	LU_Decomp <<<dimBlock, dimGrid>>> (d_L, d_U, h_A, w_A, i, pivotRow);

	LU_2Decomp <<<dimBlock, dimGrid>>> (d_L, d_U, h_A, w_A, i, pivotRow);

}

cudaError err_t = cudaGetLastError();

if(err_t != cudaSuccess) {

	fprintf(stdout, "Function failed at %d. Reason: %s\n", __LINE__, cudaGetErrorString(err_t));

}

status = cublasGetVector(w_A * h_A, sizeof(U[0]), d_U, 1, U, 1);

if (status != CUBLAS_STATUS_SUCCESS) {

    fprintf (stderr, "!!!! device access error (read C)\n");

    return EXIT_FAILURE;

}

status = cublasGetVector(w_A * h_A, sizeof(L[0]), d_L, 1, L, 1);

if (status != CUBLAS_STATUS_SUCCESS) {

    fprintf (stderr, "!!!! device access error (read C)\n");

    return EXIT_FAILURE;

}

status = cublasGetError();

if (status != CUBLAS_STATUS_SUCCESS) {

	fprintf (stderr, "!!!! kernel execution error.\n");

	return EXIT_FAILURE;

}

}

global void LU_Decomp(float *L, float *U, unsigned int row_A, unsigned int col_A, unsigned int i, unsigned int pivotRow)

{

unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;

unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

float valcheck=1;

float temp;

valcheck = U[pivotRow * col_A + i];



if (fabs(valcheck) < 1E-21){

 /// error

}

__syncthreads();

if((col == i) && (row >= i)){

	if (row == i)

		L[i * col_A + col] = 1 ; 

	else 

		if	(row == pivotRow)

			L[pivotRow * col_A + col] = U[i * col_A + col] / valcheck;

		else

			L[row * col_A + col] = U[row * col_A + col] / valcheck;

}



__syncthreads();

if (row == pivotRow){

	temp = U[pivotRow * col_A + col];

	__syncthreads();

	U[pivotRow * col_A + col] = U[i * col_A + col];

	__syncthreads();

	U[i * col_A + col] = temp;

	__syncthreads();

}

}

global void LU_2Decomp(float *L, float *U, unsigned int row_A, unsigned int col_A, unsigned int i, unsigned int pivotRow){

unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;

unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

float temp2;



if ((row > i)&&(col >= i)){

	temp2 = U[row * col_A + col];

	__syncthreads();

	U[row * col_A + col] =  temp2 - L[row * col_A + i]*U[i * col_A + col];

	__syncthreads();

}



if((col < i) && (row >= i)){

	if (row == pivotRow){

		temp2 = L[pivotRow * col_A + col];

		L[pivotRow * col_A + col] = L[i * col_A + col];

		L[i * col_A + col] = temp2;

	}

}

__syncthreads();

}

Sorry, I’ve never worked with cublas so I cannot pin-point the error :(
However, I’ve had the same issue before, when I started working with CUDA. It turned out that I was accessing ‘out-of-bound’ memory in my kernel. So, you could also start looking at the code from this perspective.

You haven’t really said what the actual problem is, and your code isn’t all that well instrumented in the host code that is most likely to be causing the problem, which makes things pretty hard.

But let me take a wild stab in the dark and guess that BLOCK_SIZE is 128? If so then it is probably an algorithm design error. Your use of __syncthreads() looks highly suspect to me. __syncthreads() only provides a thread synchronization barrier to threads running within a given block. There is no block level synchronization. I can imagine that the pivot logic will break when your rows span more thaqn one block.

If you are just looking for a fast LU factorization and don’t much care about getting this code to work, then you might be insterested in this thread, which describes a very fast, dense LU factorization which gives over 350 single precision GFLOP/s on a fast GPU and host for large matrices.

The size block is 16, and the problem is that when the matrix is bigger than 128 the kernel do nothing. I mean I got no values in the L matrix and neither in the U.

Thanks a lot for the link. I will check it.

That probably means that the kernels aren’t ever running. You should call cudaGetLastError() after each kernel launch and check the contents of the return value. It might provide more information about why the kernels aren’t running.

I already check the error after every kernel i launched, but I get not error at all. I think the problem is related with the amount of memory, but I really dont know, Im pretty new in CUDA programming.

The code of my programm is attached, hope youc an help me.

Thanks
LU_Decomp.zip (59 KB)

In the code you posted above, you clearly do not:

for (unsigned int i = 0; i < h_A; i++){ //for all the rows

	int pivotRow = cublasIsamax(h_A - i, d_U + i + (i * w_A), w_A); //Maximum of the relative vector

	pivotRow = pivotRow + i - 1; //Correct row number to change

	//printf("%d ", pivotRow);

	dim3 dimBlock( BLOCK_SIZE, BLOCK_SIZE );

	dim3 dimGrid(h_A / dimBlock.x, w_A / dimBlock.y);

	//Call the Kernel

	LU_Decomp <<<dimBlock, dimGrid>>> (d_L, d_U, h_A, w_A, i, pivotRow);

	LU_2Decomp <<<dimBlock, dimGrid>>> (d_L, d_U, h_A, w_A, i, pivotRow);

}

You should inspect the status after every kernel call. That means after you call LU_Decomp, and again after LU_2Decomp, for every iteration of that loop. The error functions only report the status of the last operation before the error function is called. If there is some sort of error in any intermediate operation, it will be lost.

I check the status inside the for, even in the code i post you can check it, but i was doing some test and in the code i just attached the second kernel is in commentar, but it shouldnt be like that

I don’t have access to a system that can build your attached code project, I am afraid.

Hi. I found the source of my problem, basically i was comparing the results from my program with matlab, and i notice that matlab use double precision to do the calculation and I was doing with single precision.

I already chage my code in order to have double precision calculation. I also change the flag to sm_13, but im still having the same error, Should I do anyother change??

May you provide CPU version of your algorithm such that I can check the result in the same C code and

don’t compare it with result of MATLAB?

I wrote this code in order to check the results

void LU_decomp_gold(double *L, double *U, unsigned int row_A, unsigned int col_A){

unsigned int piv = 0;

double max_val = 0;

for (unsigned int i = 0; i < row_A; i++){

	max_val = 0;

	for(unsigned int j = i; j < row_A; j++){

		if (abs(U[j * col_A + i]) > abs(max_val)){

			max_val = U[j * col_A + i];

			piv = j;

		}

	}

	if (piv != i)

	for(unsigned int j = i; j < col_A; j++){

		double temp = U[i *  col_A + j];

		U[i * col_A + j] = U[piv * col_A + j];

		U[piv * col_A + j] = temp;

	}

	for(unsigned int j = i; j < row_A; j++){

		double  temp = U[j * col_A + i];

		L[j *  col_A + i] = temp / max_val;

	}

	for(unsigned int k = i; k < row_A; k++)

		for (unsigned int j = i + 1; j < col_A; j++)

		U[j * col_A + k] = U[j * col_A + k] - L[j * col_A + i]*U[i * col_A + k];

		

	if(piv != i)

	for(unsigned int j = 0; j < i; j++){

		double temp = L[i *  col_A + j];

		L[i * col_A + j] = L[piv * col_A + j];

		L[piv * col_A + j] = temp;

	}	

}

}

void compare_L2err( const double* reference, const double* data,

				const unsigned int len, const double epsilon)

{

//assert(epsilon >=0);



double error = 0;

double ref = 0;

for (unsigned int i = 0; i<len ; ++i){

	double diff = reference[i] - data[i];

	error +=  diff * diff;

	ref +=  reference[i]*reference[i];

}

double normRef = sqrt(ref);

if(fabs(ref) < 1e-7){

	printf("ERROR, reference 12- norm is 0 (machine zero)\n");

}

double normError = sqrt(error);

error = normError / normRef;

bool result = error < epsilon;

printf("L2(reference) = %4.16f \n",normRef);

printf("L2(reference - data) = %4.16f \n",normError);

if(! result){

	printf("Error, relative 12-norm error %4.16f is greater than epsilon %4.16f \n",error,epsilon); 

}

else{

	printf("relative L2-norm error %4.16f is OK \n",error);

}

}

Why i should check the results with matlab??