cusparseDcsrsv_solve does not even write in all result vector positions

I’m trying to figure out, why my LU preconditioner does not work. Besides possible other problems, I found the following:

cusparseDcsrsv_solve does only overwrite array elements in the result vector out → ptr from positions 0 to 5. The rest remains unchanged.

As you can see, I set LU vector valILU0 as well as input and output vectors arbitrarily for testing. But also with the realistig LU decomposition (see attached below), I come to the same result. It seems, that cusparseDcsrsv_solve stops early for some reason. The problem is not bound to specific input data.

void GPUMatrix :: solve(GPUVector * in, GPUVector * out){
		
		double val = 1;

		for(int i = 0; i < this -> matGPU.row; i ++){
			checkCudaErrors(cudaMemcpy((void*)&in -> ptr[i], (void*)&val, sizeof(double), cudaMemcpyHostToDevice));
			checkCudaErrors(cudaMemcpy((void*)&out -> ptr[i], (void*)&val, sizeof(double), cudaMemcpyHostToDevice));
		}
		
		val = 1;
		
		for(int i = 0; i < this -> matGPU.nonzero; i ++){
			//val = 1 * (double)i;
			checkCudaErrors(cudaMemcpy((void*)&valILU0[i], (void*)&val, sizeof(double), cudaMemcpyHostToDevice));
		}
		
		
        checkCudaErrors(cusparseDcsrsv_solve(
			this -> cusparseHandle, 
			CUSPARSE_OPERATION_NON_TRANSPOSE, 
			this -> matGPU.row, 
			&one, 
			this -> uDescr,
			this -> valILU0, 
			this -> matGPU.csrRowPtr, 
			this -> matGPU.csrColIdx, 
			this -> uInfo, 
			in -> ptr, 
			out -> ptr
		));
		
		
		cudaDeviceSynchronize();
		//usleep(100000);		
		
}

This is the full code for LU decomposition (method “factor”) and solve (method “solve”).

void GPUMatrix :: factor(){

	cudaDeviceSynchronize();
	usleep(100000);
	
    this -> lDescr = 0;
    checkCudaErrors(cusparseCreateMatDescr(&this -> lDescr));
    cusparseSetMatType(this -> lDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(this -> lDescr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatFillMode(this -> lDescr, CUSPARSE_FILL_MODE_LOWER);
    cusparseSetMatDiagType(this -> lDescr, CUSPARSE_DIAG_TYPE_UNIT);	
	
    this -> uDescr = 0;
    checkCudaErrors(cusparseCreateMatDescr(&this -> uDescr));
    cusparseSetMatType(this -> uDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(this -> uDescr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatFillMode(this -> uDescr, CUSPARSE_FILL_MODE_UPPER);
    cusparseSetMatDiagType(this -> uDescr, CUSPARSE_DIAG_TYPE_NON_UNIT);	
	
	cudaDeviceSynchronize();
	usleep(100000);	
	
	checkCudaErrors(cusparseCreateSolveAnalysisInfo(&this -> matInfo));
	checkCudaErrors(cusparseCreateSolveAnalysisInfo(&this -> lInfo));
	checkCudaErrors(cusparseCreateSolveAnalysisInfo(&this -> uInfo));
	
	
	int nonzero = this -> matGPU.nonzero;
	checkCudaErrors(cudaMalloc((void **) &this -> tmpOut, this -> matGPU.row * sizeof(double)));
	checkCudaErrors(cudaMalloc((void **) &this -> valILU0, nonzero * sizeof(double)));
    checkCudaErrors(cudaMemcpy(this -> valILU0, this -> matGPU.data, nonzero * sizeof(double), cudaMemcpyDeviceToDevice));
	
	cudaDeviceSynchronize();
	usleep(100000);	
	
	checkCudaErrors(cusparseDcsrsv_analysis(
		this -> cusparseHandle, 
		CUSPARSE_OPERATION_NON_TRANSPOSE,
		this -> matGPU.row,
		nonzero, 
		this -> matDescr, 
		this -> matGPU.data, 
		this -> matGPU.csrRowPtr, 
		this -> matGPU.csrColIdx, 
		this -> matInfo
	));

	cudaDeviceSynchronize();
	usleep(100000);

    checkCudaErrors(cusparseDcsrilu0(
		this -> cusparseHandle, 
		CUSPARSE_OPERATION_NON_TRANSPOSE, 
		this -> matGPU.row,
		this -> matDescr,
		this -> valILU0, 
		this -> matGPU.csrRowPtr, 
		this -> matGPU.csrColIdx, 
		this -> matInfo
	));
	
	cudaDeviceSynchronize();
	usleep(100000);
	
	checkCudaErrors(cusparseDcsrsv_analysis(
		this -> cusparseHandle, 
		CUSPARSE_OPERATION_NON_TRANSPOSE,
		this -> matGPU.row,
		nonzero, 
		this -> lDescr, 
		this -> matGPU.data, 
		this -> matGPU.csrRowPtr, 
		this -> matGPU.csrColIdx, 
		this -> lInfo
	));
	
	checkCudaErrors(cusparseDcsrsv_analysis(
		this -> cusparseHandle, 
		CUSPARSE_OPERATION_NON_TRANSPOSE,
		this -> matGPU.row,
		nonzero, 
		this -> uDescr, 
		this -> matGPU.data, 
		this -> matGPU.csrRowPtr, 
		this -> matGPU.csrColIdx, 
		this -> uInfo
	));
	
	cudaDeviceSynchronize();
	usleep(100000);

	MatrixFileIO :: addDebugOutput("matCPUdata", nonzero, 1, this -> matHost.data);
	MatrixFileIO :: addDebugOutputGPU("matGPUdata", nonzero, 1, this -> matGPU.data);	
	MatrixFileIO :: addDebugOutputGPU("valILU0", nonzero, 1, this -> valILU0);
	MatrixFileIO :: saveDebug("dbg_solve_subdomain.bin");	
	
	this -> factorized = 1;

}

void GPUMatrix :: solve(GPUVector * in, GPUVector * out){
		
		
		cudaDeviceSynchronize();	
		
        checkCudaErrors(cusparseDcsrsv_solve(
			this -> cusparseHandle,
			CUSPARSE_OPERATION_NON_TRANSPOSE,
			this -> matGPU.row,
			&one,
			this -> lDescr,
			this -> valILU0, 
			this -> matGPU.csrRowPtr, 
			this -> matGPU.csrColIdx, 
			this -> lInfo, 
			in -> ptr, 
			this -> tmpOut
		));
		*/
		
		cudaDeviceSynchronize();		
		
		
        checkCudaErrors(cusparseDcsrsv_solve(
			this -> cusparseHandle, 
			CUSPARSE_OPERATION_NON_TRANSPOSE, 
			this -> matGPU.row, 
			&one, 
			this -> uDescr,
			this -> valILU0, 
			this -> matGPU.csrRowPtr, 
			this -> matGPU.csrColIdx, 
			this -> uInfo, 
			this -> tmpOut, 
			out -> ptr
		));
		
		
		cudaDeviceSynchronize();
		//usleep(100000);		
		
}

Ok, it seems that this only happens when the ILU decomposition is “bad”. BUT: The only “good” decompositions I get is from a unit matrix.

What could go wrong when computing the ILU decomposition? I have the following suspicion:

  • My algorithm for building the CSR matrix is wrong? However, the same algorithm works perfectly fine when using a matrix-vector-product by cusparse.
  • ILU decomposition does not work for some matrices? But I get good looking results when decomposing the very same matrices with Matlab’s ilu(…)

So currently I have absolut no clue what’s wrong and I’d really appreciate every hint…