I’m trying to figure out, why my LU preconditioner does not work. Besides possible other problems, I found the following:
cusparseDcsrsv_solve does only overwrite array elements in the result vector out → ptr from positions 0 to 5. The rest remains unchanged.
As you can see, I set LU vector valILU0 as well as input and output vectors arbitrarily for testing. But also with the realistig LU decomposition (see attached below), I come to the same result. It seems, that cusparseDcsrsv_solve stops early for some reason. The problem is not bound to specific input data.
void GPUMatrix :: solve(GPUVector * in, GPUVector * out){
double val = 1;
for(int i = 0; i < this -> matGPU.row; i ++){
checkCudaErrors(cudaMemcpy((void*)&in -> ptr[i], (void*)&val, sizeof(double), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy((void*)&out -> ptr[i], (void*)&val, sizeof(double), cudaMemcpyHostToDevice));
}
val = 1;
for(int i = 0; i < this -> matGPU.nonzero; i ++){
//val = 1 * (double)i;
checkCudaErrors(cudaMemcpy((void*)&valILU0[i], (void*)&val, sizeof(double), cudaMemcpyHostToDevice));
}
checkCudaErrors(cusparseDcsrsv_solve(
this -> cusparseHandle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
this -> matGPU.row,
&one,
this -> uDescr,
this -> valILU0,
this -> matGPU.csrRowPtr,
this -> matGPU.csrColIdx,
this -> uInfo,
in -> ptr,
out -> ptr
));
cudaDeviceSynchronize();
//usleep(100000);
}
This is the full code for LU decomposition (method “factor”) and solve (method “solve”).
void GPUMatrix :: factor(){
cudaDeviceSynchronize();
usleep(100000);
this -> lDescr = 0;
checkCudaErrors(cusparseCreateMatDescr(&this -> lDescr));
cusparseSetMatType(this -> lDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(this -> lDescr, CUSPARSE_INDEX_BASE_ZERO);
cusparseSetMatFillMode(this -> lDescr, CUSPARSE_FILL_MODE_LOWER);
cusparseSetMatDiagType(this -> lDescr, CUSPARSE_DIAG_TYPE_UNIT);
this -> uDescr = 0;
checkCudaErrors(cusparseCreateMatDescr(&this -> uDescr));
cusparseSetMatType(this -> uDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(this -> uDescr, CUSPARSE_INDEX_BASE_ZERO);
cusparseSetMatFillMode(this -> uDescr, CUSPARSE_FILL_MODE_UPPER);
cusparseSetMatDiagType(this -> uDescr, CUSPARSE_DIAG_TYPE_NON_UNIT);
cudaDeviceSynchronize();
usleep(100000);
checkCudaErrors(cusparseCreateSolveAnalysisInfo(&this -> matInfo));
checkCudaErrors(cusparseCreateSolveAnalysisInfo(&this -> lInfo));
checkCudaErrors(cusparseCreateSolveAnalysisInfo(&this -> uInfo));
int nonzero = this -> matGPU.nonzero;
checkCudaErrors(cudaMalloc((void **) &this -> tmpOut, this -> matGPU.row * sizeof(double)));
checkCudaErrors(cudaMalloc((void **) &this -> valILU0, nonzero * sizeof(double)));
checkCudaErrors(cudaMemcpy(this -> valILU0, this -> matGPU.data, nonzero * sizeof(double), cudaMemcpyDeviceToDevice));
cudaDeviceSynchronize();
usleep(100000);
checkCudaErrors(cusparseDcsrsv_analysis(
this -> cusparseHandle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
this -> matGPU.row,
nonzero,
this -> matDescr,
this -> matGPU.data,
this -> matGPU.csrRowPtr,
this -> matGPU.csrColIdx,
this -> matInfo
));
cudaDeviceSynchronize();
usleep(100000);
checkCudaErrors(cusparseDcsrilu0(
this -> cusparseHandle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
this -> matGPU.row,
this -> matDescr,
this -> valILU0,
this -> matGPU.csrRowPtr,
this -> matGPU.csrColIdx,
this -> matInfo
));
cudaDeviceSynchronize();
usleep(100000);
checkCudaErrors(cusparseDcsrsv_analysis(
this -> cusparseHandle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
this -> matGPU.row,
nonzero,
this -> lDescr,
this -> matGPU.data,
this -> matGPU.csrRowPtr,
this -> matGPU.csrColIdx,
this -> lInfo
));
checkCudaErrors(cusparseDcsrsv_analysis(
this -> cusparseHandle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
this -> matGPU.row,
nonzero,
this -> uDescr,
this -> matGPU.data,
this -> matGPU.csrRowPtr,
this -> matGPU.csrColIdx,
this -> uInfo
));
cudaDeviceSynchronize();
usleep(100000);
MatrixFileIO :: addDebugOutput("matCPUdata", nonzero, 1, this -> matHost.data);
MatrixFileIO :: addDebugOutputGPU("matGPUdata", nonzero, 1, this -> matGPU.data);
MatrixFileIO :: addDebugOutputGPU("valILU0", nonzero, 1, this -> valILU0);
MatrixFileIO :: saveDebug("dbg_solve_subdomain.bin");
this -> factorized = 1;
}
void GPUMatrix :: solve(GPUVector * in, GPUVector * out){
cudaDeviceSynchronize();
checkCudaErrors(cusparseDcsrsv_solve(
this -> cusparseHandle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
this -> matGPU.row,
&one,
this -> lDescr,
this -> valILU0,
this -> matGPU.csrRowPtr,
this -> matGPU.csrColIdx,
this -> lInfo,
in -> ptr,
this -> tmpOut
));
*/
cudaDeviceSynchronize();
checkCudaErrors(cusparseDcsrsv_solve(
this -> cusparseHandle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
this -> matGPU.row,
&one,
this -> uDescr,
this -> valILU0,
this -> matGPU.csrRowPtr,
this -> matGPU.csrColIdx,
this -> uInfo,
this -> tmpOut,
out -> ptr
));
cudaDeviceSynchronize();
//usleep(100000);
}