Dear everyone,
I’m relative new to cuda programming and I’m working on develop Krylov solver (Quasi-minimum residual method) with incomplete LU decomposition result as the preconditioner.
Here is my problem: Ax = b
I perform ILU decomsotion [L,U]=ilu0(A)
Inside the code, I have to solve some additional triangular system:
L*y=v
and
U’*z=wt (where U’ is the transpose of U matrix)
The ilu is implemeted as:
cusparseStatus = cusparseScsrilu0(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, descr, d_valsILU0, d_row, d_col, infoA);
The ilu result is stored in d_valsILU0 (it contains values for both L and U)
I solve the first equation L*v = y by:
cusparseStatus = cusparseScsrsv_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, &floatone, descrL, d_valsILU0, d_row, d_col, infoA, d_vt, d_y);
where descrL indicates the lower triangular.
The calculated result y (d_y means the value of y in device) is correct comparing to MATLAB.
Then I solve another equaiton U’*z = wt by:
cusparseStatus = cusparseScsrsv_solve(cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, N, &floatone, descrU, d_valsILU0, d_row, d_col, info_u, d_wt, d_z);
Here I used CUSPARSE_OPERATION_TRANSPOSE and I’m pretty sure that in the previous analysis stage, I used also CUSPARSE_OPERATION_TRANSPOSE. Here descrU indicate that the fill model for U is upper and the diagonal component is non-unit.
I just cannot get the correct result for this stage comparing to MATLAB.
However, if I solve U*z=wt instead of the transpose of U. I get correct result. I’m really confused why.
In cuda ilu0, it stores the result of L and U together and I’m wondering if it is possible to get L and U separately?
Thanks.
The following is my messy code:
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// CUDA Runtime
#include <cuda_runtime.h>
// Using updated (v2) interfaces for CUBLAS and CUSPARSE
#include <cusparse.h>
#include <cublas_v2.h>
// Utilities and system includes
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
#include <helper_cuda.h> // helper for CUDA error checking
const char *sSDKname = “conjugateGradientPrecond”;
/* Solve Ax=b using the conjugate gradient method a) without any preconditioning, b) using an Incomplete Cholesky preconditioner and c) using an ILU0 preconditioner. */
int main(int argc, char **argv)
{
const int max_iter = 100;
int k, M = 0, N = 0, nz = 0, *I = NULL, *J = NULL;
int nrow, ncol;//size of row and col;
int *d_col, *d_row;
int qatest = 0;
const float tol = 1e-12f;
float r0, r1, alpha, beta;
float *d_val, *d_x;
float *d_zm1, *d_zm2, *d_rm2;
float *d_p, *d_omega;
float *val = NULL;
float *d_valsILU0;
float *valsILU0;
float rsum, diff, err = 0.0;
float qaerr1, qaerr2 = 0.0;
float dot, numerator, denominator, nalpha;
const float floatone = 1.0;
const float floatonem = -1.0;
const float floatzero = 0.0;
int nErrors = 0;
FILE *fp;
//new declared parameters for qmr
int flag, imin;
float n2b, tolb, normr, normrmin, rho;
float *d_rhs, *d_residual;
float *x, *xmin, *rhs, *d_Ax;
float *d_vt, *resvec, *d_y, *d_z, *d_wt;
float *tmp, *d_tmp;
printf("conjugateGradientPrecond starting...\n");
/* QA testing mode */
if (checkCmdLineFlag(argc, (const char **)argv, "qatest"))
{
qatest = 1;
}
/* This will pick the best possible CUDA capable device */
cudaDeviceProp deviceProp;
int devID = findCudaDevice(argc, (const char **)argv);
printf("GPU selected Device ID = %d \n", devID);
if (devID < 0)
{
printf("Invalid GPU device %d selected, exiting...\n", devID);
exit(EXIT_SUCCESS);
}
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
/* Statistics about the GPU device */
printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);
int version = (deviceProp.major * 0x10 + deviceProp.minor);
if (version < 0x11)
{
printf("%s: requires a minimum CUDA compute 1.1 capability\n", sSDKname);
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
cudaDeviceReset();
exit(EXIT_SUCCESS);
}
/*Read Input Matrix*/
fp = fopen("Input","r");
fscanf(fp,"%d %d %d\n",&N, &nrow, &ncol);
M = N;
nz = ncol;
I = (int *)malloc(sizeof(int)*(nrow)); // csr row pointers for matrix A
J = (int *)malloc(sizeof(int)*ncol); // csr column indices for matrix A
val = (float *)malloc(sizeof(float)*nz);
x = (float *)malloc(sizeof(float)*N);
xmin = (float *)malloc(sizeof(float)*N);
rhs = (float *)malloc(sizeof(float)*N);
tmp = (float *)malloc(sizeof(float)*N);
for (int i=0;i<nrow;i++)
fscanf(fp,"%d\n",&(I[i]));
for (int i=0;i<nz;i++)
fscanf(fp,"%d\n",&(J[i]));
for (int i=0;i<nz;i++)
fscanf(fp,"%f\n",&(val[i]));
for (int i=0;i<N;i++)
fscanf(fp,"%f\n",&(rhs[i]));
fclose(fp);
for (int i = 0; i < N; i++)
{
x[i] = 0.0; // Initial approximation of solution
}
//write I J val x into file for check
fp = fopen("I","w");
for (int i=0;i<N+1;i++)
{
fprintf(fp,"%d\n",(I[i]));
}
fclose(fp);
fp = fopen("J","w");
for (int i=0;i<nz;i++)
{
fprintf(fp,"%d\n",(J[i]));
}
fclose(fp);
fp = fopen("val","w");
for (int i=0;i<nz;i++)
{
fprintf(fp,"%f\n",val[i]);
}
fclose(fp);
fp = fopen("b","w");
for (int i=0;i<N;i++)
{
fprintf(fp,"%f\n",rhs[i]);
}
fclose(fp);
/* Create CUBLAS context */
cublasHandle_t cublasHandle = 0;
cublasStatus_t cublasStatus;
cublasStatus = cublasCreate(&cublasHandle);
checkCudaErrors(cublasStatus);
/* Create CUSPARSE context */
cusparseHandle_t cusparseHandle = 0;
cusparseStatus_t cusparseStatus;
cusparseStatus = cusparseCreate(&cusparseHandle);
checkCudaErrors(cusparseStatus);
/* Description of the A matrix*/
cusparseMatDescr_t descr = 0;
cusparseStatus = cusparseCreateMatDescr(&descr);
checkCudaErrors(cusparseStatus);
/* Define the properties of the matrix */
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
/* Allocate required memory */
checkCudaErrors(cudaMalloc((void **)&d_col, nz*sizeof(int)));
checkCudaErrors(cudaMalloc((void **)&d_row, (N+1)*sizeof(int)));
checkCudaErrors(cudaMalloc((void **)&d_val, nz*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_x, N*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_y, N*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_rhs, N*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_residual, N*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_p, N*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_omega, N*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_Ax, N*sizeof(float)));//allocate A*x vector in GPU
checkCudaErrors(cudaMalloc((void **)&d_vt, N*sizeof(float)));//allocate vt vector in GPU
checkCudaErrors(cudaMalloc((void **)&d_y, N*sizeof(float)));//allocate y vector in GPU
checkCudaErrors(cudaMalloc((void **)&d_wt, N*sizeof(float)));//allocate wt vector in GPU
checkCudaErrors(cudaMalloc((void **)&d_z, N*sizeof(float)));//allocate z vector in GPU
checkCudaErrors(cudaMalloc((void **)&d_tmp, N*sizeof(float)));//allocate z vector in GPU
cudaMemcpy(d_col, J, nz*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_row, I, (N+1)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_val, val, nz*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_rhs, rhs, N*sizeof(float), cudaMemcpyHostToDevice);
/* Preconditioned Conjugate Gradient using ILU.
--------------------------------------------
Follows the description by Golub & Van Loan, "Matrix Computations 3rd ed.", Algorithm 10.3.1 */
printf("\nConvergence of conjugate gradient using incomplete LU preconditioning: \n");
int nzILU0 = 2*N-1;
valsILU0 = (float *) malloc(nz*sizeof(float));
checkCudaErrors(cudaMalloc((void **)&d_valsILU0, nz*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_zm1, (N)*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_zm2, (N)*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_rm2, (N)*sizeof(float)));
/* create the analysis info object for the A matrix */
cusparseSolveAnalysisInfo_t infoA = 0;
cusparseStatus = cusparseCreateSolveAnalysisInfo(&infoA);
checkCudaErrors(cusparseStatus);
/* Perform the analysis for the Non-Transpose case */
cusparseStatus = cusparseScsrsv_analysis(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
N, nz, descr, d_val, d_row, d_col, infoA);
checkCudaErrors(cusparseStatus);
/* Copy A data to ILU0 vals as input*/
cudaMemcpy(d_valsILU0, d_val, nz*sizeof(float), cudaMemcpyDeviceToDevice);
/* generate the Incomplete LU factor H for the matrix A using cudsparseScsrilu0 */
//input matrix d_valsILU0 which is exactly A since A is copied to d_valsILU0;
//ILU output is assigned to d_valsILU0
cusparseStatus = cusparseScsrilu0(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, descr, d_valsILU0, d_row, d_col, infoA);
//check if ILU result same as MATLAB (COMMENT AFTER TEST)
cudaMemcpy(valsILU0, d_valsILU0, nz*sizeof(float), cudaMemcpyDeviceToHost);
for (int i=0;i<nz;i++)
printf("valsILU0[%d]=%f\n",i,valsILU0[i]);
for (int i=0;i<nz;i++)
printf("val[%d]=%f\n",i,val[i]);
checkCudaErrors(cusparseStatus);
/* Create info objects for the ILU0 preconditioner */
cusparseSolveAnalysisInfo_t info_u;
cusparseCreateSolveAnalysisInfo(&info_u);
cusparseMatDescr_t descrL = 0;
cusparseStatus = cusparseCreateMatDescr(&descrL);
cusparseSetMatType(descrL,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descrL,CUSPARSE_INDEX_BASE_ZERO);
cusparseSetMatFillMode(descrL, CUSPARSE_FILL_MODE_LOWER);
cusparseSetMatDiagType(descrL, CUSPARSE_DIAG_TYPE_UNIT);
cusparseMatDescr_t descrU = 0;
cusparseStatus = cusparseCreateMatDescr(&descrU);
cusparseSetMatType(descrU,CUSPARSE_MATRIX_TYPE_GENERAL);
//cusparseSetMatType(descrU,CUSPARSE_MATRIX_TYPE_TRIANGULAR);//changed for test
cusparseSetMatIndexBase(descrU,CUSPARSE_INDEX_BASE_ZERO);
cusparseSetMatFillMode(descrU, CUSPARSE_FILL_MODE_UPPER);
cusparseSetMatDiagType(descrU, CUSPARSE_DIAG_TYPE_NON_UNIT);
cusparseStatus = cusparseScsrsv_analysis(cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, N, nz, descrU, d_val, d_row, d_col, info_u);
checkCudaErrors(cudaMemcpy(d_rhs, rhs, N*sizeof(float), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice));
cublasSnrm2(cublasHandle, N, d_rhs, 1, &n2b); // norm of vector b;
//printf("n2b is %f\n",n2b);
flag = 1;
for (int i = 0; i < N; i++)
{
xmin[i] = x[i]; // Initial approximation of solution
}
imin = 0;
tolb = tol*n2b;
cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &floatone, descr, d_val, d_row, d_col, d_x, &floatzero, d_Ax); //d_Ax=alpha*A*x+beta*d_Ax = A*x;
cudaMemcpy(d_residual, d_rhs, N*sizeof(float), cudaMemcpyDeviceToDevice);//copy b to residual and apply next line: axpy
cublasSaxpy(cublasHandle, N, &floatonem, d_Ax, 1, d_residual, 1); //d_residual = b-Ax=d_rhs-d_Ax
cublasSnrm2(cublasHandle, N, d_residual, 1, &normr); //calculate normr;
cudaMemcpy(d_vt, d_residual, N*sizeof(float), cudaMemcpyDeviceToDevice);//Assign residual to vt
resvec = (float *)malloc(sizeof(float)*(max_iter+1)); //allocate resvec
resvec[0] = normr;
normrmin = normr;
/*for (int i=0;i<nz;i++)
printf("val[i]=%f\n",val[i]);*/
cusparseStatus = cusparseScsrsv_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, &floatone, descrL,
d_valsILU0, d_row, d_col, infoA, d_vt, d_y);
//sparse triangular solver op(A)*y=a*x; here solver L*d_y=d_vt
//IGNORED CHECK THE INFINITY NORM IN MATLAB. TO BE ADDED!!!!!
//check the result of d_y (COMMENT after use)
cudaMemcpy(tmp, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
for (int i=0;i<N;i++)
printf("y[%d]=%f\n",i,tmp[i]);
for (int i=0;i<N;i++)
tmp[i]=0.0;
cudaMemcpy(d_tmp, tmp, N*sizeof(float), cudaMemcpyHostToDevice);
cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &floatone, descrL, d_valsILU0, d_row, d_col, d_y, &floatzero, d_tmp);
//cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &floatone, descr, d_val, d_row, d_col, d_y, &floatzero, d_tmp);
cudaMemcpy(tmp, d_tmp, N*sizeof(float), cudaMemcpyDeviceToHost);
for (int i=0;i<N;i++)
printf("vt_predicted[%d]=%f\n",i,tmp[i]);
cublasSnrm2(cublasHandle, N, d_y, 1, &rho); // norm of vector y; rho = norm(y);
cudaMemcpy(d_wt, d_residual, N*sizeof(float), cudaMemcpyDeviceToDevice);
//check the result of d_z (COMMENT after use)
cudaMemcpy(tmp, d_wt, N*sizeof(float), cudaMemcpyDeviceToHost);
for (int i=0;i<N;i++)
printf("wt[%d]=%f\n",i,tmp[i]);
cusparseStatus = cusparseScsrsv_solve(cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, N, &floatone, descrU,
d_valsILU0, d_row, d_col, info_u, d_wt, d_z);
//sparse triangular solver op(A)y=ax; here solver U’*d_z=d_wt
//IGNORED CHECK THE INFINITY NORM IN MATLAB. TO BE ADDED!!!
//check the result of d_z (COMMENT after use)
cudaMemcpy(tmp, d_z, N*sizeof(float), cudaMemcpyDeviceToHost);
for (int i=0;i<N;i++)
printf("z[%d]=%f\n",i,tmp[i]);
cudaMemcpy(valsILU0, d_valsILU0, nz*sizeof(float), cudaMemcpyDeviceToHost);
for (int i=0;i<nz;i++)
printf("valsILU0[%d]=%f\n",i,valsILU0[i]);
k = 0;
cublasSdot(cublasHandle, N, d_rhs, 1, d_rhs, 1, &r1);
while (r1 > tol*tol && k <= max_iter)
{
// Forward Solve, we can re-use infoA since the sparsity pattern of A matches that of L
cusparseStatus = cusparseScsrsv_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, &floatone, descrL,
d_valsILU0, d_row, d_col, infoA, d_rhs, d_y);
checkCudaErrors(cusparseStatus);
// Back Substitution
cusparseStatus = cusparseScsrsv_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, &floatone, descrU,
d_valsILU0, d_row, d_col, info_u, d_y, d_zm1);
checkCudaErrors(cusparseStatus);
k++;
if (k == 1)
{
cublasScopy(cublasHandle, N, d_zm1, 1, d_p, 1);
}
else
{
cublasSdot(cublasHandle, N, d_rhs, 1, d_zm1, 1, &numerator);
cublasSdot(cublasHandle, N, d_rm2, 1, d_zm2, 1, &denominator);
beta = numerator/denominator;
cublasSscal(cublasHandle, N, &beta, d_p, 1);
cublasSaxpy(cublasHandle, N, &floatone, d_zm1, 1, d_p, 1) ;
}
cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nzILU0, &floatone, descrU, d_val, d_row, d_col, d_p, &floatzero, d_omega);
cublasSdot(cublasHandle, N, d_rhs, 1, d_zm1, 1, &numerator);
cublasSdot(cublasHandle, N, d_p, 1, d_omega, 1, &denominator);
alpha = numerator / denominator;
cublasSaxpy(cublasHandle, N, &alpha, d_p, 1, d_x, 1);
cublasScopy(cublasHandle, N, d_rhs, 1, d_rm2, 1);
cublasScopy(cublasHandle, N, d_zm1, 1, d_zm2, 1);
nalpha = -alpha;
cublasSaxpy(cublasHandle, N, &nalpha, d_omega, 1, d_rhs, 1);
cublasSdot(cublasHandle, N, d_rhs, 1, d_rhs, 1, &r1);
}
printf(" iteration = %3d, residual = %e \n", k, sqrt(r1));
cudaMemcpy(x, d_x, N*sizeof(float), cudaMemcpyDeviceToHost);
/* check result */
err = 0.0;
for (int i = 0; i < N; i++)
{
rsum = 0.0;
for (int j = I[i]; j < I[i+1]; j++)
{
rsum += val[j]*x[J[j]];
}
diff = fabs(rsum - rhs[i]);
if (diff > err)
{
err = diff;
}
}
printf(" Convergence Test: %s \n", (k <= max_iter) ? "OK" : "FAIL");
nErrors += (k > max_iter) ? 1 : 0;
qaerr2 = err;
/* Destroy parameters */
cusparseDestroySolveAnalysisInfo(infoA);
cusparseDestroySolveAnalysisInfo(info_u);
/* Destroy contexts */
cusparseDestroy(cusparseHandle);
cublasDestroy(cublasHandle);
/*write output before free memory*/
fp = fopen("x","w");
for (int i=0;i<N;i++)
{
fprintf(fp,"%f\n",x[i]);
}
fclose(fp);
/* Free device memory */
free(I);
free(J);
free(val);
free(x);
free(rhs);
free(valsILU0);
cudaFree(d_col);
cudaFree(d_row);
cudaFree(d_val);
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_rhs);
cudaFree(d_p);
cudaFree(d_omega);
cudaFree(d_valsILU0);
cudaFree(d_zm1);
cudaFree(d_zm2);
cudaFree(d_rm2);
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
cudaDeviceReset();
printf(" Test Summary:\n");
printf(" Counted total of %d errors\n", nErrors);
//printf(" qaerr1 = %f qaerr2 = %f\n\n", fabs(qaerr1), fabs(qaerr2));
exit((nErrors == 0 &&fabs(qaerr2)<1e-5 && fabs(qaerr2) < 1e-5 ? EXIT_SUCCESS : EXIT_FAILURE));
}