Hi,
I am trying to solve a dense linear equation. I need to use float precision as my other parts of program uses float. However, I tried to use the sample code provided by cuda_sample and found that cusolverDnDpotrf does not support float. I checked the API, float is support. Anyone can provide some insight what is happening here. I pasted the sample code below use double precision, but change double to float does not work. Thank you!
/*
- Copyright 2015 NVIDIA Corporation. All rights reserved.
- Please refer to the NVIDIA end user license agreement (EULA) associated
- with this source code for terms and conditions that govern your use of
- this software. Any use, reproduction, disclosure, or distribution of
- this software and related documentation outside the terms of the EULA
- is strictly prohibited.
*/
/*
- Test three linear solvers, including Cholesky, LU and QR.
- The user has to prepare a sparse matrix of “matrix market format” (with extension .mtx).
- For example, the user can download matrices in Florida Sparse Matrix Collection.
- (http://www.cise.ufl.edu/research/sparse/matrices/)
- The user needs to choose a solver by switch -R and
- to provide the path of the matrix by switch -F, then
- the program solves
-
A*x = b where b = ones(m,1)
- and reports relative error
-
|b-A*x|/(|A|*|x|)
- The elapsed time is also reported so the user can compare efficiency of different solvers.
- How to use
-
./cuSolverDn_LinearSolver // Default: cholesky
-
./cuSolverDn_LinearSolver -R=chol -filefile> // cholesky factorization
-
./cuSolverDn_LinearSolver -R=lu -file<file> // LU with partial pivoting
-
./cuSolverDn_LinearSolver -R=qr -file<file> // QR factorization
- Remark: the absolute error on solution x is meaningless without knowing condition number of A.
-
The relative error on residual should be close to machine zero, i.e. 1.e-15.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include <cuda_runtime.h>
#include “cublas_v2.h”
#include “cusolverDn.h”
#include “helper_cuda.h”
#include “helper_cusolver.h”
template
int loadMMSparseMatrix(
char *filename,
char elem_type,
bool csrFormat,
int *m,
int *n,
int *nnz,
T_ELEM **aVal,
int **aRowInd,
int **aColInd,
int extendSymMatrix);
void UsageDN(void)
{
printf( “\n”);
printf( “-h : display this help\n”);
printf( “-R= : choose a linear solver\n”);
printf( " chol (cholesky factorization), this is default\n");
printf( " qr (QR factorization)\n");
printf( " lu (LU factorization)\n");
printf( “-lda= : leading dimension of A , m by default\n”);
printf( “-file=: filename containing a matrix in MM format\n”);
printf( “-device=<device_id> : <device_id> if want to run on specific GPU\n”);
exit( 0 );
}
/*
- solve A*x = b by Cholesky factorization
*/
int linearSolverCHOL(
cusolverDnHandle_t handle,
int n,
const double *Acopy,
int lda,
const double *b,
double *x)
{
int bufferSize = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
int h_info = 0;
double start, stop;
double time_solve;
cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
checkCudaErrors(cusolverDnDpotrf_bufferSize(handle, uplo, n, (double*)Acopy, lda, &bufferSize));
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double)*bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double)*lda*n));
// prepare a copy of A because potrf will overwrite A with L
checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
start = second();
start = second();
checkCudaErrors(cusolverDnDpotrf(handle, uplo, n, A, lda, buffer, bufferSize, info));
checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
if ( 0 != h_info ){
fprintf(stderr, "Error: Cholesky factorization failed\n");
}
checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cusolverDnDpotrs(handle, uplo, n, 1, A, lda, x, n, info));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();
time_solve = stop - start;
fprintf (stdout, "timing: cholesky = %10.6f sec\n", time_solve);
if (info ) { checkCudaErrors(cudaFree(info)); }
if (buffer) { checkCudaErrors(cudaFree(buffer)); }
if (A ) { checkCudaErrors(cudaFree(A)); }
return 0;
}
/*
- solve A*x = b by LU with partial pivoting
*/
int linearSolverLU(
cusolverDnHandle_t handle,
int n,
const double *Acopy,
int lda,
const double *b,
double *x)
{
int bufferSize = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
int *ipiv = NULL; // pivoting sequence
int h_info = 0;
double start, stop;
double time_solve;
checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n, (double*)Acopy, lda, &bufferSize));
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double)*bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double)*lda*n));
checkCudaErrors(cudaMalloc(&ipiv, sizeof(int)*n));
// prepare a copy of A because getrf will overwrite A with L
checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
start = second();
start = second();
checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info));
checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
if ( 0 != h_info ){
fprintf(stderr, "Error: LU factorization failed\n");
}
checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();
time_solve = stop - start;
fprintf (stdout, "timing: LU = %10.6f sec\n", time_solve);
if (info ) { checkCudaErrors(cudaFree(info )); }
if (buffer) { checkCudaErrors(cudaFree(buffer)); }
if (A ) { checkCudaErrors(cudaFree(A)); }
if (ipiv ) { checkCudaErrors(cudaFree(ipiv));}
return 0;
}
/*
- solve A*x = b by QR
*/
int linearSolverQR(
cusolverDnHandle_t handle,
int n,
const double *Acopy,
int lda,
const double *b,
double *x)
{
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
int bufferSize = 0;
int bufferSize_geqrf = 0;
int bufferSize_ormqr = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
double *tau = NULL;
int h_info = 0;
double start, stop;
double time_solve;
const double one = 1.0;
checkCudaErrors(cublasCreate(&cublasHandle));
checkCudaErrors(cusolverDnDgeqrf_bufferSize(handle, n, n, (double*)Acopy, lda, &bufferSize_geqrf));
checkCudaErrors(cusolverDnDormqr_bufferSize(
handle,
CUBLAS_SIDE_LEFT,
CUBLAS_OP_T,
n,
1,
n,
A,
lda,
NULL,
x,
n,
&bufferSize_ormqr));
printf("buffer_geqrf = %d, buffer_ormqr = %d \n", bufferSize_geqrf, bufferSize_ormqr);
bufferSize = (bufferSize_geqrf > bufferSize_ormqr)? bufferSize_geqrf : bufferSize_ormqr ;
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double)*bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double)*lda*n));
checkCudaErrors(cudaMalloc ((void**)&tau, sizeof(double)*n));
// prepare a copy of A because getrf will overwrite A with L
checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double)ldan, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
start = second();
start = second();
// compute QR factorization
checkCudaErrors(cusolverDnDgeqrf(handle, n, n, A, lda, tau, buffer, bufferSize, info));
checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
if ( 0 != h_info ){
fprintf(stderr, "Error: LU factorization failed\n");
}
checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*n, cudaMemcpyDeviceToDevice));
// compute Q^T*b
checkCudaErrors(cusolverDnDormqr(
handle,
CUBLAS_SIDE_LEFT,
CUBLAS_OP_T,
n,
1,
n,
A,
lda,
tau,
x,
n,
buffer,
bufferSize,
info));
// x = R \ Q^T*b
checkCudaErrors(cublasDtrsm(
cublasHandle,
CUBLAS_SIDE_LEFT,
CUBLAS_FILL_MODE_UPPER,
CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT,
n,
1,
&one,
A,
lda,
x,
n));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();
time_solve = stop - start;
fprintf (stdout, "timing: QR = %10.6f sec\n", time_solve);
if (cublasHandle) { checkCudaErrors(cublasDestroy(cublasHandle)); }
if (info ) { checkCudaErrors(cudaFree(info )); }
if (buffer) { checkCudaErrors(cudaFree(buffer)); }
if (A ) { checkCudaErrors(cudaFree(A)); }
if (tau ) { checkCudaErrors(cudaFree(tau)); }
return 0;
}
void parseCommandLineArguments(int argc, char *argv, struct testOpts &opts)
{
memset(&opts, 0, sizeof(opts));
if (checkCmdLineFlag(argc, (const char **)argv, "-h"))
{
UsageDN();
}
if (checkCmdLineFlag(argc, (const char **)argv, "R"))
{
char *solverType = NULL;
getCmdLineArgumentString(argc, (const char **)argv, "R", &solverType);
if (solverType)
{
if ((STRCASECMP(solverType, "chol") != 0) && (STRCASECMP(solverType, "lu") != 0) && (STRCASECMP(solverType, "qr") != 0))
{
printf("\nIncorrect argument passed to -R option\n");
UsageDN();
}
else
{
opts.testFunc = solverType;
}
}
}
if (checkCmdLineFlag(argc, (const char **)argv, "file"))
{
char *fileName = 0;
getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName);
if (fileName)
{
opts.sparse_mat_filename = fileName;
}
else
{
printf("\nIncorrect filename passed to -file \n ");
UsageDN();
}
}
if (checkCmdLineFlag(argc, (const char **)argv, "lda"))
{
opts.lda = getCmdLineArgumentInt(argc, (const char **)argv, "lda");
}
}
int main (int argc, char *argv)
{
struct testOpts opts;
cusolverDnHandle_t handle = NULL;
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
cudaStream_t stream = NULL;
int rowsA = 0; // number of rows of A
int colsA = 0; // number of columns of A
int nnzA = 0; // number of nonzeros of A
int baseA = 0; // base index in CSR format
int lda = 0; // leading dimension in dense matrix
// CSR(A) from I/O
int *h_csrRowPtrA = NULL;
int *h_csrColIndA = NULL;
double *h_csrValA = NULL;
double *h_A = NULL; // dense matrix from CSR(A)
double *h_x = NULL; // a copy of d_x
double *h_b = NULL; // b = ones(m,1)
double *h_r = NULL; // r = b - A*x, a copy of d_r
double *d_A = NULL; // a copy of h_A
double *d_x = NULL; // x = A \ b
double *d_b = NULL; // a copy of h_b
double *d_r = NULL; // r = b - A*x
// the constants are used in residual evaluation, r = b - A*x
const double minus_one = -1.0;
const double one = 1.0;
double x_inf = 0.0;
double r_inf = 0.0;
double A_inf = 0.0;
int errors = 0;
parseCommandLineArguments(argc, argv, opts);
if (NULL == opts.testFunc)
{
opts.testFunc = "chol"; // By default running Cholesky as NO solver selected with -R option.
}
findCudaDevice(argc, (const char **)argv);
printf("step 1: read matrix market format\n");
if (opts.sparse_mat_filename == NULL)
{
opts.sparse_mat_filename = sdkFindFilePath("gr_900_900_crg.mtx", argv[0]);
if (opts.sparse_mat_filename != NULL)
printf("Using default input file [%s]\n", opts.sparse_mat_filename);
else
printf("Could not find gr_900_900_crg.mtx\n");
}
else
{
printf("Using input file [%s]\n", opts.sparse_mat_filename);
}
if (opts.sparse_mat_filename == NULL)
{
fprintf(stderr, "Error: input matrix is not provided\n");
return EXIT_FAILURE;
}
if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, 'd', true , &rowsA, &colsA,
&nnzA, &h_csrValA, &h_csrRowPtrA, &h_csrColIndA, true))
{
exit(EXIT_FAILURE);
}
baseA = h_csrRowPtrA[0]; // baseA = {0,1}
printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, nnzA, baseA);
if ( rowsA != colsA )
{
fprintf(stderr, "Error: only support square matrix\n");
exit(EXIT_FAILURE);
}
printf("step 2: convert CSR(A) to dense matrix\n");
lda = opts.lda ? opts.lda : rowsA;
if (lda < rowsA)
{
fprintf(stderr, "Error: lda must be greater or equal to dimension of A\n");
exit(EXIT_FAILURE);
}
h_A = (double*)malloc(sizeof(double)*lda*colsA);
h_x = (double*)malloc(sizeof(double)*colsA);
h_b = (double*)malloc(sizeof(double)*rowsA);
h_r = (double*)malloc(sizeof(double)*rowsA);
assert(NULL != h_A);
assert(NULL != h_x);
assert(NULL != h_b);
assert(NULL != h_r);
memset(h_A, 0, sizeof(double)*lda*colsA);
for(int row = 0 ; row < rowsA ; row++)
{
const int start = h_csrRowPtrA[row ] - baseA;
const int end = h_csrRowPtrA[row+1] - baseA;
for(int colidx = start ; colidx < end ; colidx++)
{
const int col = h_csrColIndA[colidx] - baseA;
const double Areg = h_csrValA[colidx];
h_A[row + col*lda] = Areg;
}
}
printf("step 3: set right hand side vector (b) to 1\n");
for(int row = 0 ; row < rowsA ; row++)
{
h_b[row] = 1.0;
}
// verify if A is symmetric or not.
if ( 0 == strcmp(opts.testFunc, "chol") )
{
int issym = 1;
for(int j = 0 ; j < colsA ; j++)
{
for(int i = j ; i < rowsA ; i++)
{
double Aij = h_A[i + j*lda];
double Aji = h_A[j + i*lda];
if ( Aij != Aji )
{
issym = 0;
break;
}
}
}
if (!issym)
{
printf("Error: A has no symmetric pattern, please use LU or QR \n");
exit(EXIT_FAILURE);
}
}
checkCudaErrors(cusolverDnCreate(&handle));
checkCudaErrors(cublasCreate(&cublasHandle));
checkCudaErrors(cudaStreamCreate(&stream));
checkCudaErrors(cusolverDnSetStream(handle, stream));
checkCudaErrors(cublasSetStream(cublasHandle, stream));
checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double)*lda*colsA));
checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double)*colsA));
checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double)*rowsA));
checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*rowsA));
printf("step 4: prepare data on device\n");
checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double)*lda*colsA, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));
printf("step 5: solve A*x = b \n");
// d_A and d_b are read-only
if ( 0 == strcmp(opts.testFunc, "chol") )
{
linearSolverCHOL(handle, rowsA, d_A, lda, d_b, d_x);
}
else if ( 0 == strcmp(opts.testFunc, "lu") )
{
linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x);
}
else if ( 0 == strcmp(opts.testFunc, "qr") )
{
linearSolverQR(handle, rowsA, d_A, lda, d_b, d_x);
}
else
{
fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc);
exit(EXIT_FAILURE);
}
printf("step 6: evaluate residual\n");
checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double)*rowsA, cudaMemcpyDeviceToDevice));
// r = b - A*x
checkCudaErrors(cublasDgemm_v2(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
rowsA,
1,
colsA,
&minus_one,
d_A,
lda,
d_x,
rowsA,
&one,
d_r,
rowsA));
checkCudaErrors(cudaMemcpy(h_x, d_x, sizeof(double)*colsA, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost));
x_inf = vec_norminf(colsA, h_x);
r_inf = vec_norminf(rowsA, h_r);
A_inf = mat_norminf(rowsA, colsA, h_A, lda);
printf("|b - A*x| = %E \n", r_inf);
printf("|A| = %E \n", A_inf);
printf("|x| = %E \n", x_inf);
printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf));
if (handle) { checkCudaErrors(cusolverDnDestroy(handle)); }
if (cublasHandle) { checkCudaErrors(cublasDestroy(cublasHandle)); }
if (stream) { checkCudaErrors(cudaStreamDestroy(stream)); }
if (h_csrValA ) { free(h_csrValA); }
if (h_csrRowPtrA) { free(h_csrRowPtrA); }
if (h_csrColIndA) { free(h_csrColIndA); }
if (h_A) { free(h_A); }
if (h_x) { free(h_x); }
if (h_b) { free(h_b); }
if (h_r) { free(h_r); }
if (d_A) { checkCudaErrors(cudaFree(d_A)); }
if (d_x) { checkCudaErrors(cudaFree(d_x)); }
if (d_b) { checkCudaErrors(cudaFree(d_b)); }
if (d_r) { checkCudaErrors(cudaFree(d_r)); }
return 0;
}