cusolverDnDpotrf does not support float

yuzhuore · August 30, 2017, 7:28pm

Hi,

I am trying to solve a dense linear equation. I need to use float precision as my other parts of program uses float. However, I tried to use the sample code provided by cuda_sample and found that cusolverDnDpotrf does not support float. I checked the API, float is support. Anyone can provide some insight what is happening here. I pasted the sample code below use double precision, but change double to float does not work. Thank you!

/*

Copyright 2015 NVIDIA Corporation. All rights reserved.
Please refer to the NVIDIA end user license agreement (EULA) associated
with this source code for terms and conditions that govern your use of
this software. Any use, reproduction, disclosure, or distribution of
this software and related documentation outside the terms of the EULA
is strictly prohibited.

*/

/*

Test three linear solvers, including Cholesky, LU and QR.
The user has to prepare a sparse matrix of “matrix market format” (with extension .mtx).
For example, the user can download matrices in Florida Sparse Matrix Collection.
(http://www.cise.ufl.edu/research/sparse/matrices/)
The user needs to choose a solver by switch -R and
to provide the path of the matrix by switch -F, then
the program solves
```
     A*x = b  where b = ones(m,1)
```
and reports relative error
```
     |b-A*x|/(|A|*|x|)
```
The elapsed time is also reported so the user can compare efficiency of different solvers.
How to use

 ./cuSolverDn_LinearSolver                     // Default: cholesky

./cuSolverDn_LinearSolver -R=chol -filefile>   // cholesky factorization

./cuSolverDn_LinearSolver -R=lu -file<file>     // LU with partial pivoting

./cuSolverDn_LinearSolver -R=qr -file<file>     // QR factorization

Remark: the absolute error on solution x is meaningless without knowing condition number of A.

The relative error on residual should be close to machine zero, i.e. 1.e-15.

*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

#include <cuda_runtime.h>

#include “cublas_v2.h”
#include “cusolverDn.h”
#include “helper_cuda.h”

#include “helper_cusolver.h”

template
int loadMMSparseMatrix(
char *filename,
char elem_type,
bool csrFormat,
int *m,
int *n,
int *nnz,
T_ELEM **aVal,
int **aRowInd,
int **aColInd,
int extendSymMatrix);

void UsageDN(void)
{
printf( “\n”);
printf( “-h : display this help\n”);
printf( “-R= : choose a linear solver\n”);
printf( " chol (cholesky factorization), this is default\n");
printf( " qr (QR factorization)\n");
printf( " lu (LU factorization)\n");
printf( “-lda= : leading dimension of A , m by default\n”);
printf( “-file=: filename containing a matrix in MM format\n”);
printf( “-device=<device_id> : <device_id> if want to run on specific GPU\n”);

exit( 0 );

}

/*

solve A*x = b by Cholesky factorization

*/
int linearSolverCHOL(
cusolverDnHandle_t handle,
int n,
const double *Acopy,
int lda,
const double *b,
double *x)
{
int bufferSize = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
int h_info = 0;
double start, stop;
double time_solve;
cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;

checkCudaErrors(cusolverDnDpotrf_bufferSize(handle, uplo, n, (double*)Acopy, lda, &bufferSize));

checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double)*bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double)*lda*n));


// prepare a copy of A because potrf will overwrite A with L
checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));

start = second();
start = second();

checkCudaErrors(cusolverDnDpotrf(handle, uplo, n, A, lda, buffer, bufferSize, info));

checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));

if ( 0 != h_info ){
    fprintf(stderr, "Error: Cholesky factorization failed\n");
}

checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*n, cudaMemcpyDeviceToDevice));

checkCudaErrors(cusolverDnDpotrs(handle, uplo, n, 1, A, lda, x, n, info));

checkCudaErrors(cudaDeviceSynchronize());
stop = second();

time_solve = stop - start;
fprintf (stdout, "timing: cholesky = %10.6f sec\n", time_solve);

if (info  ) { checkCudaErrors(cudaFree(info)); }
if (buffer) { checkCudaErrors(cudaFree(buffer)); }
if (A     ) { checkCudaErrors(cudaFree(A)); }

return 0;

}

/*

solve A*x = b by LU with partial pivoting

*/
int linearSolverLU(
cusolverDnHandle_t handle,
int n,
const double *Acopy,
int lda,
const double *b,
double *x)
{
int bufferSize = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
int *ipiv = NULL; // pivoting sequence
int h_info = 0;
double start, stop;
double time_solve;

checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n, (double*)Acopy, lda, &bufferSize));

checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double)*bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double)*lda*n));
checkCudaErrors(cudaMalloc(&ipiv, sizeof(int)*n));


// prepare a copy of A because getrf will overwrite A with L
checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));

start = second();
start = second();

checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info));
checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));

if ( 0 != h_info ){
    fprintf(stderr, "Error: LU factorization failed\n");
}

checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();

time_solve = stop - start;
fprintf (stdout, "timing: LU = %10.6f sec\n", time_solve);

if (info  ) { checkCudaErrors(cudaFree(info  )); }
if (buffer) { checkCudaErrors(cudaFree(buffer)); }
if (A     ) { checkCudaErrors(cudaFree(A)); }
if (ipiv  ) { checkCudaErrors(cudaFree(ipiv));}

return 0;

}

/*

solve A*x = b by QR

*/
int linearSolverQR(
cusolverDnHandle_t handle,
int n,
const double *Acopy,
int lda,
const double *b,
double *x)
{
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
int bufferSize = 0;
int bufferSize_geqrf = 0;
int bufferSize_ormqr = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
double *tau = NULL;
int h_info = 0;
double start, stop;
double time_solve;
const double one = 1.0;

checkCudaErrors(cublasCreate(&cublasHandle));

checkCudaErrors(cusolverDnDgeqrf_bufferSize(handle, n, n, (double*)Acopy, lda, &bufferSize_geqrf));
checkCudaErrors(cusolverDnDormqr_bufferSize(
    handle,
    CUBLAS_SIDE_LEFT,
    CUBLAS_OP_T,
    n,
    1,
    n,
    A,
    lda,
    NULL,
    x,
    n,
    &bufferSize_ormqr));

printf("buffer_geqrf = %d, buffer_ormqr = %d \n", bufferSize_geqrf, bufferSize_ormqr);

bufferSize = (bufferSize_geqrf > bufferSize_ormqr)? bufferSize_geqrf : bufferSize_ormqr ; 

checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double)*bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double)*lda*n));
checkCudaErrors(cudaMalloc ((void**)&tau, sizeof(double)*n));

// prepare a copy of A because getrf will overwrite A with L
checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double)ldan, cudaMemcpyDeviceToDevice));

checkCudaErrors(cudaMemset(info, 0, sizeof(int)));

start = second();
start = second();

// compute QR factorization
checkCudaErrors(cusolverDnDgeqrf(handle, n, n, A, lda, tau, buffer, bufferSize, info));

checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));

if ( 0 != h_info ){
    fprintf(stderr, "Error: LU factorization failed\n");
}

checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*n, cudaMemcpyDeviceToDevice));

// compute Q^T*b
checkCudaErrors(cusolverDnDormqr(
    handle,
    CUBLAS_SIDE_LEFT,
    CUBLAS_OP_T,
    n,
    1,
    n,
    A,
    lda,
    tau,
    x,
    n,
    buffer,
    bufferSize,
    info));

// x = R \ Q^T*b
checkCudaErrors(cublasDtrsm(
     cublasHandle,
     CUBLAS_SIDE_LEFT,
     CUBLAS_FILL_MODE_UPPER,
     CUBLAS_OP_N,
     CUBLAS_DIAG_NON_UNIT,
     n,
     1,
     &one,
     A,
     lda,
     x,
     n));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();

time_solve = stop - start;
fprintf (stdout, "timing: QR = %10.6f sec\n", time_solve);

if (cublasHandle) { checkCudaErrors(cublasDestroy(cublasHandle)); }
if (info  ) { checkCudaErrors(cudaFree(info  )); }
if (buffer) { checkCudaErrors(cudaFree(buffer)); }
if (A     ) { checkCudaErrors(cudaFree(A)); }
if (tau   ) { checkCudaErrors(cudaFree(tau)); }

return 0;

}

void parseCommandLineArguments(int argc, char *argv, struct testOpts &opts)
{
memset(&opts, 0, sizeof(opts));

if (checkCmdLineFlag(argc, (const char **)argv, "-h"))
{
    UsageDN();
}

if (checkCmdLineFlag(argc, (const char **)argv, "R"))
{
    char *solverType = NULL;
    getCmdLineArgumentString(argc, (const char **)argv, "R", &solverType);

    if (solverType)
    {
        if ((STRCASECMP(solverType, "chol") != 0) && (STRCASECMP(solverType, "lu") != 0) && (STRCASECMP(solverType, "qr") != 0))
        {
            printf("\nIncorrect argument passed to -R option\n");
            UsageDN();
        }
        else
        {
            opts.testFunc = solverType;
        }
    }
}

if (checkCmdLineFlag(argc, (const char **)argv, "file"))
{
    char *fileName = 0;
    getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName);

    if (fileName)
    {
        opts.sparse_mat_filename = fileName;
    }
    else
    {
        printf("\nIncorrect filename passed to -file \n ");
        UsageDN();
    }
}

if (checkCmdLineFlag(argc, (const char **)argv, "lda"))
{
    opts.lda = getCmdLineArgumentInt(argc, (const char **)argv, "lda");
}

}

int main (int argc, char *argv)
{
struct testOpts opts;
cusolverDnHandle_t handle = NULL;
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
cudaStream_t stream = NULL;

int rowsA = 0; // number of rows of A
int colsA = 0; // number of columns of A
int nnzA  = 0; // number of nonzeros of A
int baseA = 0; // base index in CSR format
int lda   = 0; // leading dimension in dense matrix

// CSR(A) from I/O
int *h_csrRowPtrA = NULL;
int *h_csrColIndA = NULL;
double *h_csrValA = NULL;

double *h_A = NULL; // dense matrix from CSR(A)
double *h_x = NULL; // a copy of d_x
double *h_b = NULL; // b = ones(m,1)
double *h_r = NULL; // r = b - A*x, a copy of d_r

double *d_A = NULL; // a copy of h_A
double *d_x = NULL; // x = A \ b
double *d_b = NULL; // a copy of h_b
double *d_r = NULL; // r = b - A*x

// the constants are used in residual evaluation, r = b - A*x
const double minus_one = -1.0;
const double one = 1.0;

double x_inf = 0.0;
double r_inf = 0.0;
double A_inf = 0.0;
int errors = 0;

parseCommandLineArguments(argc, argv, opts);

if (NULL == opts.testFunc)
{
    opts.testFunc = "chol"; // By default running Cholesky as NO solver selected with -R option.
}

findCudaDevice(argc, (const char **)argv);

printf("step 1: read matrix market format\n");

if (opts.sparse_mat_filename == NULL)
{
    opts.sparse_mat_filename =  sdkFindFilePath("gr_900_900_crg.mtx", argv[0]);
    if (opts.sparse_mat_filename != NULL)
        printf("Using default input file [%s]\n", opts.sparse_mat_filename);
    else
        printf("Could not find gr_900_900_crg.mtx\n");
}
else
{
    printf("Using input file [%s]\n", opts.sparse_mat_filename);
}

if (opts.sparse_mat_filename == NULL)
{
    fprintf(stderr, "Error: input matrix is not provided\n");
    return EXIT_FAILURE;
}

if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, 'd', true , &rowsA, &colsA,
           &nnzA, &h_csrValA, &h_csrRowPtrA, &h_csrColIndA, true))
{
    exit(EXIT_FAILURE);
}
baseA = h_csrRowPtrA[0]; // baseA = {0,1}

printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, nnzA, baseA);

if ( rowsA != colsA )
{
    fprintf(stderr, "Error: only support square matrix\n");
    exit(EXIT_FAILURE);
}

printf("step 2: convert CSR(A) to dense matrix\n");

lda = opts.lda ? opts.lda : rowsA;
if (lda < rowsA)
{
    fprintf(stderr, "Error: lda must be greater or equal to dimension of A\n");
    exit(EXIT_FAILURE);
}

h_A = (double*)malloc(sizeof(double)*lda*colsA);
h_x = (double*)malloc(sizeof(double)*colsA);
h_b = (double*)malloc(sizeof(double)*rowsA);
h_r = (double*)malloc(sizeof(double)*rowsA);
assert(NULL != h_A);
assert(NULL != h_x);
assert(NULL != h_b);
assert(NULL != h_r);

memset(h_A, 0, sizeof(double)*lda*colsA);

for(int row = 0 ; row < rowsA ; row++)
{
    const int start = h_csrRowPtrA[row  ] - baseA;
    const int end   = h_csrRowPtrA[row+1] - baseA;
    for(int colidx = start ; colidx < end ; colidx++)
    {
        const int col = h_csrColIndA[colidx] - baseA;
        const double Areg = h_csrValA[colidx];
        h_A[row + col*lda] = Areg;
    }
}

printf("step 3: set right hand side vector (b) to 1\n");
for(int row = 0 ; row < rowsA ; row++)
{
    h_b[row] = 1.0;
}

// verify if A is symmetric or not.
if ( 0 == strcmp(opts.testFunc, "chol") )
{
    int issym = 1;
    for(int j = 0 ; j < colsA ; j++)
    {
        for(int i = j ; i < rowsA ; i++)
        {
            double Aij = h_A[i + j*lda];
            double Aji = h_A[j + i*lda];
            if ( Aij != Aji )
            {
                issym = 0;
                break;
            }
        }
    }
    if (!issym)
    {
        printf("Error: A has no symmetric pattern, please use LU or QR \n");
        exit(EXIT_FAILURE);
    }
}

checkCudaErrors(cusolverDnCreate(&handle));
checkCudaErrors(cublasCreate(&cublasHandle));
checkCudaErrors(cudaStreamCreate(&stream));

checkCudaErrors(cusolverDnSetStream(handle, stream));
checkCudaErrors(cublasSetStream(cublasHandle, stream));


checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double)*lda*colsA));
checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double)*colsA));
checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double)*rowsA));
checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*rowsA));

printf("step 4: prepare data on device\n");
checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double)*lda*colsA, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));

printf("step 5: solve A*x = b \n");
// d_A and d_b are read-only
if ( 0 == strcmp(opts.testFunc, "chol") )
{
    linearSolverCHOL(handle, rowsA, d_A, lda, d_b, d_x);
}
else if ( 0 == strcmp(opts.testFunc, "lu") )
{
    linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x);
}
else if ( 0 == strcmp(opts.testFunc, "qr") )
{
    linearSolverQR(handle, rowsA, d_A, lda, d_b, d_x);
}
else
{
    fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc);
    exit(EXIT_FAILURE);
}
printf("step 6: evaluate residual\n");
checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double)*rowsA, cudaMemcpyDeviceToDevice));

// r = b - A*x
checkCudaErrors(cublasDgemm_v2(
    cublasHandle,
    CUBLAS_OP_N,
    CUBLAS_OP_N,
    rowsA,
    1,
    colsA,
    &minus_one,
    d_A,
    lda,
    d_x,
    rowsA,
    &one,
    d_r,
    rowsA));

checkCudaErrors(cudaMemcpy(h_x, d_x, sizeof(double)*colsA, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost));

x_inf = vec_norminf(colsA, h_x);
r_inf = vec_norminf(rowsA, h_r);
A_inf = mat_norminf(rowsA, colsA, h_A, lda);

printf("|b - A*x| = %E \n", r_inf);
printf("|A| = %E \n", A_inf);
printf("|x| = %E \n", x_inf);
printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf));

if (handle) { checkCudaErrors(cusolverDnDestroy(handle)); }
if (cublasHandle) { checkCudaErrors(cublasDestroy(cublasHandle)); }
if (stream) { checkCudaErrors(cudaStreamDestroy(stream)); }

if (h_csrValA   ) { free(h_csrValA); }
if (h_csrRowPtrA) { free(h_csrRowPtrA); }
if (h_csrColIndA) { free(h_csrColIndA); }

if (h_A) { free(h_A); }
if (h_x) { free(h_x); }
if (h_b) { free(h_b); }
if (h_r) { free(h_r); }

if (d_A) { checkCudaErrors(cudaFree(d_A)); }
if (d_x) { checkCudaErrors(cudaFree(d_x)); }
if (d_b) { checkCudaErrors(cudaFree(d_b)); }
if (d_r) { checkCudaErrors(cudaFree(d_r)); }

return 0;

}

Topic		Replies	Views
CuSolver Sparse on Fortran GPU-Accelerated Libraries cusolver	6	1571	April 28, 2024
Using cuSolverDN in FORTRAN code GPU-Accelerated Libraries	8	4519	November 18, 2015
Solve AX=B With cuSolver CUDA Programming and Performance	10	5939	August 18, 2015
cuSOLVER sparse: cusolverSpDcsrlsvqr() error GPU-Accelerated Libraries	4	3915	June 7, 2015
LU, QR and Cholesky factorizations using GPU CUDA Programming and Performance	100	64052	June 23, 2015
cusolverDnSgetrf() fails on A100 (but not on A10) when called in a tight loop GPU-Accelerated Libraries cusolver	15	2128	February 23, 2022
cuSOLVER into python (possibly for pyCUDA) CUDA Programming and Performance	8	3829	June 4, 2015
LU factorization code CUDA Programming and Performance	45	91291	June 23, 2015
Some questions for the function "cusparseDcsrsv2_solve" nvc, nvc++ and nvfortran cuda	12	235	August 9, 2024
Cusolver solve sparse Ax=b wrong GPU-Accelerated Libraries cusolver	7	308	August 19, 2024

cusolverDnDpotrf does not support float

Related topics