cusparseCsrmvEx with half I/O and float calculation.

Hi, CUDA fans!

I want sparse MV with half I/O and float calculation, to reduce memory load.
But it fails.
I tried this:

  m, n, nnz,
  alpha, CUDA_R_16F,
  descrA, csrSortedValA, CUDA_R_16F, csrSortedRowPtrA, csrSortedColIndA,
  x, CUDA_R_16F,
  beta, CUDA_R_16F,
  y, CUDA_R_16F,

Is there anything wrong with the code, or it is not implemented yet?
If the latter case, is there future plan for it?

It works fine if all types are CUDA_R_32F. All CUDA_R_16F are also good.
But half calculation lacks precision for my task, so I want to try
inner precision to be float.


sorry for multi post. I can not delete first one in TensorCore place.

I’m checking on this.

In the meantime, I suggest using cusparseSpMV instead. There is a fully worked sample code in the cusparse doc for it (for CUDA_R_32F case). Here is a variation of that showing your case:

$ cat
// *** spmv_example.c ***
// How to compile (assume CUDA is installed at /usr/local/cuda/)
//   nvcc spmv_example.c -o spmv_example -L/usr/local/cuda/lib64 -lcusparse -lcudart
// or, for C compiler
//   cc -I/usr/local/cuda/include -c spmv_example.c -o spmv_example.o -std=c99
//   nvcc -lcusparse -lcudart spmv_example.o -o spmv_example
#include <cuda_runtime.h>  // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h>      // cusparseSpMV
#include <stdio.h>         // printf
#include <stdlib.h>        // EXIT_FAILURE
#include <cuda_fp16.h>

#define CHECK_CUDA(func)                                                       \
{                                                                              \
    cudaError_t status = (func);                                               \
    if (status != cudaSuccess) {                                               \
        printf("CUDA API failed at line %d with error: %s (%d)\n",             \
               __LINE__, cudaGetErrorString(status), status);                  \
        return EXIT_FAILURE;                                                   \
    }                                                                          \

#define CHECK_CUSPARSE(func)                                                   \
{                                                                              \
    cusparseStatus_t status = (func);                                          \
    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
        printf("CUSPARSE API failed at line %d with error: %s (%d)\n",         \
               __LINE__, cusparseGetErrorString(status), status);              \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
#ifndef USE_FLOAT
#define DTYPE CUDA_R_16F
typedef half dtype;
#define DTYPE CUDA_R_32F
typedef float dtype;

int main() {
    // Host problem definition
    const int A_num_rows = 4;
    const int A_num_cols = 4;
    const int A_num_nnz  = 9;
    int   hA_csrOffsets[] = { 0, 3, 4, 7, 9 };
    int   hA_columns[]    = { 0, 2, 3, 1, 0, 2, 3, 1, 3 };
    dtype hA_values[]     = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
                              6.0f, 7.0f, 8.0f, 9.0f };
    dtype hX[]            = { 1.0f, 2.0f, 3.0f, 4.0f };
    const dtype result[]  = { 19.0f, 8.0f, 51.0f, 52.0f };
    // Device memory management
    int   *dA_csrOffsets, *dA_columns;
    dtype *dA_values, *dX, *dY;
    CHECK_CUDA( cudaMalloc((void**) &dA_csrOffsets,
                           (A_num_rows + 1) * sizeof(int)) )
    CHECK_CUDA( cudaMalloc((void**) &dA_columns, A_num_nnz * sizeof(int)) )
    CHECK_CUDA( cudaMalloc((void**) &dA_values, A_num_nnz * sizeof(dtype)) )
    CHECK_CUDA( cudaMalloc((void**) &dX, A_num_cols * sizeof(dtype)) )
    CHECK_CUDA( cudaMalloc((void**) &dY, A_num_rows * sizeof(dtype)) )

    CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,
                           (A_num_rows + 1) * sizeof(int),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns, A_num_nnz * sizeof(int),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_values, hA_values,
                           A_num_nnz * sizeof(dtype), cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dX, hX, A_num_rows * sizeof(dtype),
                           cudaMemcpyHostToDevice) )
    cusparseHandle_t     handle = 0;
    void*  dBuffer    = NULL;
    size_t bufferSize = 0;
    CHECK_CUSPARSE( cusparseCreate(&handle) )
#ifdef USE_SPMV
    float alpha = 1.0f;
    float beta  = 0.0f;
    cusparseSpMatDescr_t matA;
    cusparseDnVecDescr_t vecX, vecY;
    // Create sparse matrix A in CSR format
    CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_num_nnz,
                                      dA_csrOffsets, dA_columns, dA_values,
                                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO, DTYPE) )
    // Create dense vector X
    CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, A_num_cols, dX, DTYPE) )
    // Create dense vector y
    CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, A_num_rows, dY, DTYPE) )
    // allocate an external buffer if needed
    CHECK_CUSPARSE( cusparseSpMV_bufferSize(
                                 handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
                                 CUSPARSE_MV_ALG_DEFAULT, &bufferSize) )
    CHECK_CUDA( cudaMalloc(&dBuffer, bufferSize) )

    // execute SpMV
                                 &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
                                 CUSPARSE_MV_ALG_DEFAULT, dBuffer) )
    CHECK_CUSPARSE( cusparseDestroySpMat(matA) )
    CHECK_CUSPARSE( cusparseDestroyDnVec(vecX) )
    CHECK_CUSPARSE( cusparseDestroyDnVec(vecY) )
    dtype alpha = 1.0f;
    dtype beta  = 0.0f;
    cusparseMatDescr_t descrA;
    CHECK_CUSPARSE( cusparseCreateMatDescr(&descrA) )
    CHECK_CUSPARSE( cusparseCsrmvEx_bufferSize(
      A_num_rows, A_num_cols, A_num_nnz,
      &alpha, DTYPE,
      descrA, dA_values, DTYPE, dA_csrOffsets, dA_columns,
      dX, DTYPE,
      &beta, DTYPE,
      dY, DTYPE,
      &bufferSize) )
    CHECK_CUDA( cudaMalloc(&dBuffer, bufferSize) )
    CHECK_CUSPARSE( cusparseCsrmvEx(
      A_num_rows, A_num_cols, A_num_nnz,
      &alpha, DTYPE,
      descrA, dA_values, DTYPE, dA_csrOffsets, dA_columns,
      dX, DTYPE,
      &beta, DTYPE,
      dY, DTYPE,
      dBuffer) )  // this is line 121
    // destroy matrix/vector descriptors
    CHECK_CUSPARSE( cusparseDestroy(handle) )
    // device result check
    dtype hY[A_num_rows];
    CHECK_CUDA( cudaMemcpy(hY, dY, A_num_rows * sizeof(dtype),
                           cudaMemcpyDeviceToHost) )

    int correct = 1;
    for (int i = 0; i < A_num_rows; i++) {
        if (((float)hY[i]) != ((float)result[i])) {
            correct = 0;
            printf("hY[%d] = %f, result[%d] = %f\n", i, (float)hY[i], i, (float)result[i]);
    if (correct)
        printf("spmv_example test PASSED\n");
        printf("spmv_example test FAILED: wrong result\n");
    // device memory deallocation
    CHECK_CUDA( cudaFree(dBuffer) )
    CHECK_CUDA( cudaFree(dA_csrOffsets) )
    CHECK_CUDA( cudaFree(dA_columns) )
    CHECK_CUDA( cudaFree(dA_values) )
        CHECK_CUDA( cudaFree(dX) )
        CHECK_CUDA( cudaFree(dY) )
    return EXIT_SUCCESS;
$ nvcc -o t1525 -lcusparse -DUSE_SPMV
$ ./t1525
spmv_example test PASSED

Thanks Robert.

I’ll try it later and back to report result.

BTW, I set device pointer mode for cublas and cusparse.

cublasSetPointerMode( cu.blas, CUBLAS_POINTER_MODE_DEVICE );
cusparseSetPointerMode( cu.sp, CUSPARSE_POINTER_MODE_DEVICE );

so alpha and beta parameters are also in device memory.

I believe it is faster, but if there is no difference,
I’ll change it to host pointer mode for simplicity.
Any opinion, please?


I doubt it makes much difference, performance wise. The purpose of it is to simply make it easier to handle situations where the the data may originate in one domain or the other. I would just do whatever makes sense for the dataflow in your program.


I found my code uses some result (like dot product) for later input.
So I have to keep my parameters in device code. It is okay.

And for SpMV and related functions, I tried and link stage failed.
They are not provided for Windows as of CUDA 10.1.

I need my code to be portable.
Is there date of availability for Windows?


I made it to work with cusparseSpMV().

Some troubles occured, about misalignment.
They were from parameter float/half type mistaken.
It’s hard to find with void* interface.
I solved them, anyway.

Now calculation itself is done without error,
but the result is not stable. (lot of NaN and Inf)
I guess it lacks precision in some equations.

Next, I will consider mixed precision, just like;
Sparse Matrix A : 16-bit
Dense Vector x,y : 32-bit
Parameters: 32-bit
Calculation: 32-bit

I need experiments, but it seems not available now.(The interface is ready.)
I hope more cross type flexibility be implemented in cuSparse in the future.

Maybe on CUDA 11?