Buffersize is '0' when using cusparseSpMV_bufferSize

Hello everyone,
I am trying to use the cuSPARSE 10.2 functions “cusparseSpMV” and “cusparseSpMV_bufferSize”, but the returned bufferSize is ‘0’. I also tried the example provided in the CUDA Toolkit Documentation, as follows:

// *** spmv_example.c ***
// How to compile (assume CUDA is installed at /usr/local/cuda/)
// nvcc spmv_example.c -o spmv_example -L/usr/local/cuda/lib64 -lcusparse -lcudart
// or, for C compiler
// cc -I/usr/local/cuda/include -c spmv_example.c -o spmv_example.o -std=c99
// nvcc -lcusparse -lcudart spmv_example.o -o spmv_example
#include <cuda_runtime.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h> // cusparseSpMV
#include <stdio.h> // printf
#include <stdlib.h> // EXIT_FAILURE

// Evaluates a CUDA runtime call once and, if it did not return cudaSuccess,
// prints the source line, the error string and the numeric code, then makes
// the enclosing function return EXIT_FAILURE.
// The macro expands to a braced block (not do/while) so existing call sites
// that omit the trailing semicolon keep compiling.
#define CHECK_CUDA(func)                                                       \
{                                                                              \
    cudaError_t status = (func);                                               \
    if (status != cudaSuccess) {                                               \
        printf("CUDA API failed at line %d with error: %s (%d)\n",             \
               __LINE__, cudaGetErrorString(status), status);                  \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}

// Evaluates a cuSPARSE call once and, if it did not return
// CUSPARSE_STATUS_SUCCESS, prints the source line, the error string and the
// numeric code, then makes the enclosing function return EXIT_FAILURE.
// Only use this for calls returning cusparseStatus_t; CUDA runtime calls
// (cudaMalloc, cudaMemcpy, ...) return cudaError_t and belong in CHECK_CUDA.
// The macro expands to a braced block (not do/while) so existing call sites
// that omit the trailing semicolon keep compiling.
#define CHECK_CUSPARSE(func)                                                   \
{                                                                              \
    cusparseStatus_t status = (func);                                          \
    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
        printf("CUSPARSE API failed at line %d with error: %s (%d)\n",         \
               __LINE__, cusparseGetErrorString(status), status);              \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}

int main() {
// Host problem definition
const int A_num_rows = 4;
const int A_num_cols = 4;
const int A_num_nnz = 9;
int hA_csrOffsets = { 0, 3, 4, 7, 9 };
int hA_columns = { 0, 2, 3, 1, 0, 2, 3, 1, 3 };
float hA_values = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f };
float hX = { 1.0f, 2.0f, 3.0f, 4.0f };
const float result = { 19.0f, 8.0f, 51.0f, 52.0f };
float alpha = 1.0f;
float beta = 0.0f;
//--------------------------------------------------------------------------
// Device memory management
int dA_csrOffsets, dA_columns;
float dA_values, dX, dY;
CHECK_CUDA( cudaMalloc((void
) &dA_csrOffsets,
(A_num_rows + 1) * sizeof(int)) )
CHECK_CUDA( cudaMalloc((void
) &dA_columns, A_num_nnz * sizeof(int)) )
CHECK_CUDA( cudaMalloc((void
*) &dA_values, A_num_nnz * sizeof(float)) )
CHECK_CUDA( cudaMalloc((void**) &dX, A_num_cols * sizeof(float)) )
CHECK_CUDA( cudaMalloc((void**) &dY, A_num_rows * sizeof(float)) )

CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,
                       (A_num_rows + 1) * sizeof(int),
                       cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns, A_num_nnz * sizeof(int),
                       cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dA_values, hA_values,
                       A_num_nnz * sizeof(float), cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dX, hX, A_num_rows * sizeof(float),
                       cudaMemcpyHostToDevice) )
//--------------------------------------------------------------------------
// CUSPARSE APIs
cusparseHandle_t     handle = 0;
cusparseSpMatDescr_t matA;
cusparseDnVecDescr_t vecX, vecY;
void*  dBuffer    = NULL;
size_t bufferSize = 0;
CHECK_CUSPARSE( cusparseCreate(&handle) )
// Create sparse matrix A in CSR format
CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_num_nnz,
                                  dA_csrOffsets, dA_columns, dA_values,
                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F) )
// Create dense vector X
CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, A_num_cols, dX, CUDA_R_32F) )
// Create dense vector y
CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, A_num_rows, dY, CUDA_R_32F) )
// allocate an external buffer if needed
CHECK_CUSPARSE( cusparseSpMV_bufferSize(
                             handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
                             CUSPARSE_MV_ALG_DEFAULT, &bufferSize) )
CHECK_CUSPARSE( cudaMalloc(&dBuffer, bufferSize) )

// execute SpMV
CHECK_CUSPARSE( cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
                             CUSPARSE_MV_ALG_DEFAULT, dBuffer) )

// destroy matrix/vector descriptors
CHECK_CUSPARSE( cusparseDestroySpMat(matA) )
CHECK_CUSPARSE( cusparseDestroyDnVec(vecX) )
CHECK_CUSPARSE( cusparseDestroyDnVec(vecY) )
CHECK_CUSPARSE( cusparseDestroy(handle) )
//--------------------------------------------------------------------------
// device result check
float hY[A_num_rows];
CHECK_CUDA( cudaMemcpy(hY, dY, A_num_rows * sizeof(float),
                       cudaMemcpyDeviceToHost) )

int correct = 1;
for (int i = 0; i < A_num_rows; i++) {
    if (hY[i] != result[i]) {
        correct = 0;
        break;
    }
}
if (correct)
    printf("spmv_example test PASSED\n");
else
    printf("spmv_example test FAILED: wrong result\n");
//--------------------------------------------------------------------------
// device memory deallocation
CHECK_CUDA( cudaFree(dBuffer) )
CHECK_CUDA( cudaFree(dA_csrOffsets) )
CHECK_CUDA( cudaFree(dA_columns) )
CHECK_CUDA( cudaFree(dA_values) )
CHECK_CUDA( cudaFree(dX) )
CHECK_CUDA( cudaFree(dY) )
return EXIT_SUCCESS;

}

The bufferSize is still ‘0’. Could you please give me some suggestions for solving this problem?