I am trying to multiply a 32-bit float sparse matrix with a 64-bit vector. As I understand it, this should be possible as per the table shown in the documentation. It throws the following error when trying to compute the buffer size:
** On entry to cusparseSpMV_bufferSize(): type combination of matA (CUDA_R_32F), vecX (CUDA_R_64F), vecY (CUDA_R_64F), compute (CUDA_R_64F) is not supported
Cuda version:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
I have modified the example in the CUDALibrarySamples, and it throws the same error:
/*
* Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h> // cusparseSpMV
#include <stdio.h> // printf
#include <stdlib.h> // EXIT_FAILURE
#define CHECK_CUDA(func) \
{ \
cudaError_t status = (func); \
if (status != cudaSuccess) { \
printf("CUDA API failed at line %d with error: %s (%d)\n", \
__LINE__, cudaGetErrorString(status), status); \
return EXIT_FAILURE; \
} \
}
#define CHECK_CUSPARSE(func) \
{ \
cusparseStatus_t status = (func); \
if (status != CUSPARSE_STATUS_SUCCESS) { \
printf("CUSPARSE API failed at line %d with error: %s (%d)\n", \
__LINE__, cusparseGetErrorString(status), status); \
return EXIT_FAILURE; \
} \
}
int main(void) {
// Host problem definition
const int A_num_rows = 4;
const int A_num_cols = 4;
const int A_nnz = 9;
int hA_csrOffsets[] = { 0, 3, 4, 7, 9 };
int hA_columns[] = { 0, 2, 3, 1, 0, 2, 3, 1, 3 };
float hA_values[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f };
double hX[] = { 1.0, 2.0, 3.0, 4.0 };
double hY[] = { 0.0, 0.0, 0.0, 0.0 };
double hY_result[] = { 19.0, 8.0, 51.0, 52.0 };
float alpha = 1.0f;
float beta = 0.0f;
//--------------------------------------------------------------------------
// Device memory management
int *dA_csrOffsets, *dA_columns;
float *dA_values, *dX, *dY;
CHECK_CUDA( cudaMalloc((void**) &dA_csrOffsets,
(A_num_rows + 1) * sizeof(int)) )
CHECK_CUDA( cudaMalloc((void**) &dA_columns, A_nnz * sizeof(int)) )
CHECK_CUDA( cudaMalloc((void**) &dA_values, A_nnz * sizeof(float)) )
CHECK_CUDA( cudaMalloc((void**) &dX, A_num_cols * sizeof(double)) )
CHECK_CUDA( cudaMalloc((void**) &dY, A_num_rows * sizeof(double)) )
CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,
(A_num_rows + 1) * sizeof(int),
cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns, A_nnz * sizeof(int),
cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dA_values, hA_values, A_nnz * sizeof(float),
cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dX, hX, A_num_cols * sizeof(double),
cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dY, hY, A_num_rows * sizeof(double),
cudaMemcpyHostToDevice) )
//--------------------------------------------------------------------------
// CUSPARSE APIs
cusparseHandle_t handle = NULL;
cusparseSpMatDescr_t matA;
cusparseDnVecDescr_t vecX, vecY;
void* dBuffer = NULL;
size_t bufferSize = 0;
CHECK_CUSPARSE( cusparseCreate(&handle) )
// Create sparse matrix A in CSR format
CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz,
dA_csrOffsets, dA_columns, dA_values,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F) )
// Create dense vector X
CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, A_num_cols, dX, CUDA_R_64F) )
// Create dense vector y
CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, A_num_rows, dY, CUDA_R_64F) )
// allocate an external buffer if needed
CHECK_CUSPARSE( cusparseSpMV_bufferSize(
handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize) )
CHECK_CUDA( cudaMalloc(&dBuffer, bufferSize) )
// execute SpMV
CHECK_CUSPARSE( cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
CUSPARSE_SPMV_ALG_DEFAULT, dBuffer) )
// destroy matrix/vector descriptors
CHECK_CUSPARSE( cusparseDestroySpMat(matA) )
CHECK_CUSPARSE( cusparseDestroyDnVec(vecX) )
CHECK_CUSPARSE( cusparseDestroyDnVec(vecY) )
CHECK_CUSPARSE( cusparseDestroy(handle) )
//--------------------------------------------------------------------------
// device result check
CHECK_CUDA( cudaMemcpy(hY, dY, A_num_rows * sizeof(float),
cudaMemcpyDeviceToHost) )
int correct = 1;
for (int i = 0; i < A_num_rows; i++) {
if (hY[i] != hY_result[i]) { // direct floating point comparison is not
correct = 0; // reliable
break;
}
}
if (correct)
printf("spmv_csr_example test PASSED\n");
else
printf("spmv_csr_example test FAILED: wrong result\n");
//--------------------------------------------------------------------------
// device memory deallocation
CHECK_CUDA( cudaFree(dBuffer) )
CHECK_CUDA( cudaFree(dA_csrOffsets) )
CHECK_CUDA( cudaFree(dA_columns) )
CHECK_CUDA( cudaFree(dA_values) )
CHECK_CUDA( cudaFree(dX) )
CHECK_CUDA( cudaFree(dY) )
return EXIT_SUCCESS;
}
Is this a bug, or am I doing something wrong?