Hello everyone!
I am currently writing a dynamic library for Mathematica that performs some sparse tensor contractions. One of the key functions that I use is
cusparseSpMM
(to multiply a sparse matrix in CSR form with a dense one). I kept getting the
CUSPARSE_STATUS_INVALID_VALUE
error every time I was calling
cusparseSpMM_bufferSize
(required before the actual matrix multiplication).
In an effort to identify the error, I copied the example for
cusparseSpMV
from https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-example1, which worked flawlessly on my computer. I then modified the script such that it performs a matrix-matrix multiplication (by regarding the vectors as n-by-1 matrices). Now it also compiles fine, but I get the same CUSPARSE_STATUS_INVALID_VALUE runtime error.
I am including the code below.
// *** sp_example.c ***
// How to compile (assume CUDA is installed at /usr/local/cuda/)
// nvcc sp_example.c -o sp_example -L/usr/local/cuda/lib64 -lcusparse -lcudart
#include <stdio.h> // printf
#include <stdlib.h> // EXIT_FAILURE
#include <cuda_runtime.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h> // cusparseSpMM
// Evaluate a CUDA runtime API call and, on failure, print the numeric
// cudaError_t code together with the source line, then return EXIT_FAILURE
// from the enclosing function (so it must be used inside a function that
// returns int, e.g. main).
// NOTE(review): this expands to a bare { } block, so call sites in this file
// omit the trailing semicolon; beware the dangling-else pitfall if the macro
// is ever used in an unbraced if/else.
#define CHECK_CUDA(func) \
{ \
cudaError_t status = (func); \
if (status != cudaSuccess) { \
printf("CUDA API failed with error (%d) at line %d\n", \
status, __LINE__); \
return EXIT_FAILURE; \
} \
}
// Evaluate a cuSPARSE API call and, on failure, print the numeric
// cusparseStatus_t code together with the source line, then return
// EXIT_FAILURE from the enclosing function. Only pass cusparse* calls here;
// CUDA runtime calls return cudaError_t and belong in CHECK_CUDA.
// NOTE(review): expands to a bare { } block (call sites omit the trailing
// semicolon); beware the dangling-else pitfall in unbraced if/else.
#define CHECK_CUSPARSE(func) \
{ \
cusparseStatus_t status = (func); \
if (status != CUSPARSE_STATUS_SUCCESS) { \
printf("CUSPARSE API failed with error (%d) at line %d\n", \
status, __LINE__); \
return EXIT_FAILURE; \
} \
}
int main() {
    //----------------------------------------------------------------------
    // Host problem definition: compute Y = alpha * A * X + beta * Y, where
    // A is a 4x4 sparse matrix in CSR format and X, Y are treated as 4x1
    // dense column-major matrices so that cusparseSpMM can be exercised.
    const int A_num_rows = 4;
    const int A_num_cols = 4;
    const int A_num_nnz  = 9;
    int    hA_csrOffsets[] = { 0, 3, 4, 7, 9 };
    int    hA_columns[]    = { 0, 2, 3, 1, 0, 2, 3, 1, 3 };
    // Use double literals for double storage (the original used 'f' float
    // suffixes, which silently round through single precision first).
    double hA_Values[] = { 1.0, 2.0, 3.0, 4.0, 5.0,
                           6.0, 7.0, 8.0, 9.0 };
    double h_X[]       = { 1.0, 2.0, 3.0, 4.0 };
    const double result[] = { 19.0, 8.0, 51.0, 52.0 };  // expected A * X
    double alpha = 1.0;
    double beta  = 0.0;
    //----------------------------------------------------------------------
    // Device memory management
    int    *dA_csrOffsets, *dA_columns;
    double *dA_values, *d_X, *d_Y;
    CHECK_CUDA( cudaMalloc((void**) &dA_csrOffsets,
                           (A_num_rows + 1) * sizeof(int)) )
    CHECK_CUDA( cudaMalloc((void**) &dA_columns, A_num_nnz * sizeof(int)) )
    CHECK_CUDA( cudaMalloc((void**) &dA_values,  A_num_nnz * sizeof(double)) )
    CHECK_CUDA( cudaMalloc((void**) &d_X, A_num_cols * sizeof(double)) )
    CHECK_CUDA( cudaMalloc((void**) &d_Y, A_num_rows * sizeof(double)) )
    CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,
                           (A_num_rows + 1) * sizeof(int),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns, A_num_nnz * sizeof(int),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_values, hA_Values, A_num_nnz * sizeof(double),
                           cudaMemcpyHostToDevice) )
    // X has A_num_cols entries; the original sized this copy with A_num_rows,
    // which only happened to work because the matrix is square.
    CHECK_CUDA( cudaMemcpy(d_X, h_X, A_num_cols * sizeof(double),
                           cudaMemcpyHostToDevice) )
    //----------------------------------------------------------------------
    // cuSPARSE APIs
    cusparseHandle_t     handle = 0;
    cusparseSpMatDescr_t matA;
    cusparseDnMatDescr_t vecX, vecY;
    void*  dBuffer    = NULL;
    size_t bufferSize = 0;
    CHECK_CUSPARSE( cusparseCreate(&handle) )
    // Sparse matrix A in CSR format: 32-bit indices, zero-based, fp64 values
    CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_num_nnz,
                                      dA_csrOffsets, dA_columns, dA_values,
                                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F) )
    // Dense n-by-1 matrices standing in for the vectors X and Y.
    // NOTE(review): for CUSPARSE_ORDER_COL the leading dimension (4th arg)
    // must be >= the number of rows; here ld equals the row count, which
    // satisfies that requirement — confirm against the toolkit's cuSPARSE
    // docs if INVALID_VALUE persists, as generic-API SpMM support varies
    // across CUDA 10.x point releases.
    CHECK_CUSPARSE( cusparseCreateDnMat(&vecX, A_num_cols, 1, A_num_cols,
                                        d_X, CUDA_R_64F, CUSPARSE_ORDER_COL) )
    CHECK_CUSPARSE( cusparseCreateDnMat(&vecY, A_num_rows, 1, A_num_rows,
                                        d_Y, CUDA_R_64F, CUSPARSE_ORDER_COL) )
    // Query the workspace size required by SpMM, then allocate it.
    CHECK_CUSPARSE( cusparseSpMM_bufferSize(
                        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                        CUSPARSE_OPERATION_NON_TRANSPOSE,
                        &alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
                        CUSPARSE_MM_ALG_DEFAULT, &bufferSize) )
    // BUG FIX: cudaMalloc returns a cudaError_t, so it must be checked with
    // CHECK_CUDA — the original wrapped it in CHECK_CUSPARSE, comparing a
    // CUDA runtime status against cuSPARSE status codes.
    CHECK_CUDA( cudaMalloc(&dBuffer, bufferSize) )
    // Execute SpMM: Y = alpha * op(A) * op(X) + beta * Y
    CHECK_CUSPARSE( cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
                                 CUSPARSE_MM_ALG_DEFAULT, dBuffer) )
    // Destroy the matrix descriptors and the library handle
    CHECK_CUSPARSE( cusparseDestroySpMat(matA) )
    CHECK_CUSPARSE( cusparseDestroyDnMat(vecX) )
    CHECK_CUSPARSE( cusparseDestroyDnMat(vecY) )
    CHECK_CUSPARSE( cusparseDestroy(handle) )
    //----------------------------------------------------------------------
    // Device result check (the blocking cudaMemcpy also synchronizes with
    // the SpMM computation, so no explicit cudaDeviceSynchronize is needed)
    double h_Y[A_num_rows];
    CHECK_CUDA( cudaMemcpy(h_Y, d_Y, A_num_rows * sizeof(double),
                           cudaMemcpyDeviceToHost) )
    int correct = 1;
    for (int i = 0; i < A_num_rows; i++) {
        // Exact comparison is safe here: all operands and results are small
        // integers that are exactly representable in double precision.
        if (h_Y[i] != result[i]) {
            correct = 0;
            break;
        }
    }
    if (correct)
        printf("sp_example test PASSED\n");
    else
        printf("sp_example test FAILED: wrong result\n");
    //----------------------------------------------------------------------
    // Device memory deallocation (the original leaked d_X and d_Y)
    CHECK_CUDA( cudaFree(dBuffer) )
    CHECK_CUDA( cudaFree(dA_csrOffsets) )
    CHECK_CUDA( cudaFree(dA_columns) )
    CHECK_CUDA( cudaFree(dA_values) )
    CHECK_CUDA( cudaFree(d_X) )
    CHECK_CUDA( cudaFree(d_Y) )
    return EXIT_SUCCESS;
}
The code was compiled with
nvcc sp_example.c -o sp_example -L/usr/local/cuda/lib64 -lcusparse -lcudart
. The most probable cause is that I somehow mess up the indices when declaring the dense matrices, but I really cannot figure it out.
Thank you very much in advance!
Also, I forgot to mention that the error output is
CUSPARSE API failed with error (3) at line 84
which corresponds to the cusparseSpMM_bufferSize call (error 3 is CUSPARSE_STATUS_INVALID_VALUE).